Diffstat (limited to 'gcc/config/i386')
149 files changed, 17746 insertions, 1074 deletions
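Beyond the copyright-year bumps across the i386 backend, this change adds the new AVX-512 intrinsic headers (avx512fintrin.h, avx512cdintrin.h, avx512erintrin.h, and friends). As a minimal usage sketch, not part of the commit itself: the intrinsics are reached through <immintrin.h> (each header #errors if included directly), and the mask/maskz variants follow the write-mask pattern visible throughout the diff. Assuming a compiler built from this tree, something like the following should compile with gcc -O2 -mavx512f -mavx512cd -c example.c; the function names masked_add_epi32 and lzcnt_epi64 and the file name are illustrative only.

  #include <immintrin.h>

  /* Masked add of 32-bit lanes: lanes whose mask bit is 0 are
     copied from src instead of being computed (avx512fintrin.h).  */
  __m512i
  masked_add_epi32 (__m512i src, __mmask16 m, __m512i a, __m512i b)
  {
    return _mm512_mask_add_epi32 (src, m, a, b);
  }

  /* Per-lane leading-zero count on 64-bit lanes (avx512cdintrin.h).  */
  __m512i
  lzcnt_epi64 (__m512i v)
  {
    return _mm512_lzcnt_epi64 (v);
  }
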
diff --git a/gcc/config/i386/adxintrin.h b/gcc/config/i386/adxintrin.h index 5c0ea9fea96..611890044c4 100644 --- a/gcc/config/i386/adxintrin.h +++ b/gcc/config/i386/adxintrin.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2012-2013 Free Software Foundation, Inc. +/* Copyright (C) 2012-2014 Free Software Foundation, Inc. This file is part of GCC. diff --git a/gcc/config/i386/ammintrin.h b/gcc/config/i386/ammintrin.h index 297b98dd0d8..a89b2046d8f 100644 --- a/gcc/config/i386/ammintrin.h +++ b/gcc/config/i386/ammintrin.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2013 Free Software Foundation, Inc. +/* Copyright (C) 2007-2014 Free Software Foundation, Inc. This file is part of GCC. diff --git a/gcc/config/i386/athlon.md b/gcc/config/i386/athlon.md index 8bbde33dc1c..b207d882feb 100644 --- a/gcc/config/i386/athlon.md +++ b/gcc/config/i386/athlon.md @@ -1,4 +1,4 @@ -;; Copyright (C) 2002-2013 Free Software Foundation, Inc. +;; Copyright (C) 2002-2014 Free Software Foundation, Inc. ;; ;; This file is part of GCC. ;; diff --git a/gcc/config/i386/atom.md b/gcc/config/i386/atom.md index 29991fda5f4..7102df12314 100644 --- a/gcc/config/i386/atom.md +++ b/gcc/config/i386/atom.md @@ -1,5 +1,5 @@ ;; Atom Scheduling -;; Copyright (C) 2009-2013 Free Software Foundation, Inc. +;; Copyright (C) 2009-2014 Free Software Foundation, Inc. ;; ;; This file is part of GCC. ;; diff --git a/gcc/config/i386/att.h b/gcc/config/i386/att.h index e194c5bf401..f559a83bb6a 100644 --- a/gcc/config/i386/att.h +++ b/gcc/config/i386/att.h @@ -1,5 +1,5 @@ /* Definitions for AT&T assembler syntax for the Intel 80386. - Copyright (C) 1988-2013 Free Software Foundation, Inc. + Copyright (C) 1988-2014 Free Software Foundation, Inc. This file is part of GCC. diff --git a/gcc/config/i386/avx2intrin.h b/gcc/config/i386/avx2intrin.h index 4030dfe2bc2..33b12e10a3a 100644 --- a/gcc/config/i386/avx2intrin.h +++ b/gcc/config/i386/avx2intrin.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2013 Free Software Foundation, Inc. +/* Copyright (C) 2011-2014 Free Software Foundation, Inc. This file is part of GCC. diff --git a/gcc/config/i386/avx512cdintrin.h b/gcc/config/i386/avx512cdintrin.h new file mode 100644 index 00000000000..3935b773456 --- /dev/null +++ b/gcc/config/i386/avx512cdintrin.h @@ -0,0 +1,218 @@ +/* Copyright (C) 2013-2014 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef _IMMINTRIN_H_INCLUDED +#error "Never use <avx512cdintrin.h> directly; include <immintrin.h> instead." 
+#endif + +#ifndef _AVX512CDINTRIN_H_INCLUDED +#define _AVX512CDINTRIN_H_INCLUDED + +#ifndef __AVX512CD__ +#pragma GCC push_options +#pragma GCC target("avx512cd") +#define __DISABLE_AVX512CD__ +#endif /* __AVX512CD__ */ + +/* Internal data types for implementing the intrinsics. */ +typedef long long __v8di __attribute__ ((__vector_size__ (64))); +typedef int __v16si __attribute__ ((__vector_size__ (64))); + +/* The Intel API is flexible enough that we must allow aliasing with other + vector types, and their scalar components. */ +typedef long long __m512i __attribute__ ((__vector_size__ (64), __may_alias__)); +typedef double __m512d __attribute__ ((__vector_size__ (64), __may_alias__)); + +typedef unsigned char __mmask8; +typedef unsigned short __mmask16; + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_conflict_epi32 (__m512i __A) +{ + return (__m512i) + __builtin_ia32_vpconflictsi_512_mask ((__v16si) __A, + (__v16si) _mm512_setzero_si512 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_conflict_epi32 (__m512i __W, __mmask16 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_vpconflictsi_512_mask ((__v16si) __A, + (__v16si) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_conflict_epi32 (__mmask16 __U, __m512i __A) +{ + return (__m512i) + __builtin_ia32_vpconflictsi_512_mask ((__v16si) __A, + (__v16si) _mm512_setzero_si512 (), + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_conflict_epi64 (__m512i __A) +{ + return (__m512i) + __builtin_ia32_vpconflictdi_512_mask ((__v8di) __A, + (__v8di) _mm512_setzero_si512 (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_conflict_epi64 (__m512i __W, __mmask8 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_vpconflictdi_512_mask ((__v8di) __A, + (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_conflict_epi64 (__mmask8 __U, __m512i __A) +{ + return (__m512i) + __builtin_ia32_vpconflictdi_512_mask ((__v8di) __A, + (__v8di) _mm512_setzero_si512 (), + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_lzcnt_epi64 (__m512i __A) +{ + return (__m512i) + __builtin_ia32_vplzcntq_512_mask ((__v8di) __A, + (__v8di) _mm512_setzero_si512 (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_lzcnt_epi64 (__m512i __W, __mmask8 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_vplzcntq_512_mask ((__v8di) __A, + (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_lzcnt_epi64 (__mmask8 __U, __m512i __A) +{ + return (__m512i) + __builtin_ia32_vplzcntq_512_mask ((__v8di) __A, + (__v8di) _mm512_setzero_si512 (), + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_lzcnt_epi32 (__m512i __A) +{ + return (__m512i) + __builtin_ia32_vplzcntd_512_mask ((__v16si) __A, + (__v16si) _mm512_setzero_si512 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ 
((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_lzcnt_epi32 (__m512i __W, __mmask16 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_vplzcntd_512_mask ((__v16si) __A, + (__v16si) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_lzcnt_epi32 (__mmask16 __U, __m512i __A) +{ + return (__m512i) + __builtin_ia32_vplzcntd_512_mask ((__v16si) __A, + (__v16si) _mm512_setzero_si512 (), + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_broadcastmb_epi64 (__mmask8 __A) +{ + return (__m512i) __builtin_ia32_broadcastmb512 (__A); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_broadcastmw_epi32 (__mmask16 __A) +{ + return (__m512i) __builtin_ia32_broadcastmw512 (__A); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_testn_epi32_mask (__m512i __A, __m512i __B) +{ + return (__mmask16) __builtin_ia32_ptestnmd512 ((__v16si) __A, + (__v16si) __B, + (__mmask16) -1); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_testn_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B) +{ + return (__mmask16) __builtin_ia32_ptestnmd512 ((__v16si) __A, + (__v16si) __B, __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_testn_epi64_mask (__m512i __A, __m512i __B) +{ + return (__mmask8) __builtin_ia32_ptestnmq512 ((__v8di) __A, + (__v8di) __B, + (__mmask8) -1); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_testn_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B) +{ + return (__mmask8) __builtin_ia32_ptestnmq512 ((__v8di) __A, + (__v8di) __B, __U); +} + +#ifdef __DISABLE_AVX512CD__ +#undef __DISABLE_AVX512CD__ +#pragma GCC pop_options +#endif /* __DISABLE_AVX512CD__ */ + +#endif /* _AVX512CDINTRIN_H_INCLUDED */ diff --git a/gcc/config/i386/avx512erintrin.h b/gcc/config/i386/avx512erintrin.h new file mode 100644 index 00000000000..f442f2bec94 --- /dev/null +++ b/gcc/config/i386/avx512erintrin.h @@ -0,0 +1,332 @@ +/* Copyright (C) 2013-2014 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef _IMMINTRIN_H_INCLUDED +#error "Never use <avx512erintrin.h> directly; include <immintrin.h> instead." 
+#endif + +#ifndef _AVX512ERINTRIN_H_INCLUDED +#define _AVX512ERINTRIN_H_INCLUDED + +#ifndef __AVX512ER__ +#pragma GCC push_options +#pragma GCC target("avx512er") +#define __DISABLE_AVX512ER__ +#endif /* __AVX512ER__ */ + +/* Internal data types for implementing the intrinsics. */ +typedef double __v8df __attribute__ ((__vector_size__ (64))); +typedef float __v16sf __attribute__ ((__vector_size__ (64))); + +/* The Intel API is flexible enough that we must allow aliasing with other + vector types, and their scalar components. */ +typedef float __m512 __attribute__ ((__vector_size__ (64), __may_alias__)); +typedef double __m512d __attribute__ ((__vector_size__ (64), __may_alias__)); + +typedef unsigned char __mmask8; +typedef unsigned short __mmask16; + +#ifdef __OPTIMIZE__ +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_exp2a23_round_pd (__m512d __A, int __R) +{ + __m512d __W; + return (__m512d) __builtin_ia32_exp2pd_mask ((__v8df) __A, + (__v8df) __W, + (__mmask8) -1, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_exp2a23_round_pd (__m512d __W, __mmask8 __U, __m512d __A, int __R) +{ + return (__m512d) __builtin_ia32_exp2pd_mask ((__v8df) __A, + (__v8df) __W, + (__mmask8) __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_exp2a23_round_pd (__mmask8 __U, __m512d __A, int __R) +{ + return (__m512d) __builtin_ia32_exp2pd_mask ((__v8df) __A, + (__v8df) _mm512_setzero_pd (), + (__mmask8) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_exp2a23_round_ps (__m512 __A, int __R) +{ + __m512 __W; + return (__m512) __builtin_ia32_exp2ps_mask ((__v16sf) __A, + (__v16sf) __W, + (__mmask16) -1, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_exp2a23_round_ps (__m512 __W, __mmask16 __U, __m512 __A, int __R) +{ + return (__m512) __builtin_ia32_exp2ps_mask ((__v16sf) __A, + (__v16sf) __W, + (__mmask16) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_exp2a23_round_ps (__mmask16 __U, __m512 __A, int __R) +{ + return (__m512) __builtin_ia32_exp2ps_mask ((__v16sf) __A, + (__v16sf) _mm512_setzero_ps (), + (__mmask16) __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_rcp28_round_pd (__m512d __A, int __R) +{ + __m512d __W; + return (__m512d) __builtin_ia32_rcp28pd_mask ((__v8df) __A, + (__v8df) __W, + (__mmask8) -1, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_rcp28_round_pd (__m512d __W, __mmask8 __U, __m512d __A, int __R) +{ + return (__m512d) __builtin_ia32_rcp28pd_mask ((__v8df) __A, + (__v8df) __W, + (__mmask8) __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_rcp28_round_pd (__mmask8 __U, __m512d __A, int __R) +{ + return (__m512d) __builtin_ia32_rcp28pd_mask ((__v8df) __A, + (__v8df) _mm512_setzero_pd (), + (__mmask8) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_rcp28_round_ps (__m512 __A, int __R) +{ + __m512 __W; + return (__m512) __builtin_ia32_rcp28ps_mask ((__v16sf) __A, + (__v16sf) __W, + (__mmask16) -1, __R); +} + 
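The AVX-512ER exp2a23/rcp28/rsqrt28 intrinsics above all take an explicit rounding-mode argument. A minimal caller sketch, not part of the commit, assuming -mavx512er and the _MM_FROUND_* macros that avx512fintrin.h provides via <immintrin.h>; approx_recip_ps is an illustrative name only.

  #include <immintrin.h>

  /* Reciprocal approximation (2^-28 relative error) of each float
     lane, using the current rounding direction.  */
  __m512
  approx_recip_ps (__m512 x)
  {
    return _mm512_rcp28_round_ps (x, _MM_FROUND_CUR_DIRECTION);
  }
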
+extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_rcp28_round_ps (__m512 __W, __mmask16 __U, __m512 __A, int __R) +{ + return (__m512) __builtin_ia32_rcp28ps_mask ((__v16sf) __A, + (__v16sf) __W, + (__mmask16) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_rcp28_round_ps (__mmask16 __U, __m512 __A, int __R) +{ + return (__m512) __builtin_ia32_rcp28ps_mask ((__v16sf) __A, + (__v16sf) _mm512_setzero_ps (), + (__mmask16) __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_rsqrt28_round_pd (__m512d __A, int __R) +{ + __m512d __W; + return (__m512d) __builtin_ia32_rsqrt28pd_mask ((__v8df) __A, + (__v8df) __W, + (__mmask8) -1, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_rsqrt28_round_pd (__m512d __W, __mmask8 __U, __m512d __A, int __R) +{ + return (__m512d) __builtin_ia32_rsqrt28pd_mask ((__v8df) __A, + (__v8df) __W, + (__mmask8) __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_rsqrt28_round_pd (__mmask8 __U, __m512d __A, int __R) +{ + return (__m512d) __builtin_ia32_rsqrt28pd_mask ((__v8df) __A, + (__v8df) _mm512_setzero_pd (), + (__mmask8) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_rsqrt28_round_ps (__m512 __A, int __R) +{ + __m512 __W; + return (__m512) __builtin_ia32_rsqrt28ps_mask ((__v16sf) __A, + (__v16sf) __W, + (__mmask16) -1, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_rsqrt28_round_ps (__m512 __W, __mmask16 __U, __m512 __A, int __R) +{ + return (__m512) __builtin_ia32_rsqrt28ps_mask ((__v16sf) __A, + (__v16sf) __W, + (__mmask16) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_rsqrt28_round_ps (__mmask16 __U, __m512 __A, int __R) +{ + return (__m512) __builtin_ia32_rsqrt28ps_mask ((__v16sf) __A, + (__v16sf) _mm512_setzero_ps (), + (__mmask16) __U, __R); +} +#else +#define _mm512_exp2a23_round_pd(A, C) \ + __builtin_ia32_exp2pd_mask(A, (__v8df)_mm512_setzero_pd(), -1, C) + +#define _mm512_mask_exp2a23_round_pd(W, U, A, C) \ + __builtin_ia32_exp2pd_mask(A, W, U, C) + +#define _mm512_maskz_exp2a23_round_pd(U, A, C) \ + __builtin_ia32_exp2pd_mask(A, (__v8df)_mm512_setzero_pd(), U, C) + +#define _mm512_exp2a23_round_ps(A, C) \ + __builtin_ia32_exp2ps_mask(A, (__v16sf)_mm512_setzero_ps(), -1, C) + +#define _mm512_mask_exp2a23_round_ps(W, U, A, C) \ + __builtin_ia32_exp2ps_mask(A, W, U, C) + +#define _mm512_maskz_exp2a23_round_ps(U, A, C) \ + __builtin_ia32_exp2ps_mask(A, (__v16sf)_mm512_setzero_ps(), U, C) + +#define _mm512_rcp28_round_pd(A, C) \ + __builtin_ia32_rcp28pd_mask(A, (__v8df)_mm512_setzero_pd(), -1, C) + +#define _mm512_mask_rcp28_round_pd(W, U, A, C) \ + __builtin_ia32_rcp28pd_mask(A, W, U, C) + +#define _mm512_maskz_rcp28_round_pd(U, A, C) \ + __builtin_ia32_rcp28pd_mask(A, (__v8df)_mm512_setzero_pd(), U, C) + +#define _mm512_rcp28_round_ps(A, C) \ + __builtin_ia32_rcp28ps_mask(A, (__v16sf)_mm512_setzero_ps(), -1, C) + +#define _mm512_mask_rcp28_round_ps(W, U, A, C) \ + __builtin_ia32_rcp28ps_mask(A, W, U, C) + +#define _mm512_maskz_rcp28_round_ps(U, A, C) \ + __builtin_ia32_rcp28ps_mask(A, (__v16sf)_mm512_setzero_ps(), 
U, C) + +#define _mm512_rsqrt28_round_pd(A, C) \ + __builtin_ia32_rsqrt28pd_mask(A, (__v8df)_mm512_setzero_pd(), -1, C) + +#define _mm512_mask_rsqrt28_round_pd(W, U, A, C) \ + __builtin_ia32_rsqrt28pd_mask(A, W, U, C) + +#define _mm512_maskz_rsqrt28_round_pd(U, A, C) \ + __builtin_ia32_rsqrt28pd_mask(A, (__v8df)_mm512_setzero_pd(), U, C) + +#define _mm512_rsqrt28_round_ps(A, C) \ + __builtin_ia32_rsqrt28ps_mask(A, (__v16sf)_mm512_setzero_ps(), -1, C) + +#define _mm512_mask_rsqrt28_round_ps(W, U, A, C) \ + __builtin_ia32_rsqrt28ps_mask(A, W, U, C) + +#define _mm512_maskz_rsqrt28_round_ps(U, A, C) \ + __builtin_ia32_rsqrt28ps_mask(A, (__v16sf)_mm512_setzero_ps(), U, C) +#endif + +#define _mm512_exp2a23_pd(A) \ + _mm512_exp2a23_round_pd(A, _MM_FROUND_CUR_DIRECTION) + +#define _mm512_mask_exp2a23_pd(W, U, A) \ + _mm512_mask_exp2a23_round_pd(W, U, A, _MM_FROUND_CUR_DIRECTION) + +#define _mm512_maskz_exp2a23_pd(U, A) \ + _mm512_maskz_exp2a23_round_pd(U, A, _MM_FROUND_CUR_DIRECTION) + +#define _mm512_exp2a23_ps(A) \ + _mm512_exp2a23_round_ps(A, _MM_FROUND_CUR_DIRECTION) + +#define _mm512_mask_exp2a23_ps(W, U, A) \ + _mm512_mask_exp2a23_round_ps(W, U, A, _MM_FROUND_CUR_DIRECTION) + +#define _mm512_maskz_exp2a23_ps(U, A) \ + _mm512_maskz_exp2a23_round_ps(U, A, _MM_FROUND_CUR_DIRECTION) + +#define _mm512_rcp28_pd(A) \ + _mm512_rcp28_round_pd(A, _MM_FROUND_CUR_DIRECTION) + +#define _mm512_mask_rcp28_pd(W, U, A) \ + _mm512_mask_rcp28_round_pd(W, U, A, _MM_FROUND_CUR_DIRECTION) + +#define _mm512_maskz_rcp28_pd(U, A) \ + _mm512_maskz_rcp28_round_pd(U, A, _MM_FROUND_CUR_DIRECTION) + +#define _mm512_rcp28_ps(A) \ + _mm512_rcp28_round_ps(A, _MM_FROUND_CUR_DIRECTION) + +#define _mm512_mask_rcp28_ps(W, U, A) \ + _mm512_mask_rcp28_round_ps(W, U, A, _MM_FROUND_CUR_DIRECTION) + +#define _mm512_maskz_rcp28_ps(U, A) \ + _mm512_maskz_rcp28_round_ps(U, A, _MM_FROUND_CUR_DIRECTION) + +#define _mm512_rsqrt28_pd(A) \ + _mm512_rsqrt28_round_pd(A, _MM_FROUND_CUR_DIRECTION) + +#define _mm512_mask_rsqrt28_pd(W, U, A) \ + _mm512_mask_rsqrt28_round_pd(W, U, A, _MM_FROUND_CUR_DIRECTION) + +#define _mm512_maskz_rsqrt28_pd(U, A) \ + _mm512_maskz_rsqrt28_round_pd(U, A, _MM_FROUND_CUR_DIRECTION) + +#define _mm512_rsqrt28_ps(A) \ + _mm512_rsqrt28_round_ps(A, _MM_FROUND_CUR_DIRECTION) + +#define _mm512_mask_rsqrt28_ps(W, U, A) \ + _mm512_mask_rsqrt28_round_ps(W, U, A, _MM_FROUND_CUR_DIRECTION) + +#define _mm512_maskz_rsqrt28_ps(U, A) \ + _mm512_maskz_rsqrt28_round_ps(U, A, _MM_FROUND_CUR_DIRECTION) + +#ifdef __DISABLE_AVX512ER__ +#undef __DISABLE_AVX512ER__ +#pragma GCC pop_options +#endif /* __DISABLE_AVX512ER__ */ + +#endif /* _AVX512ERINTRIN_H_INCLUDED */ diff --git a/gcc/config/i386/avx512fintrin.h b/gcc/config/i386/avx512fintrin.h new file mode 100644 index 00000000000..a2ee88ea2a3 --- /dev/null +++ b/gcc/config/i386/avx512fintrin.h @@ -0,0 +1,12689 @@ +/* Copyright (C) 2013-2014 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef _IMMINTRIN_H_INCLUDED +#error "Never use <avx512fintrin.h> directly; include <immintrin.h> instead." +#endif + +#ifndef _AVX512FINTRIN_H_INCLUDED +#define _AVX512FINTRIN_H_INCLUDED + +#ifndef __AVX512F__ +#pragma GCC push_options +#pragma GCC target("avx512f") +#define __DISABLE_AVX512F__ +#endif /* __AVX512F__ */ + +/* Internal data types for implementing the intrinsics. */ +typedef double __v8df __attribute__ ((__vector_size__ (64))); +typedef float __v16sf __attribute__ ((__vector_size__ (64))); +typedef long long __v8di __attribute__ ((__vector_size__ (64))); +typedef int __v16si __attribute__ ((__vector_size__ (64))); + +/* The Intel API is flexible enough that we must allow aliasing with other + vector types, and their scalar components. */ +typedef float __m512 __attribute__ ((__vector_size__ (64), __may_alias__)); +typedef long long __m512i __attribute__ ((__vector_size__ (64), __may_alias__)); +typedef double __m512d __attribute__ ((__vector_size__ (64), __may_alias__)); + +typedef unsigned char __mmask8; +typedef unsigned short __mmask16; + +/* Rounding mode macros. */ +#define _MM_FROUND_TO_NEAREST_INT 0x00 +#define _MM_FROUND_TO_NEG_INF 0x01 +#define _MM_FROUND_TO_POS_INF 0x02 +#define _MM_FROUND_TO_ZERO 0x03 +#define _MM_FROUND_CUR_DIRECTION 0x04 +#define _MM_FROUND_NO_EXC 0x05 + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_set_epi64 (long long __A, long long __B, long long __C, + long long __D, long long __E, long long __F, + long long __G, long long __H) +{ + return __extension__ (__m512i) (__v8di) + { __H, __G, __F, __E, __D, __C, __B, __A }; +} + +/* Create the vector [A B C D E F G H I J K L M N O P]. 
*/ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_set_epi32 (int __A, int __B, int __C, int __D, + int __E, int __F, int __G, int __H, + int __I, int __J, int __K, int __L, + int __M, int __N, int __O, int __P) +{ + return __extension__ (__m512i)(__v16si) + { __P, __O, __N, __M, __L, __K, __J, __I, + __H, __G, __F, __E, __D, __C, __B, __A }; +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_set_pd (double __A, double __B, double __C, double __D, + double __E, double __F, double __G, double __H) +{ + return __extension__ (__m512d) + { __H, __G, __F, __E, __D, __C, __B, __A }; +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_set_ps (float __A, float __B, float __C, float __D, + float __E, float __F, float __G, float __H, + float __I, float __J, float __K, float __L, + float __M, float __N, float __O, float __P) +{ + return __extension__ (__m512) + { __P, __O, __N, __M, __L, __K, __J, __I, + __H, __G, __F, __E, __D, __C, __B, __A }; +} + +#define _mm512_setr_epi64(e0,e1,e2,e3,e4,e5,e6,e7) \ + _mm512_set_epi64(e7,e6,e5,e4,e3,e2,e1,e0) + +#define _mm512_setr_epi32(e0,e1,e2,e3,e4,e5,e6,e7, \ + e8,e9,e10,e11,e12,e13,e14,e15) \ + _mm512_set_epi32(e15,e14,e13,e12,e11,e10,e9,e8,e7,e6,e5,e4,e3,e2,e1,e0) + +#define _mm512_setr_pd(e0,e1,e2,e3,e4,e5,e6,e7) \ + _mm512_set_pd(e7,e6,e5,e4,e3,e2,e1,e0) + +#define _mm512_setr_ps(e0,e1,e2,e3,e4,e5,e6,e7,e8,e9,e10,e11,e12,e13,e14,e15) \ + _mm512_set_ps(e15,e14,e13,e12,e11,e10,e9,e8,e7,e6,e5,e4,e3,e2,e1,e0) + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_setzero_ps (void) +{ + return __extension__ (__m512){ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 }; +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_setzero_pd (void) +{ + return __extension__ (__m512d) { 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 }; +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_setzero_si512 (void) +{ + return __extension__ (__m512i)(__v8di){ 0, 0, 0, 0, 0, 0, 0, 0 }; +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_mov_pd (__m512d __W, __mmask8 __U, __m512d __A) +{ + return (__m512d) __builtin_ia32_movapd512_mask ((__v8df) __A, + (__v8df) __W, + (__mmask8) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_mov_pd (__mmask8 __U, __m512d __A) +{ + return (__m512d) __builtin_ia32_movapd512_mask ((__v8df) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_mov_ps (__m512 __W, __mmask16 __U, __m512 __A) +{ + return (__m512) __builtin_ia32_movaps512_mask ((__v16sf) __A, + (__v16sf) __W, + (__mmask16) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_mov_ps (__mmask16 __U, __m512 __A) +{ + return (__m512) __builtin_ia32_movaps512_mask ((__v16sf) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_load_pd (void const *__P) +{ + return *(__m512d *) __P; +} + +extern __inline __m512d +__attribute__ 
((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_load_pd (__m512d __W, __mmask8 __U, void const *__P) +{ + return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *) __P, + (__v8df) __W, + (__mmask8) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_load_pd (__mmask8 __U, void const *__P) +{ + return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *) __P, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_store_pd (void *__P, __m512d __A) +{ + *(__m512d *) __P = __A; +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_store_pd (void *__P, __mmask8 __U, __m512d __A) +{ + __builtin_ia32_storeapd512_mask ((__v8df *) __P, (__v8df) __A, + (__mmask8) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_load_ps (void const *__P) +{ + return *(__m512 *) __P; +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_load_ps (__m512 __W, __mmask16 __U, void const *__P) +{ + return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *) __P, + (__v16sf) __W, + (__mmask16) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_load_ps (__mmask16 __U, void const *__P) +{ + return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *) __P, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_store_ps (void *__P, __m512 __A) +{ + *(__m512 *) __P = __A; +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_store_ps (void *__P, __mmask16 __U, __m512 __A) +{ + __builtin_ia32_storeaps512_mask ((__v16sf *) __P, (__v16sf) __A, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_mov_epi64 (__m512i __W, __mmask8 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_movdqa64_512_mask ((__v8di) __A, + (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_mov_epi64 (__mmask8 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_movdqa64_512_mask ((__v8di) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_load_epi64 (void const *__P) +{ + return *(__m512i *) __P; +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_load_epi64 (__m512i __W, __mmask8 __U, void const *__P) +{ + return (__m512i) __builtin_ia32_movdqa64load512_mask ((const __v8di *) __P, + (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_load_epi64 (__mmask8 __U, void const *__P) +{ + return (__m512i) __builtin_ia32_movdqa64load512_mask ((const __v8di *) __P, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_store_epi64 (void *__P, __m512i __A) +{ + *(__m512i *) __P = __A; +} + +extern __inline void 
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_store_epi64 (void *__P, __mmask8 __U, __m512i __A) +{ + __builtin_ia32_movdqa64store512_mask ((__v8di *) __P, (__v8di) __A, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_mov_epi32 (__m512i __W, __mmask16 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_movdqa32_512_mask ((__v16si) __A, + (__v16si) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_mov_epi32 (__mmask16 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_movdqa32_512_mask ((__v16si) __A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_load_si512 (void const *__P) +{ + return *(__m512i *) __P; +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_load_epi32 (void const *__P) +{ + return *(__m512i *) __P; +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_load_epi32 (__m512i __W, __mmask16 __U, void const *__P) +{ + return (__m512i) __builtin_ia32_movdqa32load512_mask ((const __v16si *) __P, + (__v16si) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_load_epi32 (__mmask16 __U, void const *__P) +{ + return (__m512i) __builtin_ia32_movdqa32load512_mask ((const __v16si *) __P, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_store_si512 (void *__P, __m512i __A) +{ + *(__m512i *) __P = __A; +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_store_epi32 (void *__P, __m512i __A) +{ + *(__m512i *) __P = __A; +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_store_epi32 (void *__P, __mmask16 __U, __m512i __A) +{ + __builtin_ia32_movdqa32store512_mask ((__v16si *) __P, (__v16si) __A, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mullo_epi32 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmulld512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_mullo_epi32 (__mmask16 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmulld512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_mullo_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmulld512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) __W, __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sllv_epi32 (__m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_psllv16si_mask ((__v16si) __X, + (__v16si) __Y, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm512_mask_sllv_epi32 (__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_psllv16si_mask ((__v16si) __X, + (__v16si) __Y, + (__v16si) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_sllv_epi32 (__mmask16 __U, __m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_psllv16si_mask ((__v16si) __X, + (__v16si) __Y, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_srav_epi32 (__m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_psrav16si_mask ((__v16si) __X, + (__v16si) __Y, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_srav_epi32 (__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_psrav16si_mask ((__v16si) __X, + (__v16si) __Y, + (__v16si) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_srav_epi32 (__mmask16 __U, __m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_psrav16si_mask ((__v16si) __X, + (__v16si) __Y, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_srlv_epi32 (__m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_psrlv16si_mask ((__v16si) __X, + (__v16si) __Y, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_srlv_epi32 (__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_psrlv16si_mask ((__v16si) __X, + (__v16si) __Y, + (__v16si) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_srlv_epi32 (__mmask16 __U, __m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_psrlv16si_mask ((__v16si) __X, + (__v16si) __Y, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_add_epi64 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_paddq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_add_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_paddq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_add_epi64 (__mmask8 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_paddq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sub_epi64 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_psubq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_setzero_pd (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, 
__artificial__)) +_mm512_mask_sub_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_psubq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_sub_epi64 (__mmask8 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_psubq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sllv_epi64 (__m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_psllv8di_mask ((__v8di) __X, + (__v8di) __Y, + (__v8di) + _mm512_setzero_pd (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_sllv_epi64 (__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_psllv8di_mask ((__v8di) __X, + (__v8di) __Y, + (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_sllv_epi64 (__mmask8 __U, __m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_psllv8di_mask ((__v8di) __X, + (__v8di) __Y, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_srav_epi64 (__m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_psrav8di_mask ((__v8di) __X, + (__v8di) __Y, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_srav_epi64 (__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_psrav8di_mask ((__v8di) __X, + (__v8di) __Y, + (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_srav_epi64 (__mmask8 __U, __m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_psrav8di_mask ((__v8di) __X, + (__v8di) __Y, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_srlv_epi64 (__m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_psrlv8di_mask ((__v8di) __X, + (__v8di) __Y, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_srlv_epi64 (__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_psrlv8di_mask ((__v8di) __X, + (__v8di) __Y, + (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_srlv_epi64 (__mmask8 __U, __m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_psrlv8di_mask ((__v8di) __X, + (__v8di) __Y, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_add_epi32 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_paddd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm512_mask_add_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_paddd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_add_epi32 (__mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_paddd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mul_epi32 (__m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_pmuldq512_mask ((__v16si) __X, + (__v16si) __Y, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_mul_epi32 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_pmuldq512_mask ((__v16si) __X, + (__v16si) __Y, + (__v8di) __W, __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_mul_epi32 (__mmask8 __M, __m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_pmuldq512_mask ((__v16si) __X, + (__v16si) __Y, + (__v8di) + _mm512_setzero_si512 (), + __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sub_epi32 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_psubd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_sub_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_psubd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_sub_epi32 (__mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_psubd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mul_epu32 (__m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_pmuludq512_mask ((__v16si) __X, + (__v16si) __Y, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_mul_epu32 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_pmuludq512_mask ((__v16si) __X, + (__v16si) __Y, + (__v8di) __W, __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_mul_epu32 (__mmask8 __M, __m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_pmuludq512_mask ((__v16si) __X, + (__v16si) __Y, + (__v8di) + _mm512_setzero_si512 (), + __M); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_slli_epi64 (__m512i __A, unsigned int __B) +{ + return (__m512i) __builtin_ia32_psllqi512_mask ((__v8di) __A, __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_slli_epi64 
(__m512i __W, __mmask8 __U, __m512i __A, + unsigned int __B) +{ + return (__m512i) __builtin_ia32_psllqi512_mask ((__v8di) __A, __B, + (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_slli_epi64 (__mmask8 __U, __m512i __A, unsigned int __B) +{ + return (__m512i) __builtin_ia32_psllqi512_mask ((__v8di) __A, __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} +#else +#define _mm512_slli_epi64(X, C) \ + ((__m512i) __builtin_ia32_psllqi512_mask ((__v8di)(__m512i)(X), (int)(C),\ + (__v8di)(__m512i)_mm512_setzero_si512 (),\ + (__mmask8)-1)) + +#define _mm512_mask_slli_epi64(W, U, X, C) \ + ((__m512i) __builtin_ia32_psllqi512_mask ((__v8di)(__m512i)(X), (int)(C),\ + (__v8di)(__m512i)(W),\ + (__mmask8)(U))) + +#define _mm512_maskz_slli_epi64(U, X, C) \ + ((__m512i) __builtin_ia32_psllqi512_mask ((__v8di)(__m512i)(X), (int)(C),\ + (__v8di)(__m512i)_mm512_setzero_si512 (),\ + (__mmask8)(U))) +#endif + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sll_epi64 (__m512i __A, __m128i __B) +{ + return (__m512i) __builtin_ia32_psllq512_mask ((__v8di) __A, + (__v2di) __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_sll_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) +{ + return (__m512i) __builtin_ia32_psllq512_mask ((__v8di) __A, + (__v2di) __B, + (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_sll_epi64 (__mmask8 __U, __m512i __A, __m128i __B) +{ + return (__m512i) __builtin_ia32_psllq512_mask ((__v8di) __A, + (__v2di) __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_srli_epi64 (__m512i __A, unsigned int __B) +{ + return (__m512i) __builtin_ia32_psrlqi512_mask ((__v8di) __A, __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_srli_epi64 (__m512i __W, __mmask8 __U, + __m512i __A, unsigned int __B) +{ + return (__m512i) __builtin_ia32_psrlqi512_mask ((__v8di) __A, __B, + (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_srli_epi64 (__mmask8 __U, __m512i __A, unsigned int __B) +{ + return (__m512i) __builtin_ia32_psrlqi512_mask ((__v8di) __A, __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} +#else +#define _mm512_srli_epi64(X, C) \ + ((__m512i) __builtin_ia32_psrlqi512_mask ((__v8di)(__m512i)(X), (int)(C),\ + (__v8di)(__m512i)_mm512_setzero_si512 (),\ + (__mmask8)-1)) + +#define _mm512_mask_srli_epi64(W, U, X, C) \ + ((__m512i) __builtin_ia32_psrlqi512_mask ((__v8di)(__m512i)(X), (int)(C),\ + (__v8di)(__m512i)(W),\ + (__mmask8)(U))) + +#define _mm512_maskz_srli_epi64(U, X, C) \ + ((__m512i) __builtin_ia32_psrlqi512_mask ((__v8di)(__m512i)(X), (int)(C),\ + (__v8di)(__m512i)_mm512_setzero_si512 (),\ + (__mmask8)(U))) +#endif + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_srl_epi64 (__m512i __A, __m128i __B) +{ + return (__m512i) __builtin_ia32_psrlq512_mask ((__v8di) __A, + (__v2di) __B, + 
(__v8di) + _mm512_setzero_si512 (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_srl_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) +{ + return (__m512i) __builtin_ia32_psrlq512_mask ((__v8di) __A, + (__v2di) __B, + (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_srl_epi64 (__mmask8 __U, __m512i __A, __m128i __B) +{ + return (__m512i) __builtin_ia32_psrlq512_mask ((__v8di) __A, + (__v2di) __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_srai_epi64 (__m512i __A, unsigned int __B) +{ + return (__m512i) __builtin_ia32_psraqi512_mask ((__v8di) __A, __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_srai_epi64 (__m512i __W, __mmask8 __U, __m512i __A, + unsigned int __B) +{ + return (__m512i) __builtin_ia32_psraqi512_mask ((__v8di) __A, __B, + (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_srai_epi64 (__mmask8 __U, __m512i __A, unsigned int __B) +{ + return (__m512i) __builtin_ia32_psraqi512_mask ((__v8di) __A, __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} +#else +#define _mm512_srai_epi64(X, C) \ + ((__m512i) __builtin_ia32_psraqi512_mask ((__v8di)(__m512i)(X), (int)(C),\ + (__v8di)(__m512i)_mm512_setzero_si512 (),\ + (__mmask8)-1)) + +#define _mm512_mask_srai_epi64(W, U, X, C) \ + ((__m512i) __builtin_ia32_psraqi512_mask ((__v8di)(__m512i)(X), (int)(C),\ + (__v8di)(__m512i)(W),\ + (__mmask8)(U))) + +#define _mm512_maskz_srai_epi64(U, X, C) \ + ((__m512i) __builtin_ia32_psraqi512_mask ((__v8di)(__m512i)(X), (int)(C),\ + (__v8di)(__m512i)_mm512_setzero_si512 (),\ + (__mmask8)(U))) +#endif + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sra_epi64 (__m512i __A, __m128i __B) +{ + return (__m512i) __builtin_ia32_psraq512_mask ((__v8di) __A, + (__v2di) __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_sra_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) +{ + return (__m512i) __builtin_ia32_psraq512_mask ((__v8di) __A, + (__v2di) __B, + (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_sra_epi64 (__mmask8 __U, __m512i __A, __m128i __B) +{ + return (__m512i) __builtin_ia32_psraq512_mask ((__v8di) __A, + (__v2di) __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_slli_epi32 (__m512i __A, unsigned int __B) +{ + return (__m512i) __builtin_ia32_pslldi512_mask ((__v16si) __A, __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_slli_epi32 (__m512i __W, __mmask16 __U, __m512i __A, + unsigned int __B) +{ + return (__m512i) __builtin_ia32_pslldi512_mask ((__v16si) __A, __B, + (__v16si) __W, + (__mmask16) 
__U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_slli_epi32 (__mmask16 __U, __m512i __A, unsigned int __B) +{ + return (__m512i) __builtin_ia32_pslldi512_mask ((__v16si) __A, __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} +#else +#define _mm512_slli_epi32(X, C) \ + ((__m512i) __builtin_ia32_pslldi512_mask ((__v16si)(__m512i)(X), (int)(C),\ + (__v16si)(__m512i)_mm512_setzero_si512 (),\ + (__mmask16)-1)) + +#define _mm512_mask_slli_epi32(W, U, X, C) \ + ((__m512i) __builtin_ia32_pslldi512_mask ((__v16si)(__m512i)(X), (int)(C),\ + (__v16si)(__m512i)(W),\ + (__mmask16)(U))) + +#define _mm512_maskz_slli_epi32(U, X, C) \ + ((__m512i) __builtin_ia32_pslldi512_mask ((__v16si)(__m512i)(X), (int)(C),\ + (__v16si)(__m512i)_mm512_setzero_si512 (),\ + (__mmask16)(U))) +#endif + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sll_epi32 (__m512i __A, __m128i __B) +{ + return (__m512i) __builtin_ia32_pslld512_mask ((__v16si) __A, + (__v4si) __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_sll_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) +{ + return (__m512i) __builtin_ia32_pslld512_mask ((__v16si) __A, + (__v4si) __B, + (__v16si) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_sll_epi32 (__mmask16 __U, __m512i __A, __m128i __B) +{ + return (__m512i) __builtin_ia32_pslld512_mask ((__v16si) __A, + (__v4si) __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_srli_epi32 (__m512i __A, unsigned int __B) +{ + return (__m512i) __builtin_ia32_psrldi512_mask ((__v16si) __A, __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_srli_epi32 (__m512i __W, __mmask16 __U, + __m512i __A, unsigned int __B) +{ + return (__m512i) __builtin_ia32_psrldi512_mask ((__v16si) __A, __B, + (__v16si) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_srli_epi32 (__mmask16 __U, __m512i __A, unsigned int __B) +{ + return (__m512i) __builtin_ia32_psrldi512_mask ((__v16si) __A, __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} +#else +#define _mm512_srli_epi32(X, C) \ + ((__m512i) __builtin_ia32_psrldi512_mask ((__v16si)(__m512i)(X), (int)(C),\ + (__v16si)(__m512i)_mm512_setzero_si512 (),\ + (__mmask16)-1)) + +#define _mm512_mask_srli_epi32(W, U, X, C) \ + ((__m512i) __builtin_ia32_psrldi512_mask ((__v16si)(__m512i)(X), (int)(C),\ + (__v16si)(__m512i)(W),\ + (__mmask16)(U))) + +#define _mm512_maskz_srli_epi32(U, X, C) \ + ((__m512i) __builtin_ia32_psrldi512_mask ((__v16si)(__m512i)(X), (int)(C),\ + (__v16si)(__m512i)_mm512_setzero_si512 (),\ + (__mmask16)(U))) +#endif + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_srl_epi32 (__m512i __A, __m128i __B) +{ + return (__m512i) __builtin_ia32_psrld512_mask ((__v16si) __A, + (__v4si) __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, 
__always_inline__, __artificial__)) +_mm512_mask_srl_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) +{ + return (__m512i) __builtin_ia32_psrld512_mask ((__v16si) __A, + (__v4si) __B, + (__v16si) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_srl_epi32 (__mmask16 __U, __m512i __A, __m128i __B) +{ + return (__m512i) __builtin_ia32_psrld512_mask ((__v16si) __A, + (__v4si) __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_srai_epi32 (__m512i __A, unsigned int __B) +{ + return (__m512i) __builtin_ia32_psradi512_mask ((__v16si) __A, __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_srai_epi32 (__m512i __W, __mmask16 __U, __m512i __A, + unsigned int __B) +{ + return (__m512i) __builtin_ia32_psradi512_mask ((__v16si) __A, __B, + (__v16si) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_srai_epi32 (__mmask16 __U, __m512i __A, unsigned int __B) +{ + return (__m512i) __builtin_ia32_psradi512_mask ((__v16si) __A, __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} +#else +#define _mm512_srai_epi32(X, C) \ + ((__m512i) __builtin_ia32_psradi512_mask ((__v16si)(__m512i)(X), (int)(C),\ + (__v16si)(__m512i)_mm512_setzero_si512 (),\ + (__mmask16)-1)) + +#define _mm512_mask_srai_epi32(W, U, X, C) \ + ((__m512i) __builtin_ia32_psradi512_mask ((__v16si)(__m512i)(X), (int)(C),\ + (__v16si)(__m512i)(W),\ + (__mmask16)(U))) + +#define _mm512_maskz_srai_epi32(U, X, C) \ + ((__m512i) __builtin_ia32_psradi512_mask ((__v16si)(__m512i)(X), (int)(C),\ + (__v16si)(__m512i)_mm512_setzero_si512 (),\ + (__mmask16)(U))) +#endif + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sra_epi32 (__m512i __A, __m128i __B) +{ + return (__m512i) __builtin_ia32_psrad512_mask ((__v16si) __A, + (__v4si) __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_sra_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) +{ + return (__m512i) __builtin_ia32_psrad512_mask ((__v16si) __A, + (__v4si) __B, + (__v16si) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_sra_epi32 (__mmask16 __U, __m512i __A, __m128i __B) +{ + return (__m512i) __builtin_ia32_psrad512_mask ((__v16si) __A, + (__v4si) __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_add_round_sd (__m128d __A, __m128d __B, const int __R) +{ + return (__m128d) __builtin_ia32_addsd_round ((__v2df) __A, + (__v2df) __B, + __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_add_round_ss (__m128 __A, __m128 __B, const int __R) +{ + return (__m128) __builtin_ia32_addss_round ((__v4sf) __A, + (__v4sf) __B, + __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sub_round_sd (__m128d __A, __m128d __B, const int __R) +{ 
+ return (__m128d) __builtin_ia32_subsd_round ((__v2df) __A, + (__v2df) __B, + __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sub_round_ss (__m128 __A, __m128 __B, const int __R) +{ + return (__m128) __builtin_ia32_subss_round ((__v4sf) __A, + (__v4sf) __B, + __R); +} + +#else +#define _mm_add_round_sd(A, B, C) \ + (__m128d)__builtin_ia32_addsd_round(A, B, C) + +#define _mm_add_round_ss(A, B, C) \ + (__m128)__builtin_ia32_addss_round(A, B, C) + +#define _mm_sub_round_sd(A, B, C) \ + (__m128d)__builtin_ia32_subsd_round(A, B, C) + +#define _mm_sub_round_ss(A, B, C) \ + (__m128)__builtin_ia32_subss_round(A, B, C) +#endif + +#ifdef __OPTIMIZE__ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_ternarylogic_epi64 (__m512i __A, __m512i __B, __m512i __C, const int imm) +{ + return (__m512i) __builtin_ia32_pternlogq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) __C, imm, + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_ternarylogic_epi64 (__m512i __A, __mmask8 __U, __m512i __B, + __m512i __C, const int imm) +{ + return (__m512i) __builtin_ia32_pternlogq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) __C, imm, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_ternarylogic_epi64 (__mmask8 __U, __m512i __A, __m512i __B, + __m512i __C, const int imm) +{ + return (__m512i) __builtin_ia32_pternlogq512_maskz ((__v8di) __A, + (__v8di) __B, + (__v8di) __C, + imm, (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_ternarylogic_epi32 (__m512i __A, __m512i __B, __m512i __C, const int imm) +{ + return (__m512i) __builtin_ia32_pternlogd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) __C, + imm, (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_ternarylogic_epi32 (__m512i __A, __mmask16 __U, __m512i __B, + __m512i __C, const int imm) +{ + return (__m512i) __builtin_ia32_pternlogd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) __C, + imm, (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_ternarylogic_epi32 (__mmask16 __U, __m512i __A, __m512i __B, + __m512i __C, const int imm) +{ + return (__m512i) __builtin_ia32_pternlogd512_maskz ((__v16si) __A, + (__v16si) __B, + (__v16si) __C, + imm, (__mmask16) __U); +} +#else +#define _mm512_ternarylogic_epi64(A, B, C, I) \ + ((__m512i) __builtin_ia32_pternlogq512_mask ((__v8di)(__m512i)(A), \ + (__v8di)(__m512i)(B), (__v8di)(__m512i)(C), (int)(I), (__mmask8)-1)) +#define _mm512_mask_ternarylogic_epi64(A, U, B, C, I) \ + ((__m512i) __builtin_ia32_pternlogq512_mask ((__v8di)(__m512i)(A), \ + (__v8di)(__m512i)(B), (__v8di)(__m512i)(C), (int)(I), (__mmask8)(U))) +#define _mm512_maskz_ternarylogic_epi64(U, A, B, C, I) \ + ((__m512i) __builtin_ia32_pternlogq512_maskz ((__v8di)(__m512i)(A), \ + (__v8di)(__m512i)(B), (__v8di)(__m512i)(C), (int)(I), (__mmask8)(U))) +#define _mm512_ternarylogic_epi32(A, B, C, I) \ + ((__m512i) __builtin_ia32_pternlogd512_mask ((__v16si)(__m512i)(A), \ + (__v16si)(__m512i)(B), (__v16si)(__m512i)(C), (int)(I), \ + (__mmask16)-1)) +#define _mm512_mask_ternarylogic_epi32(A, U, B, C, I) \ + ((__m512i) __builtin_ia32_pternlogd512_mask 
((__v16si)(__m512i)(A), \ + (__v16si)(__m512i)(B), (__v16si)(__m512i)(C), (int)(I), \ + (__mmask16)(U))) +#define _mm512_maskz_ternarylogic_epi32(U, A, B, C, I) \ + ((__m512i) __builtin_ia32_pternlogd512_maskz ((__v16si)(__m512i)(A), \ + (__v16si)(__m512i)(B), (__v16si)(__m512i)(C), (int)(I), \ + (__mmask16)(U))) +#endif + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_rcp14_pd (__m512d __A) +{ + return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) -1); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_rcp14_pd (__m512d __W, __mmask8 __U, __m512d __A) +{ + return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A, + (__v8df) __W, + (__mmask8) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_rcp14_pd (__mmask8 __U, __m512d __A) +{ + return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_rcp14_ps (__m512 __A) +{ + return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) -1); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_rcp14_ps (__m512 __W, __mmask16 __U, __m512 __A) +{ + return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A, + (__v16sf) __W, + (__mmask16) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_rcp14_ps (__mmask16 __U, __m512 __A) +{ + return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rcp14_sd (__m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_rcp14sd ((__v2df) __A, + (__v2df) __B); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rcp14_ss (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_rcp14ss ((__v4sf) __A, + (__v4sf) __B); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_rsqrt14_pd (__m512d __A) +{ + return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) -1); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_rsqrt14_pd (__m512d __W, __mmask8 __U, __m512d __A) +{ + return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A, + (__v8df) __W, + (__mmask8) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_rsqrt14_pd (__mmask8 __U, __m512d __A) +{ + return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_rsqrt14_ps (__m512 __A) +{ + return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) -1); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_rsqrt14_ps (__m512 __W, __mmask16 __U, __m512 __A) +{ + 
return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A, + (__v16sf) __W, + (__mmask16) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_rsqrt14_ps (__mmask16 __U, __m512 __A) +{ + return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rsqrt14_sd (__m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_rsqrt14sd ((__v2df) __A, + (__v2df) __B); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rsqrt14_ss (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_rsqrt14ss ((__v4sf) __A, + (__v4sf) __B); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sqrt_round_pd (__m512d __A, const int __R) +{ + return (__m512d) __builtin_ia32_sqrtpd512_mask ((__v8df) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) -1, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_sqrt_round_pd (__m512d __W, __mmask8 __U, __m512d __A, + const int __R) +{ + return (__m512d) __builtin_ia32_sqrtpd512_mask ((__v8df) __A, + (__v8df) __W, + (__mmask8) __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_sqrt_round_pd (__mmask8 __U, __m512d __A, const int __R) +{ + return (__m512d) __builtin_ia32_sqrtpd512_mask ((__v8df) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sqrt_round_ps (__m512 __A, const int __R) +{ + return (__m512) __builtin_ia32_sqrtps512_mask ((__v16sf) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) -1, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_sqrt_round_ps (__m512 __W, __mmask16 __U, __m512 __A, const int __R) +{ + return (__m512) __builtin_ia32_sqrtps512_mask ((__v16sf) __A, + (__v16sf) __W, + (__mmask16) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_sqrt_round_ps (__mmask16 __U, __m512 __A, const int __R) +{ + return (__m512) __builtin_ia32_sqrtps512_mask ((__v16sf) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sqrt_round_sd (__m128d __A, __m128d __B, const int __R) +{ + return (__m128d) __builtin_ia32_sqrtsd_round ((__v2df) __B, + (__v2df) __A, + __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sqrt_round_ss (__m128 __A, __m128 __B, const int __R) +{ + return (__m128) __builtin_ia32_sqrtss_round ((__v4sf) __B, + (__v4sf) __A, + __R); +} +#else +#define _mm512_sqrt_round_pd(A, C) \ + (__m512d)__builtin_ia32_sqrtpd512_mask(A, (__v8df)_mm512_setzero_pd(), -1, C) + +#define _mm512_mask_sqrt_round_pd(W, U, A, C) \ + (__m512d)__builtin_ia32_sqrtpd512_mask(A, W, U, C) + +#define _mm512_maskz_sqrt_round_pd(U, A, C) \ + (__m512d)__builtin_ia32_sqrtpd512_mask(A, (__v8df)_mm512_setzero_pd(), U, C) + +#define _mm512_sqrt_round_ps(A, C) \ + (__m512)__builtin_ia32_sqrtps512_mask(A, (__v16sf)_mm512_setzero_ps(), -1, C) + +#define 
_mm512_mask_sqrt_round_ps(W, U, A, C) \ + (__m512)__builtin_ia32_sqrtps512_mask(A, W, U, C) + +#define _mm512_maskz_sqrt_round_ps(U, A, C) \ + (__m512)__builtin_ia32_sqrtps512_mask(A, (__v16sf)_mm512_setzero_ps(), U, C) + +#define _mm_sqrt_round_sd(A, B, C) \ + (__m128d)__builtin_ia32_sqrtsd_round(A, B, C) + +#define _mm_sqrt_round_ss(A, B, C) \ + (__m128)__builtin_ia32_sqrtss_round(A, B, C) +#endif + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtepi8_epi32 (__m128i __A) +{ + return (__m512i) __builtin_ia32_pmovsxbd512_mask ((__v16qi) __A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepi8_epi32 (__m512i __W, __mmask16 __U, __m128i __A) +{ + return (__m512i) __builtin_ia32_pmovsxbd512_mask ((__v16qi) __A, + (__v16si) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtepi8_epi32 (__mmask16 __U, __m128i __A) +{ + return (__m512i) __builtin_ia32_pmovsxbd512_mask ((__v16qi) __A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtepi8_epi64 (__m128i __A) +{ + return (__m512i) __builtin_ia32_pmovsxbq512_mask ((__v16qi) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepi8_epi64 (__m512i __W, __mmask8 __U, __m128i __A) +{ + return (__m512i) __builtin_ia32_pmovsxbq512_mask ((__v16qi) __A, + (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtepi8_epi64 (__mmask8 __U, __m128i __A) +{ + return (__m512i) __builtin_ia32_pmovsxbq512_mask ((__v16qi) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtepi16_epi32 (__m256i __A) +{ + return (__m512i) __builtin_ia32_pmovsxwd512_mask ((__v16hi) __A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepi16_epi32 (__m512i __W, __mmask16 __U, __m256i __A) +{ + return (__m512i) __builtin_ia32_pmovsxwd512_mask ((__v16hi) __A, + (__v16si) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtepi16_epi32 (__mmask16 __U, __m256i __A) +{ + return (__m512i) __builtin_ia32_pmovsxwd512_mask ((__v16hi) __A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtepi16_epi64 (__m128i __A) +{ + return (__m512i) __builtin_ia32_pmovsxwq512_mask ((__v8hi) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepi16_epi64 (__m512i __W, __mmask8 __U, __m128i __A) +{ + return (__m512i) __builtin_ia32_pmovsxwq512_mask ((__v8hi) __A, + (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtepi16_epi64 (__mmask8 __U, __m128i __A) 
+{ + return (__m512i) __builtin_ia32_pmovsxwq512_mask ((__v8hi) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtepi32_epi64 (__m256i __X) +{ + return (__m512i) __builtin_ia32_pmovsxdq512_mask ((__v8si) __X, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepi32_epi64 (__m512i __W, __mmask8 __U, __m256i __X) +{ + return (__m512i) __builtin_ia32_pmovsxdq512_mask ((__v8si) __X, + (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtepi32_epi64 (__mmask8 __U, __m256i __X) +{ + return (__m512i) __builtin_ia32_pmovsxdq512_mask ((__v8si) __X, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtepu8_epi32 (__m128i __A) +{ + return (__m512i) __builtin_ia32_pmovzxbd512_mask ((__v16qi) __A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepu8_epi32 (__m512i __W, __mmask16 __U, __m128i __A) +{ + return (__m512i) __builtin_ia32_pmovzxbd512_mask ((__v16qi) __A, + (__v16si) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtepu8_epi32 (__mmask16 __U, __m128i __A) +{ + return (__m512i) __builtin_ia32_pmovzxbd512_mask ((__v16qi) __A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtepu8_epi64 (__m128i __A) +{ + return (__m512i) __builtin_ia32_pmovzxbq512_mask ((__v16qi) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepu8_epi64 (__m512i __W, __mmask8 __U, __m128i __A) +{ + return (__m512i) __builtin_ia32_pmovzxbq512_mask ((__v16qi) __A, + (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtepu8_epi64 (__mmask8 __U, __m128i __A) +{ + return (__m512i) __builtin_ia32_pmovzxbq512_mask ((__v16qi) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtepu16_epi32 (__m256i __A) +{ + return (__m512i) __builtin_ia32_pmovzxwd512_mask ((__v16hi) __A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepu16_epi32 (__m512i __W, __mmask16 __U, __m256i __A) +{ + return (__m512i) __builtin_ia32_pmovzxwd512_mask ((__v16hi) __A, + (__v16si) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtepu16_epi32 (__mmask16 __U, __m256i __A) +{ + return (__m512i) __builtin_ia32_pmovzxwd512_mask ((__v16hi) __A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtepu16_epi64 (__m128i __A) +{ + return 
(__m512i) __builtin_ia32_pmovzxwq512_mask ((__v8hi) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepu16_epi64 (__m512i __W, __mmask8 __U, __m128i __A) +{ + return (__m512i) __builtin_ia32_pmovzxwq512_mask ((__v8hi) __A, + (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtepu16_epi64 (__mmask8 __U, __m128i __A) +{ + return (__m512i) __builtin_ia32_pmovzxwq512_mask ((__v8hi) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtepu32_epi64 (__m256i __X) +{ + return (__m512i) __builtin_ia32_pmovzxdq512_mask ((__v8si) __X, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepu32_epi64 (__m512i __W, __mmask8 __U, __m256i __X) +{ + return (__m512i) __builtin_ia32_pmovzxdq512_mask ((__v8si) __X, + (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtepu32_epi64 (__mmask8 __U, __m256i __X) +{ + return (__m512i) __builtin_ia32_pmovzxdq512_mask ((__v8si) __X, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_add_round_pd (__m512d __A, __m512d __B, const int __R) +{ + return (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) -1, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_add_round_pd (__m512d __W, __mmask8 __U, __m512d __A, + __m512d __B, const int __R) +{ + return (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __W, + (__mmask8) __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_add_round_pd (__mmask8 __U, __m512d __A, __m512d __B, + const int __R) +{ + return (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_add_round_ps (__m512 __A, __m512 __B, const int __R) +{ + return (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) -1, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_add_round_ps (__m512 __W, __mmask16 __U, __m512 __A, + __m512 __B, const int __R) +{ + return (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __W, + (__mmask16) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_add_round_ps (__mmask16 __U, __m512 __A, __m512 __B, const int __R) +{ + return (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sub_round_pd (__m512d __A, __m512d __B, const int __R) 
+{ + return (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) -1, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_sub_round_pd (__m512d __W, __mmask8 __U, __m512d __A, + __m512d __B, const int __R) +{ + return (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __W, + (__mmask8) __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_sub_round_pd (__mmask8 __U, __m512d __A, __m512d __B, + const int __R) +{ + return (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sub_round_ps (__m512 __A, __m512 __B, const int __R) +{ + return (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) -1, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_sub_round_ps (__m512 __W, __mmask16 __U, __m512 __A, + __m512 __B, const int __R) +{ + return (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __W, + (__mmask16) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_sub_round_ps (__mmask16 __U, __m512 __A, __m512 __B, const int __R) +{ + return (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, __R); +} +#else +#define _mm512_add_round_pd(A, B, C) \ + (__m512d)__builtin_ia32_addpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), -1, C) + +#define _mm512_mask_add_round_pd(W, U, A, B, C) \ + (__m512d)__builtin_ia32_addpd512_mask(A, B, W, U, C) + +#define _mm512_maskz_add_round_pd(U, A, B, C) \ + (__m512d)__builtin_ia32_addpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), U, C) + +#define _mm512_add_round_ps(A, B, C) \ + (__m512)__builtin_ia32_addps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), -1, C) + +#define _mm512_mask_add_round_ps(W, U, A, B, C) \ + (__m512)__builtin_ia32_addps512_mask(A, B, W, U, C) + +#define _mm512_maskz_add_round_ps(U, A, B, C) \ + (__m512)__builtin_ia32_addps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), U, C) + +#define _mm512_sub_round_pd(A, B, C) \ + (__m512d)__builtin_ia32_subpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), -1, C) + +#define _mm512_mask_sub_round_pd(W, U, A, B, C) \ + (__m512d)__builtin_ia32_subpd512_mask(A, B, W, U, C) + +#define _mm512_maskz_sub_round_pd(U, A, B, C) \ + (__m512d)__builtin_ia32_subpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), U, C) + +#define _mm512_sub_round_ps(A, B, C) \ + (__m512)__builtin_ia32_subps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), -1, C) + +#define _mm512_mask_sub_round_ps(W, U, A, B, C) \ + (__m512)__builtin_ia32_subps512_mask(A, B, W, U, C) + +#define _mm512_maskz_sub_round_ps(U, A, B, C) \ + (__m512)__builtin_ia32_subps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), U, C) +#endif + +#ifdef __OPTIMIZE__ +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mul_round_pd (__m512d __A, __m512d __B, const int __R) +{ + return (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) -1, __R); +} + +extern __inline __m512d 
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_mul_round_pd (__m512d __W, __mmask8 __U, __m512d __A, + __m512d __B, const int __R) +{ + return (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __W, + (__mmask8) __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_mul_round_pd (__mmask8 __U, __m512d __A, __m512d __B, + const int __R) +{ + return (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mul_round_ps (__m512 __A, __m512 __B, const int __R) +{ + return (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) -1, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_mul_round_ps (__m512 __W, __mmask16 __U, __m512 __A, + __m512 __B, const int __R) +{ + return (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __W, + (__mmask16) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_mul_round_ps (__mmask16 __U, __m512 __A, __m512 __B, const int __R) +{ + return (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_div_round_pd (__m512d __M, __m512d __V, const int __R) +{ + return (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __M, + (__v8df) __V, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) -1, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_div_round_pd (__m512d __W, __mmask8 __U, __m512d __M, + __m512d __V, const int __R) +{ + return (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __M, + (__v8df) __V, + (__v8df) __W, + (__mmask8) __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_div_round_pd (__mmask8 __U, __m512d __M, __m512d __V, + const int __R) +{ + return (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __M, + (__v8df) __V, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_div_round_ps (__m512 __A, __m512 __B, const int __R) +{ + return (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) -1, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_div_round_ps (__m512 __W, __mmask16 __U, __m512 __A, + __m512 __B, const int __R) +{ + return (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __W, + (__mmask16) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_div_round_ps (__mmask16 __U, __m512 __A, __m512 __B, const int __R) +{ + return (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm_mul_round_sd (__m128d __A, __m128d __B, const int __R) +{ + return (__m128d) __builtin_ia32_mulsd_round ((__v2df) __A, + (__v2df) __B, + __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mul_round_ss (__m128 __A, __m128 __B, const int __R) +{ + return (__m128) __builtin_ia32_mulss_round ((__v4sf) __A, + (__v4sf) __B, + __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_div_round_sd (__m128d __A, __m128d __B, const int __R) +{ + return (__m128d) __builtin_ia32_divsd_round ((__v2df) __A, + (__v2df) __B, + __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_div_round_ss (__m128 __A, __m128 __B, const int __R) +{ + return (__m128) __builtin_ia32_divss_round ((__v4sf) __A, + (__v4sf) __B, + __R); +} + +#else +#define _mm512_mul_round_pd(A, B, C) \ + (__m512d)__builtin_ia32_mulpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), -1, C) + +#define _mm512_mask_mul_round_pd(W, U, A, B, C) \ + (__m512d)__builtin_ia32_mulpd512_mask(A, B, W, U, C) + +#define _mm512_maskz_mul_round_pd(U, A, B, C) \ + (__m512d)__builtin_ia32_mulpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), U, C) + +#define _mm512_mul_round_ps(A, B, C) \ + (__m512)__builtin_ia32_mulps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), -1, C) + +#define _mm512_mask_mul_round_ps(W, U, A, B, C) \ + (__m512)__builtin_ia32_mulps512_mask(A, B, W, U, C) + +#define _mm512_maskz_mul_round_ps(U, A, B, C) \ + (__m512)__builtin_ia32_mulps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), U, C) + +#define _mm512_div_round_pd(A, B, C) \ + (__m512d)__builtin_ia32_divpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), -1, C) + +#define _mm512_mask_div_round_pd(W, U, A, B, C) \ + (__m512d)__builtin_ia32_divpd512_mask(A, B, W, U, C) + +#define _mm512_maskz_div_round_pd(U, A, B, C) \ + (__m512d)__builtin_ia32_divpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), U, C) + +#define _mm512_div_round_ps(A, B, C) \ + (__m512)__builtin_ia32_divps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), -1, C) + +#define _mm512_mask_div_round_ps(W, U, A, B, C) \ + (__m512)__builtin_ia32_divps512_mask(A, B, W, U, C) + +#define _mm512_maskz_div_round_ps(U, A, B, C) \ + (__m512)__builtin_ia32_divps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), U, C) + +#define _mm_mul_round_sd(A, B, C) \ + (__m128d)__builtin_ia32_mulsd_round(A, B, C) + +#define _mm_mul_round_ss(A, B, C) \ + (__m128)__builtin_ia32_mulss_round(A, B, C) + +#define _mm_div_round_sd(A, B, C) \ + (__m128d)__builtin_ia32_divsd_round(A, B, C) + +#define _mm_div_round_ss(A, B, C) \ + (__m128)__builtin_ia32_divss_round(A, B, C) +#endif + +#ifdef __OPTIMIZE__ +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_max_round_pd (__m512d __A, __m512d __B, const int __R) +{ + return (__m512d) __builtin_ia32_maxpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) -1, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_max_round_pd (__m512d __W, __mmask8 __U, __m512d __A, + __m512d __B, const int __R) +{ + return (__m512d) __builtin_ia32_maxpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __W, + (__mmask8) __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_max_round_pd (__mmask8 __U, __m512d __A, __m512d __B, + const int __R) +{ + return (__m512d) 
__builtin_ia32_maxpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_max_round_ps (__m512 __A, __m512 __B, const int __R) +{ + return (__m512) __builtin_ia32_maxps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) -1, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_max_round_ps (__m512 __W, __mmask16 __U, __m512 __A, + __m512 __B, const int __R) +{ + return (__m512) __builtin_ia32_maxps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __W, + (__mmask16) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_max_round_ps (__mmask16 __U, __m512 __A, __m512 __B, const int __R) +{ + return (__m512) __builtin_ia32_maxps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_min_round_pd (__m512d __A, __m512d __B, const int __R) +{ + return (__m512d) __builtin_ia32_minpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) -1, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_min_round_pd (__m512d __W, __mmask8 __U, __m512d __A, + __m512d __B, const int __R) +{ + return (__m512d) __builtin_ia32_minpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __W, + (__mmask8) __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_min_round_pd (__mmask8 __U, __m512d __A, __m512d __B, + const int __R) +{ + return (__m512d) __builtin_ia32_minpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_min_round_ps (__m512 __A, __m512 __B, const int __R) +{ + return (__m512) __builtin_ia32_minps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) -1, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_min_round_ps (__m512 __W, __mmask16 __U, __m512 __A, + __m512 __B, const int __R) +{ + return (__m512) __builtin_ia32_minps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __W, + (__mmask16) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_min_round_ps (__mmask16 __U, __m512 __A, __m512 __B, const int __R) +{ + return (__m512) __builtin_ia32_minps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, __R); +} +#else +#define _mm512_max_round_pd(A, B, R) \ + (__m512d)__builtin_ia32_maxpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), -1, R) + +#define _mm512_mask_max_round_pd(W, U, A, B, R) \ + (__m512d)__builtin_ia32_maxpd512_mask(A, B, W, U, R) + +#define _mm512_maskz_max_round_pd(U, A, B, R) \ + (__m512d)__builtin_ia32_maxpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), U, R) + +#define _mm512_max_round_ps(A, B, R) \ + (__m512)__builtin_ia32_maxps512_mask(A, B, (__v16sf)_mm512_setzero_pd(), -1, R) + +#define _mm512_mask_max_round_ps(W, U, A, B, R) \ + (__m512)__builtin_ia32_maxps512_mask(A, B, W, U, R) 
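[Illustrative note, not part of the patch: both the inline functions above and these macro fallbacks need the rounding/SAE argument to be a compile-time constant; the macro forms are there so that the immediate still reaches the builtin when __OPTIMIZE__ is not defined and the always_inline wrappers are not folded. A minimal usage sketch, assuming -mavx512f and the _MM_FROUND_* constants from the SSE4.1 header that <immintrin.h> pulls in; the wrapper names below are invented for the example.]

#include <immintrin.h>

__m512d
add_rne_noexc (__m512d a, __m512d b)
{
  /* Add with round-to-nearest-even and all exceptions suppressed.  */
  return _mm512_add_round_pd (a, b,
			      _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}

__m512
max_cur (__m512 a, __m512 b)
{
  /* VMAXPS takes only an SAE operand, so use the current direction.  */
  return _mm512_max_round_ps (a, b, _MM_FROUND_CUR_DIRECTION);
}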
+ +#define _mm512_maskz_max_round_ps(U, A, B, R) \ + (__m512)__builtin_ia32_maxps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), U, R) + +#define _mm512_min_round_pd(A, B, R) \ + (__m512d)__builtin_ia32_minpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), -1, R) + +#define _mm512_mask_min_round_pd(W, U, A, B, R) \ + (__m512d)__builtin_ia32_minpd512_mask(A, B, W, U, R) + +#define _mm512_maskz_min_round_pd(U, A, B, R) \ + (__m512d)__builtin_ia32_minpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), U, R) + +#define _mm512_min_round_ps(A, B, R) \ + (__m512)__builtin_ia32_minps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), -1, R) + +#define _mm512_mask_min_round_ps(W, U, A, B, R) \ + (__m512)__builtin_ia32_minps512_mask(A, B, W, U, R) + +#define _mm512_maskz_min_round_ps(U, A, B, R) \ + (__m512)__builtin_ia32_minps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), U, R) +#endif + +#ifdef __OPTIMIZE__ +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_scalef_round_pd (__m512d __A, __m512d __B, const int __R) +{ + return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) -1, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_scalef_round_pd (__m512d __W, __mmask8 __U, __m512d __A, + __m512d __B, const int __R) +{ + return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __W, + (__mmask8) __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_scalef_round_pd (__mmask8 __U, __m512d __A, __m512d __B, + const int __R) +{ + return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_scalef_round_ps (__m512 __A, __m512 __B, const int __R) +{ + return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) -1, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_scalef_round_ps (__m512 __W, __mmask16 __U, __m512 __A, + __m512 __B, const int __R) +{ + return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __W, + (__mmask16) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_scalef_round_ps (__mmask16 __U, __m512 __A, __m512 __B, + const int __R) +{ + return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_scalef_round_sd (__m128d __A, __m128d __B, const int __R) +{ + return (__m128d) __builtin_ia32_scalefsd_round ((__v2df) __A, + (__v2df) __B, + __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_scalef_round_ss (__m128 __A, __m128 __B, const int __R) +{ + return (__m128) __builtin_ia32_scalefss_round ((__v4sf) __A, + (__v4sf) __B, + __R); +} +#else +#define _mm512_scalef_round_pd(A, B, C) \ + (__m512d)__builtin_ia32_scalefpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), -1, C) + +#define _mm512_mask_scalef_round_pd(W, U, A, B, C) \ + (__m512d)__builtin_ia32_scalefpd512_mask(A, 
B, W, U, C) + +#define _mm512_maskz_scalef_round_pd(U, A, B, C) \ + (__m512d)__builtin_ia32_scalefpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), U, C) + +#define _mm512_scalef_round_ps(A, B, C) \ + (__m512)__builtin_ia32_scalefps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), -1, C) + +#define _mm512_mask_scalef_round_ps(W, U, A, B, C) \ + (__m512)__builtin_ia32_scalefps512_mask(A, B, W, U, C) + +#define _mm512_maskz_scalef_round_ps(U, A, B, C) \ + (__m512)__builtin_ia32_scalefps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), U, C) + +#define _mm_scalef_round_sd(A, B, C) \ + (__m128d)__builtin_ia32_scalefsd_round(A, B, C) + +#define _mm_scalef_round_ss(A, B, C) \ + (__m128)__builtin_ia32_scalefss_round(A, B, C) +#endif + +#ifdef __OPTIMIZE__ +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fmadd_round_pd (__m512d __A, __m512d __B, __m512d __C, const int __R) +{ + return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) -1, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fmadd_round_pd (__m512d __A, __mmask8 __U, __m512d __B, + __m512d __C, const int __R) +{ + return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fmadd_round_pd (__m512d __A, __m512d __B, __m512d __C, + __mmask8 __U, const int __R) +{ + return (__m512d) __builtin_ia32_vfmaddpd512_mask3 ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fmadd_round_pd (__mmask8 __U, __m512d __A, __m512d __B, + __m512d __C, const int __R) +{ + return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fmadd_round_ps (__m512 __A, __m512 __B, __m512 __C, const int __R) +{ + return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) -1, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fmadd_round_ps (__m512 __A, __mmask16 __U, __m512 __B, + __m512 __C, const int __R) +{ + return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fmadd_round_ps (__m512 __A, __m512 __B, __m512 __C, + __mmask16 __U, const int __R) +{ + return (__m512) __builtin_ia32_vfmaddps512_mask3 ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fmadd_round_ps (__mmask16 __U, __m512 __A, __m512 __B, + __m512 __C, const int __R) +{ + return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fmsub_round_pd (__m512d __A, __m512d __B, __m512d __C, const int __R) +{ + return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, + (__v8df) __B, + 
-(__v8df) __C, + (__mmask8) -1, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fmsub_round_pd (__m512d __A, __mmask8 __U, __m512d __B, + __m512d __C, const int __R) +{ + return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, + (__v8df) __B, + -(__v8df) __C, + (__mmask8) __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fmsub_round_pd (__m512d __A, __m512d __B, __m512d __C, + __mmask8 __U, const int __R) +{ + return (__m512d) __builtin_ia32_vfmsubpd512_mask3 ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fmsub_round_pd (__mmask8 __U, __m512d __A, __m512d __B, + __m512d __C, const int __R) +{ + return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A, + (__v8df) __B, + -(__v8df) __C, + (__mmask8) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fmsub_round_ps (__m512 __A, __m512 __B, __m512 __C, const int __R) +{ + return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, + (__v16sf) __B, + -(__v16sf) __C, + (__mmask16) -1, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fmsub_round_ps (__m512 __A, __mmask16 __U, __m512 __B, + __m512 __C, const int __R) +{ + return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, + (__v16sf) __B, + -(__v16sf) __C, + (__mmask16) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fmsub_round_ps (__m512 __A, __m512 __B, __m512 __C, + __mmask16 __U, const int __R) +{ + return (__m512) __builtin_ia32_vfmsubps512_mask3 ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fmsub_round_ps (__mmask16 __U, __m512 __A, __m512 __B, + __m512 __C, const int __R) +{ + return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A, + (__v16sf) __B, + -(__v16sf) __C, + (__mmask16) __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fmaddsub_round_pd (__m512d __A, __m512d __B, __m512d __C, const int __R) +{ + return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) -1, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fmaddsub_round_pd (__m512d __A, __mmask8 __U, __m512d __B, + __m512d __C, const int __R) +{ + return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fmaddsub_round_pd (__m512d __A, __m512d __B, __m512d __C, + __mmask8 __U, const int __R) +{ + return (__m512d) __builtin_ia32_vfmaddsubpd512_mask3 ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fmaddsub_round_pd (__mmask8 __U, __m512d __A, __m512d __B, + __m512d __C, const int __R) +{ + return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A, + (__v8df) __B, + 
(__v8df) __C, + (__mmask8) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fmaddsub_round_ps (__m512 __A, __m512 __B, __m512 __C, const int __R) +{ + return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) -1, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fmaddsub_round_ps (__m512 __A, __mmask16 __U, __m512 __B, + __m512 __C, const int __R) +{ + return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fmaddsub_round_ps (__m512 __A, __m512 __B, __m512 __C, + __mmask16 __U, const int __R) +{ + return (__m512) __builtin_ia32_vfmaddsubps512_mask3 ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fmaddsub_round_ps (__mmask16 __U, __m512 __A, __m512 __B, + __m512 __C, const int __R) +{ + return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fmsubadd_round_pd (__m512d __A, __m512d __B, __m512d __C, const int __R) +{ + return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A, + (__v8df) __B, + -(__v8df) __C, + (__mmask8) -1, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fmsubadd_round_pd (__m512d __A, __mmask8 __U, __m512d __B, + __m512d __C, const int __R) +{ + return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A, + (__v8df) __B, + -(__v8df) __C, + (__mmask8) __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fmsubadd_round_pd (__m512d __A, __m512d __B, __m512d __C, + __mmask8 __U, const int __R) +{ + return (__m512d) __builtin_ia32_vfmsubaddpd512_mask3 ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fmsubadd_round_pd (__mmask8 __U, __m512d __A, __m512d __B, + __m512d __C, const int __R) +{ + return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A, + (__v8df) __B, + -(__v8df) __C, + (__mmask8) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fmsubadd_round_ps (__m512 __A, __m512 __B, __m512 __C, const int __R) +{ + return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A, + (__v16sf) __B, + -(__v16sf) __C, + (__mmask16) -1, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fmsubadd_round_ps (__m512 __A, __mmask16 __U, __m512 __B, + __m512 __C, const int __R) +{ + return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A, + (__v16sf) __B, + -(__v16sf) __C, + (__mmask16) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fmsubadd_round_ps (__m512 __A, __m512 __B, __m512 __C, + __mmask16 __U, const int __R) +{ + return (__m512) __builtin_ia32_vfmsubaddps512_mask3 ((__v16sf) __A, + 
(__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fmsubadd_round_ps (__mmask16 __U, __m512 __A, __m512 __B, + __m512 __C, const int __R) +{ + return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A, + (__v16sf) __B, + -(__v16sf) __C, + (__mmask16) __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fnmadd_round_pd (__m512d __A, __m512d __B, __m512d __C, const int __R) +{ + return (__m512d) __builtin_ia32_vfmaddpd512_mask (-(__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) -1, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fnmadd_round_pd (__m512d __A, __mmask8 __U, __m512d __B, + __m512d __C, const int __R) +{ + return (__m512d) __builtin_ia32_vfnmaddpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fnmadd_round_pd (__m512d __A, __m512d __B, __m512d __C, + __mmask8 __U, const int __R) +{ + return (__m512d) __builtin_ia32_vfmaddpd512_mask3 (-(__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fnmadd_round_pd (__mmask8 __U, __m512d __A, __m512d __B, + __m512d __C, const int __R) +{ + return (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fnmadd_round_ps (__m512 __A, __m512 __B, __m512 __C, const int __R) +{ + return (__m512) __builtin_ia32_vfmaddps512_mask (-(__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) -1, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fnmadd_round_ps (__m512 __A, __mmask16 __U, __m512 __B, + __m512 __C, const int __R) +{ + return (__m512) __builtin_ia32_vfnmaddps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fnmadd_round_ps (__m512 __A, __m512 __B, __m512 __C, + __mmask16 __U, const int __R) +{ + return (__m512) __builtin_ia32_vfmaddps512_mask3 (-(__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fnmadd_round_ps (__mmask16 __U, __m512 __A, __m512 __B, + __m512 __C, const int __R) +{ + return (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fnmsub_round_pd (__m512d __A, __m512d __B, __m512d __C, const int __R) +{ + return (__m512d) __builtin_ia32_vfmaddpd512_mask (-(__v8df) __A, + (__v8df) __B, + -(__v8df) __C, + (__mmask8) -1, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fnmsub_round_pd (__m512d __A, __mmask8 __U, __m512d __B, + __m512d __C, const int __R) +{ + return (__m512d) __builtin_ia32_vfnmsubpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) 
__C, + (__mmask8) __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fnmsub_round_pd (__m512d __A, __m512d __B, __m512d __C, + __mmask8 __U, const int __R) +{ + return (__m512d) __builtin_ia32_vfnmsubpd512_mask3 ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fnmsub_round_pd (__mmask8 __U, __m512d __A, __m512d __B, + __m512d __C, const int __R) +{ + return (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A, + (__v8df) __B, + -(__v8df) __C, + (__mmask8) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fnmsub_round_ps (__m512 __A, __m512 __B, __m512 __C, const int __R) +{ + return (__m512) __builtin_ia32_vfmaddps512_mask (-(__v16sf) __A, + (__v16sf) __B, + -(__v16sf) __C, + (__mmask16) -1, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fnmsub_round_ps (__m512 __A, __mmask16 __U, __m512 __B, + __m512 __C, const int __R) +{ + return (__m512) __builtin_ia32_vfnmsubps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fnmsub_round_ps (__m512 __A, __m512 __B, __m512 __C, + __mmask16 __U, const int __R) +{ + return (__m512) __builtin_ia32_vfnmsubps512_mask3 ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fnmsub_round_ps (__mmask16 __U, __m512 __A, __m512 __B, + __m512 __C, const int __R) +{ + return (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A, + (__v16sf) __B, + -(__v16sf) __C, + (__mmask16) __U, __R); +} +#else +#define _mm512_fmadd_round_pd(A, B, C, R) \ + (__m512d)__builtin_ia32_vfmaddpd512_mask(A, B, C, -1, R) + +#define _mm512_mask_fmadd_round_pd(A, U, B, C, R) \ + (__m512d)__builtin_ia32_vfmaddpd512_mask(A, B, C, U, R) + +#define _mm512_mask3_fmadd_round_pd(A, B, C, U, R) \ + (__m512d)__builtin_ia32_vfmaddpd512_mask3(A, B, C, U, R) + +#define _mm512_maskz_fmadd_round_pd(U, A, B, C, R) \ + (__m512d)__builtin_ia32_vfmaddpd512_maskz(A, B, C, U, R) + +#define _mm512_fmadd_round_ps(A, B, C, R) \ + (__m512)__builtin_ia32_vfmaddps512_mask(A, B, C, -1, R) + +#define _mm512_mask_fmadd_round_ps(A, U, B, C, R) \ + (__m512)__builtin_ia32_vfmaddps512_mask(A, B, C, U, R) + +#define _mm512_mask3_fmadd_round_ps(A, B, C, U, R) \ + (__m512)__builtin_ia32_vfmaddps512_mask3(A, B, C, U, R) + +#define _mm512_maskz_fmadd_round_ps(U, A, B, C, R) \ + (__m512)__builtin_ia32_vfmaddps512_maskz(A, B, C, U, R) + +#define _mm512_fmsub_round_pd(A, B, C, R) \ + (__m512d)__builtin_ia32_vfmaddpd512_mask(A, B, -(C), -1, R) + +#define _mm512_mask_fmsub_round_pd(A, U, B, C, R) \ + (__m512d)__builtin_ia32_vfmaddpd512_mask(A, B, -(C), U, R) + +#define _mm512_mask3_fmsub_round_pd(A, B, C, U, R) \ + (__m512d)__builtin_ia32_vfmsubpd512_mask3(A, B, C, U, R) + +#define _mm512_maskz_fmsub_round_pd(U, A, B, C, R) \ + (__m512d)__builtin_ia32_vfmaddpd512_maskz(A, B, -(C), U, R) + +#define _mm512_fmsub_round_ps(A, B, C, R) \ + (__m512)__builtin_ia32_vfmaddps512_mask(A, B, -(C), -1, R) + +#define _mm512_mask_fmsub_round_ps(A, U, B, C, R) \ + (__m512)__builtin_ia32_vfmaddps512_mask(A, B, -(C), U, R) + 
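[Illustrative note, not part of the patch: in these fallback macros the fmsub/fnmadd/fnmsub variants are built by negating the A and/or C operands of the plain fmadd builtins; variants that would otherwise have to negate their own pass-through operand (the mask3 forms of fmsub/fmsubadd and the merge-masked fnmadd/fnmsub forms) go through dedicated vfmsub*/vfnm* builtins so the unmodified accumulator can still serve as the per-element pass-through. A minimal sketch of that masking behaviour, assuming -mavx512f; the wrapper name is invented for the example.]

#include <immintrin.h>

__m512d
masked_fma (__m512d a, __m512d b, __m512d c, __mmask8 k)
{
  /* Lanes whose mask bit is set get a*b + c; the rest keep a's element.  */
  return _mm512_mask_fmadd_round_pd (a, k, b, c, _MM_FROUND_CUR_DIRECTION);
}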
+#define _mm512_mask3_fmsub_round_ps(A, B, C, U, R) \
+    (__m512)__builtin_ia32_vfmsubps512_mask3(A, B, C, U, R)
+
+#define _mm512_maskz_fmsub_round_ps(U, A, B, C, R) \
+    (__m512)__builtin_ia32_vfmaddps512_maskz(A, B, -(C), U, R)
+
+#define _mm512_fmaddsub_round_pd(A, B, C, R) \
+    (__m512d)__builtin_ia32_vfmaddsubpd512_mask(A, B, C, -1, R)
+
+#define _mm512_mask_fmaddsub_round_pd(A, U, B, C, R) \
+    (__m512d)__builtin_ia32_vfmaddsubpd512_mask(A, B, C, U, R)
+
+#define _mm512_mask3_fmaddsub_round_pd(A, B, C, U, R) \
+    (__m512d)__builtin_ia32_vfmaddsubpd512_mask3(A, B, C, U, R)
+
+#define _mm512_maskz_fmaddsub_round_pd(U, A, B, C, R) \
+    (__m512d)__builtin_ia32_vfmaddsubpd512_maskz(A, B, C, U, R)
+
+#define _mm512_fmaddsub_round_ps(A, B, C, R) \
+    (__m512)__builtin_ia32_vfmaddsubps512_mask(A, B, C, -1, R)
+
+#define _mm512_mask_fmaddsub_round_ps(A, U, B, C, R) \
+    (__m512)__builtin_ia32_vfmaddsubps512_mask(A, B, C, U, R)
+
+#define _mm512_mask3_fmaddsub_round_ps(A, B, C, U, R) \
+    (__m512)__builtin_ia32_vfmaddsubps512_mask3(A, B, C, U, R)
+
+#define _mm512_maskz_fmaddsub_round_ps(U, A, B, C, R) \
+    (__m512)__builtin_ia32_vfmaddsubps512_maskz(A, B, C, U, R)
+
+#define _mm512_fmsubadd_round_pd(A, B, C, R) \
+    (__m512d)__builtin_ia32_vfmaddsubpd512_mask(A, B, -(C), -1, R)
+
+#define _mm512_mask_fmsubadd_round_pd(A, U, B, C, R) \
+    (__m512d)__builtin_ia32_vfmaddsubpd512_mask(A, B, -(C), U, R)
+
+#define _mm512_mask3_fmsubadd_round_pd(A, B, C, U, R) \
+    (__m512d)__builtin_ia32_vfmsubaddpd512_mask3(A, B, C, U, R)
+
+#define _mm512_maskz_fmsubadd_round_pd(U, A, B, C, R) \
+    (__m512d)__builtin_ia32_vfmaddsubpd512_maskz(A, B, -(C), U, R)
+
+#define _mm512_fmsubadd_round_ps(A, B, C, R) \
+    (__m512)__builtin_ia32_vfmaddsubps512_mask(A, B, -(C), -1, R)
+
+#define _mm512_mask_fmsubadd_round_ps(A, U, B, C, R) \
+    (__m512)__builtin_ia32_vfmaddsubps512_mask(A, B, -(C), U, R)
+
+#define _mm512_mask3_fmsubadd_round_ps(A, B, C, U, R) \
+    (__m512)__builtin_ia32_vfmsubaddps512_mask3(A, B, C, U, R)
+
+#define _mm512_maskz_fmsubadd_round_ps(U, A, B, C, R) \
+    (__m512)__builtin_ia32_vfmaddsubps512_maskz(A, B, -(C), U, R)
+
+#define _mm512_fnmadd_round_pd(A, B, C, R) \
+    (__m512d)__builtin_ia32_vfmaddpd512_mask(-(A), B, C, -1, R)
+
+#define _mm512_mask_fnmadd_round_pd(A, U, B, C, R) \
+    (__m512d)__builtin_ia32_vfnmaddpd512_mask(A, B, C, U, R)
+
+#define _mm512_mask3_fnmadd_round_pd(A, B, C, U, R) \
+    (__m512d)__builtin_ia32_vfmaddpd512_mask3(-(A), B, C, U, R)
+
+#define _mm512_maskz_fnmadd_round_pd(U, A, B, C, R) \
+    (__m512d)__builtin_ia32_vfmaddpd512_maskz(-(A), B, C, U, R)
+
+#define _mm512_fnmadd_round_ps(A, B, C, R) \
+    (__m512)__builtin_ia32_vfmaddps512_mask(-(A), B, C, -1, R)
+
+#define _mm512_mask_fnmadd_round_ps(A, U, B, C, R) \
+    (__m512)__builtin_ia32_vfnmaddps512_mask(A, B, C, U, R)
+
+#define _mm512_mask3_fnmadd_round_ps(A, B, C, U, R) \
+    (__m512)__builtin_ia32_vfmaddps512_mask3(-(A), B, C, U, R)
+
+#define _mm512_maskz_fnmadd_round_ps(U, A, B, C, R) \
+    (__m512)__builtin_ia32_vfmaddps512_maskz(-(A), B, C, U, R)
+
+#define _mm512_fnmsub_round_pd(A, B, C, R) \
+    (__m512d)__builtin_ia32_vfmaddpd512_mask(-(A), B, -(C), -1, R)
+
+#define _mm512_mask_fnmsub_round_pd(A, U, B, C, R) \
+    (__m512d)__builtin_ia32_vfnmsubpd512_mask(A, B, C, U, R)
+
+#define _mm512_mask3_fnmsub_round_pd(A, B, C, U, R) \
+    (__m512d)__builtin_ia32_vfnmsubpd512_mask3(A, B, C, U, R)
+
+#define _mm512_maskz_fnmsub_round_pd(U, A, B, C, R) \
+    (__m512d)__builtin_ia32_vfmaddpd512_maskz(-(A), B, -(C), U, R)
+
+#define
_mm512_fnmsub_round_ps(A, B, C, R) \ + (__m512)__builtin_ia32_vfmaddps512_mask(-(A), B, -(C), -1, R) + +#define _mm512_mask_fnmsub_round_ps(A, U, B, C, R) \ + (__m512)__builtin_ia32_vfnmsubps512_mask(A, B, C, U, R) + +#define _mm512_mask3_fnmsub_round_ps(A, B, C, U, R) \ + (__m512)__builtin_ia32_vfnmsubps512_mask3(A, B, C, U, R) + +#define _mm512_maskz_fnmsub_round_ps(U, A, B, C, R) \ + (__m512)__builtin_ia32_vfmaddps512_maskz(-(A), B, -(C), U, R) +#endif + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_abs_epi64 (__m512i __A) +{ + return (__m512i) __builtin_ia32_pabsq512_mask ((__v8di) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_abs_epi64 (__m512i __W, __mmask8 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_pabsq512_mask ((__v8di) __A, + (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_abs_epi64 (__mmask8 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_pabsq512_mask ((__v8di) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_abs_epi32 (__m512i __A) +{ + return (__m512i) __builtin_ia32_pabsd512_mask ((__v16si) __A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_abs_epi32 (__m512i __W, __mmask16 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_pabsd512_mask ((__v16si) __A, + (__v16si) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_abs_epi32 (__mmask16 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_pabsd512_mask ((__v16si) __A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_broadcastss_ps (__m128 __A) +{ + __v16sf __O; + return (__m512) __builtin_ia32_broadcastss512 ((__v4sf) __A, __O, + (__mmask16) -1); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_broadcastss_ps (__m512 __O, __mmask16 __M, __m128 __A) +{ + return (__m512) __builtin_ia32_broadcastss512 ((__v4sf) __A, + (__v16sf) __O, __M); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_broadcastss_ps (__mmask16 __M, __m128 __A) +{ + return (__m512) __builtin_ia32_broadcastss512 ((__v4sf) __A, + (__v16sf) + _mm512_setzero_ps (), + __M); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_broadcastsd_pd (__m128d __A) +{ + __v8df __O; + return (__m512d) __builtin_ia32_broadcastsd512 ((__v2df) __A, __O, + (__mmask8) -1); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_broadcastsd_pd (__m512d __O, __mmask8 __M, __m128d __A) +{ + return (__m512d) __builtin_ia32_broadcastsd512 ((__v2df) __A, + (__v8df) __O, __M); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_broadcastsd_pd (__mmask8 __M, __m128d __A) +{ + return (__m512d) __builtin_ia32_broadcastsd512 ((__v2df) __A, + (__v8df) + 
_mm512_setzero_pd (), + __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_broadcastd_epi32 (__m128i __A) +{ +  __v16si __O; +  return (__m512i) __builtin_ia32_pbroadcastd512 ((__v4si) __A, __O, + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_broadcastd_epi32 (__m512i __O, __mmask16 __M, __m128i __A) +{ +  return (__m512i) __builtin_ia32_pbroadcastd512 ((__v4si) __A, + (__v16si) __O, __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_broadcastd_epi32 (__mmask16 __M, __m128i __A) +{ +  return (__m512i) __builtin_ia32_pbroadcastd512 ((__v4si) __A, + (__v16si) + _mm512_setzero_si512 (), + __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_set1_epi32 (int __A) +{ +  __v16si __O; +  return (__m512i) __builtin_ia32_pbroadcastd512_gpr_mask (__A, __O, + (__mmask16)(-1)); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_set1_epi32 (__m512i __O, __mmask16 __M, int __A) +{ +  return (__m512i) __builtin_ia32_pbroadcastd512_gpr_mask (__A, (__v16si) __O, + __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_set1_epi32 (__mmask16 __M, int __A) +{ +  return (__m512i) + __builtin_ia32_pbroadcastd512_gpr_mask (__A, + (__v16si) _mm512_setzero_si512 (), + __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_broadcastq_epi64 (__m128i __A) +{ +  __v8di __O; +  return (__m512i) __builtin_ia32_pbroadcastq512 ((__v2di) __A, __O, + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_broadcastq_epi64 (__m512i __O, __mmask8 __M, __m128i __A) +{ +  return (__m512i) __builtin_ia32_pbroadcastq512 ((__v2di) __A, + (__v8di) __O, __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A) +{ +  return (__m512i) __builtin_ia32_pbroadcastq512 ((__v2di) __A, + (__v8di) + _mm512_setzero_si512 (), + __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_set1_epi64 (long long __A) +{ +  __v8di __O; +#ifdef __x86_64__ +  return (__m512i) __builtin_ia32_pbroadcastq512_gpr_mask (__A, __O, + (__mmask8)(-1)); +#else +  return (__m512i) __builtin_ia32_pbroadcastq512_mem_mask (__A, __O, + (__mmask8)(-1)); +#endif +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_set1_epi64 (__m512i __O, __mmask8 __M, long long __A) +{ +#ifdef __x86_64__ +  return (__m512i) __builtin_ia32_pbroadcastq512_gpr_mask (__A, (__v8di) __O, + __M); +#else +  return (__m512i) __builtin_ia32_pbroadcastq512_mem_mask (__A, (__v8di) __O, + __M); +#endif +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_set1_epi64 (__mmask8 __M, long long __A) +{ +#ifdef __x86_64__ +  return (__m512i) + __builtin_ia32_pbroadcastq512_gpr_mask (__A, + (__v8di) _mm512_setzero_si512 (), + __M); +#else +  return (__m512i) + __builtin_ia32_pbroadcastq512_mem_mask (__A, + (__v8di) _mm512_setzero_si512 (), + __M); +#endif +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, 
__always_inline__, __artificial__)) +_mm512_broadcast_f32x4 (__m128 __A) +{ + __v16sf __O; + return (__m512) __builtin_ia32_broadcastf32x4_512 ((__v4sf) __A, __O, + (__mmask16) -1); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_broadcast_f32x4 (__m512 __O, __mmask16 __M, __m128 __A) +{ + return (__m512) __builtin_ia32_broadcastf32x4_512 ((__v4sf) __A, + (__v16sf) __O, + __M); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_broadcast_f32x4 (__mmask16 __M, __m128 __A) +{ + return (__m512) __builtin_ia32_broadcastf32x4_512 ((__v4sf) __A, + (__v16sf) + _mm512_setzero_ps (), + __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_broadcast_i32x4 (__m128i __A) +{ + __v16si __O; + return (__m512i) __builtin_ia32_broadcasti32x4_512 ((__v4si) __A, + __O, + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_broadcast_i32x4 (__m512i __O, __mmask16 __M, __m128i __A) +{ + return (__m512i) __builtin_ia32_broadcasti32x4_512 ((__v4si) __A, + (__v16si) __O, + __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_broadcast_i32x4 (__mmask16 __M, __m128i __A) +{ + return (__m512i) __builtin_ia32_broadcasti32x4_512 ((__v4si) __A, + (__v16si) + _mm512_setzero_si512 (), + __M); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_broadcast_f64x4 (__m256d __A) +{ + __v8df __O; + return (__m512d) __builtin_ia32_broadcastf64x4_512 ((__v4df) __A, + __O, + (__mmask8) -1); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_broadcast_f64x4 (__m512d __O, __mmask8 __M, __m256d __A) +{ + return (__m512d) __builtin_ia32_broadcastf64x4_512 ((__v4df) __A, + (__v8df) __O, + __M); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_broadcast_f64x4 (__mmask8 __M, __m256d __A) +{ + return (__m512d) __builtin_ia32_broadcastf64x4_512 ((__v4df) __A, + (__v8df) + _mm512_setzero_pd (), + __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_broadcast_i64x4 (__m256i __A) +{ + __v8di __O; + return (__m512i) __builtin_ia32_broadcasti64x4_512 ((__v4di) __A, + __O, + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_broadcast_i64x4 (__m512i __O, __mmask8 __M, __m256i __A) +{ + return (__m512i) __builtin_ia32_broadcasti64x4_512 ((__v4di) __A, + (__v8di) __O, + __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_broadcast_i64x4 (__mmask8 __M, __m256i __A) +{ + return (__m512i) __builtin_ia32_broadcasti64x4_512 ((__v4di) __A, + (__v8di) + _mm512_setzero_si512 (), + __M); +} + +typedef enum +{ + _MM_PERM_AAAA = 0x00, _MM_PERM_AAAB = 0x01, _MM_PERM_AAAC = 0x02, + _MM_PERM_AAAD = 0x03, _MM_PERM_AABA = 0x04, _MM_PERM_AABB = 0x05, + _MM_PERM_AABC = 0x06, _MM_PERM_AABD = 0x07, _MM_PERM_AACA = 0x08, + _MM_PERM_AACB = 0x09, _MM_PERM_AACC = 0x0A, _MM_PERM_AACD = 0x0B, + _MM_PERM_AADA = 0x0C, _MM_PERM_AADB = 0x0D, _MM_PERM_AADC = 0x0E, + _MM_PERM_AADD = 0x0F, _MM_PERM_ABAA = 0x10, _MM_PERM_ABAB = 0x11, + _MM_PERM_ABAC = 0x12, _MM_PERM_ABAD = 0x13, 
_MM_PERM_ABBA = 0x14, + _MM_PERM_ABBB = 0x15, _MM_PERM_ABBC = 0x16, _MM_PERM_ABBD = 0x17, + _MM_PERM_ABCA = 0x18, _MM_PERM_ABCB = 0x19, _MM_PERM_ABCC = 0x1A, + _MM_PERM_ABCD = 0x1B, _MM_PERM_ABDA = 0x1C, _MM_PERM_ABDB = 0x1D, + _MM_PERM_ABDC = 0x1E, _MM_PERM_ABDD = 0x1F, _MM_PERM_ACAA = 0x20, + _MM_PERM_ACAB = 0x21, _MM_PERM_ACAC = 0x22, _MM_PERM_ACAD = 0x23, + _MM_PERM_ACBA = 0x24, _MM_PERM_ACBB = 0x25, _MM_PERM_ACBC = 0x26, + _MM_PERM_ACBD = 0x27, _MM_PERM_ACCA = 0x28, _MM_PERM_ACCB = 0x29, + _MM_PERM_ACCC = 0x2A, _MM_PERM_ACCD = 0x2B, _MM_PERM_ACDA = 0x2C, + _MM_PERM_ACDB = 0x2D, _MM_PERM_ACDC = 0x2E, _MM_PERM_ACDD = 0x2F, + _MM_PERM_ADAA = 0x30, _MM_PERM_ADAB = 0x31, _MM_PERM_ADAC = 0x32, + _MM_PERM_ADAD = 0x33, _MM_PERM_ADBA = 0x34, _MM_PERM_ADBB = 0x35, + _MM_PERM_ADBC = 0x36, _MM_PERM_ADBD = 0x37, _MM_PERM_ADCA = 0x38, + _MM_PERM_ADCB = 0x39, _MM_PERM_ADCC = 0x3A, _MM_PERM_ADCD = 0x3B, + _MM_PERM_ADDA = 0x3C, _MM_PERM_ADDB = 0x3D, _MM_PERM_ADDC = 0x3E, + _MM_PERM_ADDD = 0x3F, _MM_PERM_BAAA = 0x40, _MM_PERM_BAAB = 0x41, + _MM_PERM_BAAC = 0x42, _MM_PERM_BAAD = 0x43, _MM_PERM_BABA = 0x44, + _MM_PERM_BABB = 0x45, _MM_PERM_BABC = 0x46, _MM_PERM_BABD = 0x47, + _MM_PERM_BACA = 0x48, _MM_PERM_BACB = 0x49, _MM_PERM_BACC = 0x4A, + _MM_PERM_BACD = 0x4B, _MM_PERM_BADA = 0x4C, _MM_PERM_BADB = 0x4D, + _MM_PERM_BADC = 0x4E, _MM_PERM_BADD = 0x4F, _MM_PERM_BBAA = 0x50, + _MM_PERM_BBAB = 0x51, _MM_PERM_BBAC = 0x52, _MM_PERM_BBAD = 0x53, + _MM_PERM_BBBA = 0x54, _MM_PERM_BBBB = 0x55, _MM_PERM_BBBC = 0x56, + _MM_PERM_BBBD = 0x57, _MM_PERM_BBCA = 0x58, _MM_PERM_BBCB = 0x59, + _MM_PERM_BBCC = 0x5A, _MM_PERM_BBCD = 0x5B, _MM_PERM_BBDA = 0x5C, + _MM_PERM_BBDB = 0x5D, _MM_PERM_BBDC = 0x5E, _MM_PERM_BBDD = 0x5F, + _MM_PERM_BCAA = 0x60, _MM_PERM_BCAB = 0x61, _MM_PERM_BCAC = 0x62, + _MM_PERM_BCAD = 0x63, _MM_PERM_BCBA = 0x64, _MM_PERM_BCBB = 0x65, + _MM_PERM_BCBC = 0x66, _MM_PERM_BCBD = 0x67, _MM_PERM_BCCA = 0x68, + _MM_PERM_BCCB = 0x69, _MM_PERM_BCCC = 0x6A, _MM_PERM_BCCD = 0x6B, + _MM_PERM_BCDA = 0x6C, _MM_PERM_BCDB = 0x6D, _MM_PERM_BCDC = 0x6E, + _MM_PERM_BCDD = 0x6F, _MM_PERM_BDAA = 0x70, _MM_PERM_BDAB = 0x71, + _MM_PERM_BDAC = 0x72, _MM_PERM_BDAD = 0x73, _MM_PERM_BDBA = 0x74, + _MM_PERM_BDBB = 0x75, _MM_PERM_BDBC = 0x76, _MM_PERM_BDBD = 0x77, + _MM_PERM_BDCA = 0x78, _MM_PERM_BDCB = 0x79, _MM_PERM_BDCC = 0x7A, + _MM_PERM_BDCD = 0x7B, _MM_PERM_BDDA = 0x7C, _MM_PERM_BDDB = 0x7D, + _MM_PERM_BDDC = 0x7E, _MM_PERM_BDDD = 0x7F, _MM_PERM_CAAA = 0x80, + _MM_PERM_CAAB = 0x81, _MM_PERM_CAAC = 0x82, _MM_PERM_CAAD = 0x83, + _MM_PERM_CABA = 0x84, _MM_PERM_CABB = 0x85, _MM_PERM_CABC = 0x86, + _MM_PERM_CABD = 0x87, _MM_PERM_CACA = 0x88, _MM_PERM_CACB = 0x89, + _MM_PERM_CACC = 0x8A, _MM_PERM_CACD = 0x8B, _MM_PERM_CADA = 0x8C, + _MM_PERM_CADB = 0x8D, _MM_PERM_CADC = 0x8E, _MM_PERM_CADD = 0x8F, + _MM_PERM_CBAA = 0x90, _MM_PERM_CBAB = 0x91, _MM_PERM_CBAC = 0x92, + _MM_PERM_CBAD = 0x93, _MM_PERM_CBBA = 0x94, _MM_PERM_CBBB = 0x95, + _MM_PERM_CBBC = 0x96, _MM_PERM_CBBD = 0x97, _MM_PERM_CBCA = 0x98, + _MM_PERM_CBCB = 0x99, _MM_PERM_CBCC = 0x9A, _MM_PERM_CBCD = 0x9B, + _MM_PERM_CBDA = 0x9C, _MM_PERM_CBDB = 0x9D, _MM_PERM_CBDC = 0x9E, + _MM_PERM_CBDD = 0x9F, _MM_PERM_CCAA = 0xA0, _MM_PERM_CCAB = 0xA1, + _MM_PERM_CCAC = 0xA2, _MM_PERM_CCAD = 0xA3, _MM_PERM_CCBA = 0xA4, + _MM_PERM_CCBB = 0xA5, _MM_PERM_CCBC = 0xA6, _MM_PERM_CCBD = 0xA7, + _MM_PERM_CCCA = 0xA8, _MM_PERM_CCCB = 0xA9, _MM_PERM_CCCC = 0xAA, + _MM_PERM_CCCD = 0xAB, _MM_PERM_CCDA = 0xAC, _MM_PERM_CCDB = 0xAD, + _MM_PERM_CCDC = 0xAE, _MM_PERM_CCDD = 0xAF, _MM_PERM_CDAA = 
0xB0, + _MM_PERM_CDAB = 0xB1, _MM_PERM_CDAC = 0xB2, _MM_PERM_CDAD = 0xB3, + _MM_PERM_CDBA = 0xB4, _MM_PERM_CDBB = 0xB5, _MM_PERM_CDBC = 0xB6, + _MM_PERM_CDBD = 0xB7, _MM_PERM_CDCA = 0xB8, _MM_PERM_CDCB = 0xB9, + _MM_PERM_CDCC = 0xBA, _MM_PERM_CDCD = 0xBB, _MM_PERM_CDDA = 0xBC, + _MM_PERM_CDDB = 0xBD, _MM_PERM_CDDC = 0xBE, _MM_PERM_CDDD = 0xBF, + _MM_PERM_DAAA = 0xC0, _MM_PERM_DAAB = 0xC1, _MM_PERM_DAAC = 0xC2, + _MM_PERM_DAAD = 0xC3, _MM_PERM_DABA = 0xC4, _MM_PERM_DABB = 0xC5, + _MM_PERM_DABC = 0xC6, _MM_PERM_DABD = 0xC7, _MM_PERM_DACA = 0xC8, + _MM_PERM_DACB = 0xC9, _MM_PERM_DACC = 0xCA, _MM_PERM_DACD = 0xCB, + _MM_PERM_DADA = 0xCC, _MM_PERM_DADB = 0xCD, _MM_PERM_DADC = 0xCE, + _MM_PERM_DADD = 0xCF, _MM_PERM_DBAA = 0xD0, _MM_PERM_DBAB = 0xD1, + _MM_PERM_DBAC = 0xD2, _MM_PERM_DBAD = 0xD3, _MM_PERM_DBBA = 0xD4, + _MM_PERM_DBBB = 0xD5, _MM_PERM_DBBC = 0xD6, _MM_PERM_DBBD = 0xD7, + _MM_PERM_DBCA = 0xD8, _MM_PERM_DBCB = 0xD9, _MM_PERM_DBCC = 0xDA, + _MM_PERM_DBCD = 0xDB, _MM_PERM_DBDA = 0xDC, _MM_PERM_DBDB = 0xDD, + _MM_PERM_DBDC = 0xDE, _MM_PERM_DBDD = 0xDF, _MM_PERM_DCAA = 0xE0, + _MM_PERM_DCAB = 0xE1, _MM_PERM_DCAC = 0xE2, _MM_PERM_DCAD = 0xE3, + _MM_PERM_DCBA = 0xE4, _MM_PERM_DCBB = 0xE5, _MM_PERM_DCBC = 0xE6, + _MM_PERM_DCBD = 0xE7, _MM_PERM_DCCA = 0xE8, _MM_PERM_DCCB = 0xE9, + _MM_PERM_DCCC = 0xEA, _MM_PERM_DCCD = 0xEB, _MM_PERM_DCDA = 0xEC, + _MM_PERM_DCDB = 0xED, _MM_PERM_DCDC = 0xEE, _MM_PERM_DCDD = 0xEF, + _MM_PERM_DDAA = 0xF0, _MM_PERM_DDAB = 0xF1, _MM_PERM_DDAC = 0xF2, + _MM_PERM_DDAD = 0xF3, _MM_PERM_DDBA = 0xF4, _MM_PERM_DDBB = 0xF5, + _MM_PERM_DDBC = 0xF6, _MM_PERM_DDBD = 0xF7, _MM_PERM_DDCA = 0xF8, + _MM_PERM_DDCB = 0xF9, _MM_PERM_DDCC = 0xFA, _MM_PERM_DDCD = 0xFB, + _MM_PERM_DDDA = 0xFC, _MM_PERM_DDDB = 0xFD, _MM_PERM_DDDC = 0xFE, + _MM_PERM_DDDD = 0xFF +} _MM_PERM_ENUM; + +#ifdef __OPTIMIZE__ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_shuffle_epi32 (__m512i __A, _MM_PERM_ENUM __mask) +{ + return (__m512i) __builtin_ia32_pshufd512_mask ((__v16si) __A, + __mask, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_shuffle_epi32 (__m512i __W, __mmask16 __U, __m512i __A, + _MM_PERM_ENUM __mask) +{ + return (__m512i) __builtin_ia32_pshufd512_mask ((__v16si) __A, + __mask, + (__v16si) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_shuffle_epi32 (__mmask16 __U, __m512i __A, _MM_PERM_ENUM __mask) +{ + return (__m512i) __builtin_ia32_pshufd512_mask ((__v16si) __A, + __mask, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_shuffle_i64x2 (__m512i __A, __m512i __B, const int __imm) +{ + return (__m512i) __builtin_ia32_shuf_i64x2_mask ((__v8di) __A, + (__v8di) __B, __imm, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_shuffle_i64x2 (__m512i __W, __mmask8 __U, __m512i __A, + __m512i __B, const int __imm) +{ + return (__m512i) __builtin_ia32_shuf_i64x2_mask ((__v8di) __A, + (__v8di) __B, __imm, + (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_shuffle_i64x2 (__mmask8 __U, __m512i __A, __m512i __B, + 
const int __imm) +{ + return (__m512i) __builtin_ia32_shuf_i64x2_mask ((__v8di) __A, + (__v8di) __B, __imm, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_shuffle_i32x4 (__m512i __A, __m512i __B, const int __imm) +{ + return (__m512i) __builtin_ia32_shuf_i32x4_mask ((__v16si) __A, + (__v16si) __B, + __imm, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_shuffle_i32x4 (__m512i __W, __mmask16 __U, __m512i __A, + __m512i __B, const int __imm) +{ + return (__m512i) __builtin_ia32_shuf_i32x4_mask ((__v16si) __A, + (__v16si) __B, + __imm, + (__v16si) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_shuffle_i32x4 (__mmask16 __U, __m512i __A, __m512i __B, + const int __imm) +{ + return (__m512i) __builtin_ia32_shuf_i32x4_mask ((__v16si) __A, + (__v16si) __B, + __imm, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_shuffle_f64x2 (__m512d __A, __m512d __B, const int __imm) +{ + return (__m512d) __builtin_ia32_shuf_f64x2_mask ((__v8df) __A, + (__v8df) __B, __imm, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) -1); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_shuffle_f64x2 (__m512d __W, __mmask8 __U, __m512d __A, + __m512d __B, const int __imm) +{ + return (__m512d) __builtin_ia32_shuf_f64x2_mask ((__v8df) __A, + (__v8df) __B, __imm, + (__v8df) __W, + (__mmask8) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_shuffle_f64x2 (__mmask8 __U, __m512d __A, __m512d __B, + const int __imm) +{ + return (__m512d) __builtin_ia32_shuf_f64x2_mask ((__v8df) __A, + (__v8df) __B, __imm, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_shuffle_f32x4 (__m512 __A, __m512 __B, const int __imm) +{ + return (__m512) __builtin_ia32_shuf_f32x4_mask ((__v16sf) __A, + (__v16sf) __B, __imm, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) -1); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_shuffle_f32x4 (__m512 __W, __mmask16 __U, __m512 __A, + __m512 __B, const int __imm) +{ + return (__m512) __builtin_ia32_shuf_f32x4_mask ((__v16sf) __A, + (__v16sf) __B, __imm, + (__v16sf) __W, + (__mmask16) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_shuffle_f32x4 (__mmask16 __U, __m512 __A, __m512 __B, + const int __imm) +{ + return (__m512) __builtin_ia32_shuf_f32x4_mask ((__v16sf) __A, + (__v16sf) __B, __imm, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); +} + +#else +#define _mm512_shuffle_epi32(X, C) \ + ((__m512i) __builtin_ia32_pshufd512_mask ((__v16si)(__m512i)(X), (int)(C),\ + (__v16si)(__m512i)_mm512_setzero_si512 (),\ + (__mmask16)-1)) + +#define _mm512_mask_shuffle_epi32(W, U, X, C) \ + ((__m512i) __builtin_ia32_pshufd512_mask ((__v16si)(__m512i)(X), (int)(C),\ + (__v16si)(__m512i)(W),\ + (__mmask16)(U))) + +#define _mm512_maskz_shuffle_epi32(U, X, C) \ + ((__m512i) __builtin_ia32_pshufd512_mask 
((__v16si)(__m512i)(X), (int)(C),\ + (__v16si)(__m512i)_mm512_setzero_si512 (),\ + (__mmask16)(U))) + +#define _mm512_shuffle_i64x2(X, Y, C) \ + ((__m512i) __builtin_ia32_shuf_i64x2_mask ((__v8di)(__m512i)(X), \ + (__v8di)(__m512i)(Y), (int)(C),\ + (__v8di)(__m512i)_mm512_setzero_si512 (),\ + (__mmask8)-1)) + +#define _mm512_mask_shuffle_i64x2(W, U, X, Y, C) \ + ((__m512i) __builtin_ia32_shuf_i64x2_mask ((__v8di)(__m512i)(X), \ + (__v8di)(__m512i)(Y), (int)(C),\ + (__v8di)(__m512i)(W),\ + (__mmask8)(U))) + +#define _mm512_maskz_shuffle_i64x2(U, X, Y, C) \ + ((__m512i) __builtin_ia32_shuf_i64x2_mask ((__v8di)(__m512i)(X), \ + (__v8di)(__m512i)(Y), (int)(C),\ + (__v8di)(__m512i)_mm512_setzero_si512 (),\ + (__mmask8)(U))) + +#define _mm512_shuffle_i32x4(X, Y, C) \ + ((__m512i) __builtin_ia32_shuf_i32x4_mask ((__v16si)(__m512i)(X), \ + (__v16si)(__m512i)(Y), (int)(C),\ + (__v16si)(__m512i)_mm512_setzero_si512 (),\ + (__mmask16)-1)) + +#define _mm512_mask_shuffle_i32x4(W, U, X, Y, C) \ + ((__m512i) __builtin_ia32_shuf_i32x4_mask ((__v16si)(__m512i)(X), \ + (__v16si)(__m512i)(Y), (int)(C),\ + (__v16si)(__m512i)(W),\ + (__mmask16)(U))) + +#define _mm512_maskz_shuffle_i32x4(U, X, Y, C) \ + ((__m512i) __builtin_ia32_shuf_i32x4_mask ((__v16si)(__m512i)(X), \ + (__v16si)(__m512i)(Y), (int)(C),\ + (__v16si)(__m512i)_mm512_setzero_si512 (),\ + (__mmask16)(U))) + +#define _mm512_shuffle_f64x2(X, Y, C) \ + ((__m512d) __builtin_ia32_shuf_f64x2_mask ((__v8df)(__m512d)(X), \ + (__v8df)(__m512d)(Y), (int)(C),\ + (__v8df)(__m512d)_mm512_setzero_pd(),\ + (__mmask8)-1)) + +#define _mm512_mask_shuffle_f64x2(W, U, X, Y, C) \ + ((__m512d) __builtin_ia32_shuf_f64x2_mask ((__v8df)(__m512d)(X), \ + (__v8df)(__m512d)(Y), (int)(C),\ + (__v8df)(__m512d)(W),\ + (__mmask8)(U))) + +#define _mm512_maskz_shuffle_f64x2(U, X, Y, C) \ + ((__m512d) __builtin_ia32_shuf_f64x2_mask ((__v8df)(__m512d)(X), \ + (__v8df)(__m512d)(Y), (int)(C),\ + (__v8df)(__m512d)_mm512_setzero_pd(),\ + (__mmask8)(U))) + +#define _mm512_shuffle_f32x4(X, Y, C) \ + ((__m512) __builtin_ia32_shuf_f32x4_mask ((__v16sf)(__m512)(X), \ + (__v16sf)(__m512)(Y), (int)(C),\ + (__v16sf)(__m512)_mm512_setzero_ps(),\ + (__mmask16)-1)) + +#define _mm512_mask_shuffle_f32x4(W, U, X, Y, C) \ + ((__m512) __builtin_ia32_shuf_f32x4_mask ((__v16sf)(__m512)(X), \ + (__v16sf)(__m512)(Y), (int)(C),\ + (__v16sf)(__m512)(W),\ + (__mmask16)(U))) + +#define _mm512_maskz_shuffle_f32x4(U, X, Y, C) \ + ((__m512) __builtin_ia32_shuf_f32x4_mask ((__v16sf)(__m512)(X), \ + (__v16sf)(__m512)(Y), (int)(C),\ + (__v16sf)(__m512)_mm512_setzero_ps(),\ + (__mmask16)(U))) +#endif + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_rolv_epi32 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_prolvd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_rolv_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_prolvd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_rolv_epi32 (__mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_prolvd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} + +extern __inline __m512i 
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_rorv_epi32 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_prorvd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_rorv_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_prorvd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_rorv_epi32 (__mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_prorvd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_rolv_epi64 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_prolvq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_rolv_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_prolvq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_rolv_epi64 (__mmask8 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_prolvq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_rorv_epi64 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_prorvq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_rorv_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_prorvq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_rorv_epi64 (__mmask8 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_prorvq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} + +#ifdef __OPTIMIZE__ +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtt_roundpd_epi32 (__m512d __A, const int __R) +{ + return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) -1, __R); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtt_roundpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A, + const int __R) +{ + return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A, + (__v8si) __W, + (__mmask8) __U, __R); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtt_roundpd_epi32 (__mmask8 __U, __m512d __A, const int __R) +{ + return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U, __R); +} + 
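[Usage note, not part of the patch: a minimal sketch of how the write-mask and embedded-rounding conventions used by the intrinsics in this hunk are meant to be called. It assumes only <immintrin.h> and -mavx512f; the helper names are made up for illustration, and the _MM_FROUND_* constants come from the existing SSE4.1 headers.]

/* Sketch only -- not part of avx512fintrin.h.  Compile with -mavx512f.  */
#include <immintrin.h>

__m512i
rotate_lanes_or_keep (__m512i src, __mmask16 keep, __m512i v, __m512i counts)
{
  /* Per-element variable rotate; lanes with a zero bit in KEEP are
     copied unchanged from SRC (merge masking).  */
  return _mm512_mask_rolv_epi32 (src, keep, v, counts);
}

__m256i
truncate_pd_to_epi32 (__m512d v, __mmask8 keep)
{
  /* Zero masking plus an explicit rounding/SAE operand; the operand
     must be an integer constant such as _MM_FROUND_CUR_DIRECTION,
     which is why the header also provides the macro forms when
     __OPTIMIZE__ is not defined.  */
  return _mm512_maskz_cvtt_roundpd_epi32 (keep, v, _MM_FROUND_CUR_DIRECTION);
}

[The same unmasked/mask/maskz triple repeats for every conversion below, differing only in the builtin and the merge operand.]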
+extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtt_roundpd_epu32 (__m512d __A, const int __R) +{ + return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) -1, __R); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtt_roundpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A, + const int __R) +{ + return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A, + (__v8si) __W, + (__mmask8) __U, __R); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtt_roundpd_epu32 (__mmask8 __U, __m512d __A, const int __R) +{ + return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U, __R); +} +#else +#define _mm512_cvtt_roundpd_epi32(A, B) \ + ((__m256i)__builtin_ia32_cvttpd2dq512_mask(A, (__v8si)_mm256_setzero_si256(), -1, B)) + +#define _mm512_mask_cvtt_roundpd_epi32(W, U, A, B) \ + ((__m256i)__builtin_ia32_cvttpd2dq512_mask(A, (__v8si)(W), U, B)) + +#define _mm512_maskz_cvtt_roundpd_epi32(U, A, B) \ + ((__m256i)__builtin_ia32_cvttpd2dq512_mask(A, (__v8si)_mm256_setzero_si256(), U, B)) + +#define _mm512_cvtt_roundpd_epu32(A, B) \ + ((__m256i)__builtin_ia32_cvttpd2udq512_mask(A, (__v8si)_mm256_setzero_si256(), -1, B)) + +#define _mm512_mask_cvtt_roundpd_epu32(W, U, A, B) \ + ((__m256i)__builtin_ia32_cvttpd2udq512_mask(A, (__v8si)(W), U, B)) + +#define _mm512_maskz_cvtt_roundpd_epu32(U, A, B) \ + ((__m256i)__builtin_ia32_cvttpd2udq512_mask(A, (__v8si)_mm256_setzero_si256(), U, B)) +#endif + +#ifdef __OPTIMIZE__ +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundpd_epi32 (__m512d __A, const int __R) +{ + return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) -1, __R); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A, + const int __R) +{ + return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A, + (__v8si) __W, + (__mmask8) __U, __R); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundpd_epi32 (__mmask8 __U, __m512d __A, const int __R) +{ + return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U, __R); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundpd_epu32 (__m512d __A, const int __R) +{ + return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) -1, __R); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A, + const int __R) +{ + return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A, + (__v8si) __W, + (__mmask8) __U, __R); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundpd_epu32 (__mmask8 __U, __m512d __A, const int __R) +{ + return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U, __R); +} +#else +#define _mm512_cvt_roundpd_epi32(A, B) \ + 
((__m256i)__builtin_ia32_cvtpd2dq512_mask(A, (__v8si)_mm256_setzero_si256(), -1, B)) + +#define _mm512_mask_cvt_roundpd_epi32(W, U, A, B) \ + ((__m256i)__builtin_ia32_cvtpd2dq512_mask(A, (__v8si)(W), U, B)) + +#define _mm512_maskz_cvt_roundpd_epi32(U, A, B) \ + ((__m256i)__builtin_ia32_cvtpd2dq512_mask(A, (__v8si)_mm256_setzero_si256(), U, B)) + +#define _mm512_cvt_roundpd_epu32(A, B) \ + ((__m256i)__builtin_ia32_cvtpd2udq512_mask(A, (__v8si)_mm256_setzero_si256(), -1, B)) + +#define _mm512_mask_cvt_roundpd_epu32(W, U, A, B) \ + ((__m256i)__builtin_ia32_cvtpd2udq512_mask(A, (__v8si)(W), U, B)) + +#define _mm512_maskz_cvt_roundpd_epu32(U, A, B) \ + ((__m256i)__builtin_ia32_cvtpd2udq512_mask(A, (__v8si)_mm256_setzero_si256(), U, B)) +#endif + +#ifdef __OPTIMIZE__ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtt_roundps_epi32 (__m512 __A, const int __R) +{ + return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1, __R); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtt_roundps_epi32 (__m512i __W, __mmask16 __U, __m512 __A, + const int __R) +{ + return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A, + (__v16si) __W, + (__mmask16) __U, __R); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtt_roundps_epi32 (__mmask16 __U, __m512 __A, const int __R) +{ + return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U, __R); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtt_roundps_epu32 (__m512 __A, const int __R) +{ + return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1, __R); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtt_roundps_epu32 (__m512i __W, __mmask16 __U, __m512 __A, + const int __R) +{ + return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A, + (__v16si) __W, + (__mmask16) __U, __R); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtt_roundps_epu32 (__mmask16 __U, __m512 __A, const int __R) +{ + return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U, __R); +} +#else +#define _mm512_cvtt_roundps_epi32(A, B) \ + ((__m512i)__builtin_ia32_cvttps2dq512_mask(A, (__v16si)_mm512_setzero_si512 (), -1, B)) + +#define _mm512_mask_cvtt_roundps_epi32(W, U, A, B) \ + ((__m512i)__builtin_ia32_cvttps2dq512_mask(A, (__v16si)(W), U, B)) + +#define _mm512_maskz_cvtt_roundps_epi32(U, A, B) \ + ((__m512i)__builtin_ia32_cvttps2dq512_mask(A, (__v16si)_mm512_setzero_si512 (), U, B)) + +#define _mm512_cvtt_roundps_epu32(A, B) \ + ((__m512i)__builtin_ia32_cvttps2udq512_mask(A, (__v16si)_mm512_setzero_si512 (), -1, B)) + +#define _mm512_mask_cvtt_roundps_epu32(W, U, A, B) \ + ((__m512i)__builtin_ia32_cvttps2udq512_mask(A, (__v16si)(W), U, B)) + +#define _mm512_maskz_cvtt_roundps_epu32(U, A, B) \ + ((__m512i)__builtin_ia32_cvttps2udq512_mask(A, (__v16si)_mm512_setzero_si512 (), U, B)) +#endif + +#ifdef __OPTIMIZE__ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundps_epi32 (__m512 __A, 
const int __R) +{ + return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1, __R); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundps_epi32 (__m512i __W, __mmask16 __U, __m512 __A, + const int __R) +{ + return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A, + (__v16si) __W, + (__mmask16) __U, __R); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundps_epi32 (__mmask16 __U, __m512 __A, const int __R) +{ + return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U, __R); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundps_epu32 (__m512 __A, const int __R) +{ + return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1, __R); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundps_epu32 (__m512i __W, __mmask16 __U, __m512 __A, + const int __R) +{ + return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A, + (__v16si) __W, + (__mmask16) __U, __R); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundps_epu32 (__mmask16 __U, __m512 __A, const int __R) +{ + return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U, __R); +} +#else +#define _mm512_cvt_roundps_epi32(A, B) \ + ((__m512i)__builtin_ia32_cvtps2dq512_mask(A, (__v16si)_mm512_setzero_si512 (), -1, B)) + +#define _mm512_mask_cvt_roundps_epi32(W, U, A, B) \ + ((__m512i)__builtin_ia32_cvtps2dq512_mask(A, (__v16si)(W), U, B)) + +#define _mm512_maskz_cvt_roundps_epi32(U, A, B) \ + ((__m512i)__builtin_ia32_cvtps2dq512_mask(A, (__v16si)_mm512_setzero_si512 (), U, B)) + +#define _mm512_cvt_roundps_epu32(A, B) \ + ((__m512i)__builtin_ia32_cvtps2udq512_mask(A, (__v16si)_mm512_setzero_si512 (), -1, B)) + +#define _mm512_mask_cvt_roundps_epu32(W, U, A, B) \ + ((__m512i)__builtin_ia32_cvtps2udq512_mask(A, (__v16si)(W), U, B)) + +#define _mm512_maskz_cvt_roundps_epu32(U, A, B) \ + ((__m512i)__builtin_ia32_cvtps2udq512_mask(A, (__v16si)_mm512_setzero_si512 (), U, B)) +#endif + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtu32_sd (__m128d __A, unsigned __B) +{ + return (__m128d) __builtin_ia32_cvtusi2sd32 ((__v2df) __A, __B); +} + +#ifdef __x86_64__ +#ifdef __OPTIMIZE__ +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundu64_sd (__m128d __A, unsigned long long __B, const int __R) +{ + return (__m128d) __builtin_ia32_cvtusi2sd64 ((__v2df) __A, __B, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundi64_sd (__m128d __A, long long __B, const int __R) +{ + return (__m128d) __builtin_ia32_cvtsi2sd64 ((__v2df) __A, __B, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundsi64_sd (__m128d __A, long long __B, const int __R) +{ + return (__m128d) __builtin_ia32_cvtsi2sd64 ((__v2df) __A, __B, __R); +} +#else +#define _mm_cvt_roundu64_sd(A, B, C) \ + (__m128d)__builtin_ia32_cvtusi2sd64(A, B, C) + +#define 
_mm_cvt_roundi64_sd(A, B, C) \ + (__m128d)__builtin_ia32_cvtsi2sd64(A, B, C) + +#define _mm_cvt_roundsi64_sd(A, B, C) \ + (__m128d)__builtin_ia32_cvtsi2sd64(A, B, C) +#endif + +#endif + +#ifdef __OPTIMIZE__ +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundu32_ss (__m128 __A, unsigned __B, const int __R) +{ + return (__m128) __builtin_ia32_cvtusi2ss32 ((__v4sf) __A, __B, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundsi32_ss (__m128 __A, int __B, const int __R) +{ + return (__m128) __builtin_ia32_cvtsi2ss32 ((__v4sf) __A, __B, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundi32_ss (__m128 __A, int __B, const int __R) +{ + return (__m128) __builtin_ia32_cvtsi2ss32 ((__v4sf) __A, __B, __R); +} +#else +#define _mm_cvt_roundu32_ss(A, B, C) \ + (__m128)__builtin_ia32_cvtusi2ss32(A, B, C) + +#define _mm_cvt_roundi32_ss(A, B, C) \ + (__m128)__builtin_ia32_cvtsi2ss32(A, B, C) + +#define _mm_cvt_roundsi32_ss(A, B, C) \ + (__m128)__builtin_ia32_cvtsi2ss32(A, B, C) +#endif + +#ifdef __x86_64__ +#ifdef __OPTIMIZE__ +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundu64_ss (__m128 __A, unsigned long long __B, const int __R) +{ + return (__m128) __builtin_ia32_cvtusi2ss64 ((__v4sf) __A, __B, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundsi64_ss (__m128 __A, long long __B, const int __R) +{ + return (__m128) __builtin_ia32_cvtsi2ss64 ((__v4sf) __A, __B, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundi64_ss (__m128 __A, long long __B, const int __R) +{ + return (__m128) __builtin_ia32_cvtsi2ss64 ((__v4sf) __A, __B, __R); +} +#else +#define _mm_cvt_roundu64_ss(A, B, C) \ + (__m128)__builtin_ia32_cvtusi2ss64(A, B, C) + +#define _mm_cvt_roundi64_ss(A, B, C) \ + (__m128)__builtin_ia32_cvtsi2ss64(A, B, C) + +#define _mm_cvt_roundsi64_ss(A, B, C) \ + (__m128)__builtin_ia32_cvtsi2ss64(A, B, C) +#endif + +#endif + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtepi32_epi8 (__m512i __A) +{ + __v16qi __O; + return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A, __O, + (__mmask16) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A) +{ + return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A, + (__v16qi) __O, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtepi32_epi8 (__mmask16 __M, __m512i __A) +{ + return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A, + (__v16qi) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtsepi32_epi8 (__m512i __A) +{ + __v16qi __O; + return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A, __O, + (__mmask16) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtsepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A) +{ + return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A, + (__v16qi) __O, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, 
__always_inline__, __artificial__)) +_mm512_maskz_cvtsepi32_epi8 (__mmask16 __M, __m512i __A) +{ + return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A, + (__v16qi) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtusepi32_epi8 (__m512i __A) +{ + __v16qi __O; + return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A, __O, + (__mmask16) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtusepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A) +{ + return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A, + (__v16qi) __O, + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtusepi32_epi8 (__mmask16 __M, __m512i __A) +{ + return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A, + (__v16qi) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtepi32_epi16 (__m512i __A) +{ + __v16hi __O; + return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A, __O, + (__mmask16) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A) +{ + return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A, + (__v16hi) __O, __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtepi32_epi16 (__mmask16 __M, __m512i __A) +{ + return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A, + (__v16hi) + _mm256_setzero_si256 (), + __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtsepi32_epi16 (__m512i __A) +{ + __v16hi __O; + return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A, __O, + (__mmask16) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtsepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A) +{ + return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A, + (__v16hi) __O, __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtsepi32_epi16 (__mmask16 __M, __m512i __A) +{ + return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A, + (__v16hi) + _mm256_setzero_si256 (), + __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtusepi32_epi16 (__m512i __A) +{ + __v16hi __O; + return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A, __O, + (__mmask16) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtusepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A) +{ + return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A, + (__v16hi) __O, + __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtusepi32_epi16 (__mmask16 __M, __m512i __A) +{ + return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A, + (__v16hi) + _mm256_setzero_si256 (), + __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtepi64_epi32 (__m512i __A) +{ + __v8si __O; + return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A, 
__O, + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A) +{ + return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A, + (__v8si) __O, __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtepi64_epi32 (__mmask8 __M, __m512i __A) +{ + return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A, + (__v8si) + _mm256_setzero_si256 (), + __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtsepi64_epi32 (__m512i __A) +{ + __v8si __O; + return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A, __O, + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtsepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A) +{ + return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A, + (__v8si) __O, __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtsepi64_epi32 (__mmask8 __M, __m512i __A) +{ + return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A, + (__v8si) + _mm256_setzero_si256 (), + __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtusepi64_epi32 (__m512i __A) +{ + __v8si __O; + return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A, __O, + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtusepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A) +{ + return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A, + (__v8si) __O, __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtusepi64_epi32 (__mmask8 __M, __m512i __A) +{ + return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A, + (__v8si) + _mm256_setzero_si256 (), + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtepi64_epi16 (__m512i __A) +{ + __v8hi __O; + return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A, __O, + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A) +{ + return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A, + (__v8hi) __O, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtepi64_epi16 (__mmask8 __M, __m512i __A) +{ + return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A, + (__v8hi) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtsepi64_epi16 (__m512i __A) +{ + __v8hi __O; + return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A, __O, + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtsepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A) +{ + return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A, + (__v8hi) __O, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtsepi64_epi16 (__mmask8 __M, __m512i __A) +{ + return (__m128i) 
__builtin_ia32_pmovsqw512_mask ((__v8di) __A, + (__v8hi) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtusepi64_epi16 (__m512i __A) +{ + __v8hi __O; + return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A, __O, + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtusepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A) +{ + return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A, + (__v8hi) __O, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtusepi64_epi16 (__mmask8 __M, __m512i __A) +{ + return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A, + (__v8hi) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtepi64_epi8 (__m512i __A) +{ + __v16qi __O; + return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A, __O, + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A) +{ + return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A, + (__v16qi) __O, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtepi64_epi8 (__mmask8 __M, __m512i __A) +{ + return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A, + (__v16qi) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtsepi64_epi8 (__m512i __A) +{ + __v16qi __O; + return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A, __O, + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtsepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A) +{ + return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A, + (__v16qi) __O, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtsepi64_epi8 (__mmask8 __M, __m512i __A) +{ + return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A, + (__v16qi) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtusepi64_epi8 (__m512i __A) +{ + __v16qi __O; + return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A, __O, + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtusepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A) +{ + return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A, + (__v16qi) __O, + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtusepi64_epi8 (__mmask8 __M, __m512i __A) +{ + return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A, + (__v16qi) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtepi32_pd (__m256i __A) +{ + return (__m512d) __builtin_ia32_cvtdq2pd512_mask ((__v8si) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) -1); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepi32_pd 
(__m512d __W, __mmask8 __U, __m256i __A) +{ + return (__m512d) __builtin_ia32_cvtdq2pd512_mask ((__v8si) __A, + (__v8df) __W, + (__mmask8) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtepi32_pd (__mmask8 __U, __m256i __A) +{ + return (__m512d) __builtin_ia32_cvtdq2pd512_mask ((__v8si) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtepu32_pd (__m256i __A) +{ + return (__m512d) __builtin_ia32_cvtudq2pd512_mask ((__v8si) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) -1); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepu32_pd (__m512d __W, __mmask8 __U, __m256i __A) +{ + return (__m512d) __builtin_ia32_cvtudq2pd512_mask ((__v8si) __A, + (__v8df) __W, + (__mmask8) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtepu32_pd (__mmask8 __U, __m256i __A) +{ + return (__m512d) __builtin_ia32_cvtudq2pd512_mask ((__v8si) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundepi32_ps (__m512i __A, const int __R) +{ + return (__m512) __builtin_ia32_cvtdq2ps512_mask ((__v16si) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) -1, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundepi32_ps (__m512 __W, __mmask16 __U, __m512i __A, + const int __R) +{ + return (__m512) __builtin_ia32_cvtdq2ps512_mask ((__v16si) __A, + (__v16sf) __W, + (__mmask16) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundepi32_ps (__mmask16 __U, __m512i __A, const int __R) +{ + return (__m512) __builtin_ia32_cvtdq2ps512_mask ((__v16si) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundepu32_ps (__m512i __A, const int __R) +{ + return (__m512) __builtin_ia32_cvtudq2ps512_mask ((__v16si) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) -1, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundepu32_ps (__m512 __W, __mmask16 __U, __m512i __A, + const int __R) +{ + return (__m512) __builtin_ia32_cvtudq2ps512_mask ((__v16si) __A, + (__v16sf) __W, + (__mmask16) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundepu32_ps (__mmask16 __U, __m512i __A, const int __R) +{ + return (__m512) __builtin_ia32_cvtudq2ps512_mask ((__v16si) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, __R); +} + +#else +#define _mm512_cvt_roundepi32_ps(A, B) \ + (__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(A), (__v16sf)_mm512_setzero_ps(), -1, B) + +#define _mm512_mask_cvt_roundepi32_ps(W, U, A, B) \ + (__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(A), W, U, B) + +#define _mm512_maskz_cvt_roundepi32_ps(U, A, B) \ + (__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(A), (__v16sf)_mm512_setzero_ps(), U, B) + +#define _mm512_cvt_roundepu32_ps(A, B) \ + (__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(A), 
(__v16sf)_mm512_setzero_ps(), -1, B) + +#define _mm512_mask_cvt_roundepu32_ps(W, U, A, B) \ + (__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(A), W, U, B) + +#define _mm512_maskz_cvt_roundepu32_ps(U, A, B) \ + (__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(A), (__v16sf)_mm512_setzero_ps(), U, B) +#endif + +#ifdef __OPTIMIZE__ +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_extractf64x4_pd (__m512d __A, const int __imm) +{ + return (__m256d) __builtin_ia32_extractf64x4_mask ((__v8df) __A, + __imm, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) -1); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_extractf64x4_pd (__m256d __W, __mmask8 __U, __m512d __A, + const int __imm) +{ + return (__m256d) __builtin_ia32_extractf64x4_mask ((__v8df) __A, + __imm, + (__v4df) __W, + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_extractf64x4_pd (__mmask8 __U, __m512d __A, const int __imm) +{ + return (__m256d) __builtin_ia32_extractf64x4_mask ((__v8df) __A, + __imm, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_extractf32x4_ps (__m512 __A, const int __imm) +{ + return (__m128) __builtin_ia32_extractf32x4_mask ((__v16sf) __A, + __imm, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) -1); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_extractf32x4_ps (__m128 __W, __mmask8 __U, __m512 __A, + const int __imm) +{ + return (__m128) __builtin_ia32_extractf32x4_mask ((__v16sf) __A, + __imm, + (__v4sf) __W, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_extractf32x4_ps (__mmask8 __U, __m512 __A, const int __imm) +{ + return (__m128) __builtin_ia32_extractf32x4_mask ((__v16sf) __A, + __imm, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_extracti64x4_epi64 (__m512i __A, const int __imm) +{ + return (__m256i) __builtin_ia32_extracti64x4_mask ((__v8di) __A, + __imm, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_extracti64x4_epi64 (__m256i __W, __mmask8 __U, __m512i __A, + const int __imm) +{ + return (__m256i) __builtin_ia32_extracti64x4_mask ((__v8di) __A, + __imm, + (__v4di) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_extracti64x4_epi64 (__mmask8 __U, __m512i __A, const int __imm) +{ + return (__m256i) __builtin_ia32_extracti64x4_mask ((__v8di) __A, + __imm, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_extracti32x4_epi32 (__m512i __A, const int __imm) +{ + return (__m128i) __builtin_ia32_extracti32x4_mask ((__v16si) __A, + __imm, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_extracti32x4_epi32 (__m128i __W, __mmask8 __U, __m512i __A, + const int __imm) +{ + return (__m128i) __builtin_ia32_extracti32x4_mask 
((__v16si) __A, + __imm, + (__v4si) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_extracti32x4_epi32 (__mmask8 __U, __m512i __A, const int __imm) +{ + return (__m128i) __builtin_ia32_extracti32x4_mask ((__v16si) __A, + __imm, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} +#else + +#define _mm512_extractf64x4_pd(X, C) \ + ((__m256d) __builtin_ia32_extractf64x4_mask ((__v8df)(__m512d) (X), \ + (int) (C),\ + (__v4df)(__m256d)_mm256_setzero_pd(),\ + (__mmask8)-1)) + +#define _mm512_mask_extractf64x4_pd(W, U, X, C) \ + ((__m256d) __builtin_ia32_extractf64x4_mask ((__v8df)(__m512d) (X), \ + (int) (C),\ + (__v4df)(__m256d)(W),\ + (__mmask8)(U))) + +#define _mm512_maskz_extractf64x4_pd(U, X, C) \ + ((__m256d) __builtin_ia32_extractf64x4_mask ((__v8df)(__m512d) (X), \ + (int) (C),\ + (__v4df)(__m256d)_mm256_setzero_pd(),\ + (__mmask8)(U))) + +#define _mm512_extractf32x4_ps(X, C) \ + ((__m128) __builtin_ia32_extractf32x4_mask ((__v16sf)(__m512) (X), \ + (int) (C),\ + (__v4sf)(__m128)_mm_setzero_ps(),\ + (__mmask8)-1)) + +#define _mm512_mask_extractf32x4_ps(W, U, X, C) \ + ((__m128) __builtin_ia32_extractf32x4_mask ((__v16sf)(__m512) (X), \ + (int) (C),\ + (__v4sf)(__m128)(W),\ + (__mmask8)(U))) + +#define _mm512_maskz_extractf32x4_ps(U, X, C) \ + ((__m128) __builtin_ia32_extractf32x4_mask ((__v16sf)(__m512) (X), \ + (int) (C),\ + (__v4sf)(__m128)_mm_setzero_ps(),\ + (__mmask8)(U))) + +#define _mm512_extracti64x4_epi64(X, C) \ + ((__m256i) __builtin_ia32_extracti64x4_mask ((__v8di)(__m512i) (X), \ + (int) (C),\ + (__v4di)(__m256i)_mm256_setzero_si256 (),\ + (__mmask8)-1)) + +#define _mm512_mask_extracti64x4_epi64(W, U, X, C) \ + ((__m256i) __builtin_ia32_extracti64x4_mask ((__v8di)(__m512i) (X), \ + (int) (C),\ + (__v4di)(__m256i)(W),\ + (__mmask8)(U))) + +#define _mm512_maskz_extracti64x4_epi64(U, X, C) \ + ((__m256i) __builtin_ia32_extracti64x4_mask ((__v8di)(__m512i) (X), \ + (int) (C),\ + (__v4di)(__m256i)_mm256_setzero_si256 (),\ + (__mmask8)(U))) + +#define _mm512_extracti32x4_epi32(X, C) \ + ((__m128i) __builtin_ia32_extracti32x4_mask ((__v16si)(__m512i) (X), \ + (int) (C),\ + (__v4si)(__m128i)_mm_setzero_si128 (),\ + (__mmask8)-1)) + +#define _mm512_mask_extracti32x4_epi32(W, U, X, C) \ + ((__m128i) __builtin_ia32_extracti32x4_mask ((__v16si)(__m512i) (X), \ + (int) (C),\ + (__v4si)(__m128i)(W),\ + (__mmask8)(U))) + +#define _mm512_maskz_extracti32x4_epi32(U, X, C) \ + ((__m128i) __builtin_ia32_extracti32x4_mask ((__v16si)(__m512i) (X), \ + (int) (C),\ + (__v4si)(__m128i)_mm_setzero_si128 (),\ + (__mmask8)(U))) +#endif + +#ifdef __OPTIMIZE__ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_inserti32x4 (__m512i __A, __m128i __B, const int __imm) +{ + return (__m512i) __builtin_ia32_inserti32x4_mask ((__v16si) __A, + (__v4si) __B, + __imm, + (__v16si) __A, -1); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_insertf32x4 (__m512 __A, __m128 __B, const int __imm) +{ + return (__m512) __builtin_ia32_insertf32x4_mask ((__v16sf) __A, + (__v4sf) __B, + __imm, + (__v16sf) __A, -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_inserti64x4 (__m512i __A, __m256i __B, const int __imm) +{ + return (__m512i) __builtin_ia32_inserti64x4_mask ((__v8di) __A, + (__v4di) __B, + __imm, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) -1); +} 
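A short usage sketch (editorial note, not part of the patch): the 512-bit extract/insert intrinsics defined above take a compile-time immediate that selects a 128-bit or 256-bit lane, with 0 and 1 naming the low and high 256-bit halves. Assuming GCC with -mavx512f and <immintrin.h>, a hypothetical helper that swaps the two halves of an integer vector could look like:

#include <immintrin.h>

/* Swap the low and high 256-bit halves of a 512-bit integer vector
   using the extract/insert intrinsics above.  */
static __m512i
swap_halves (__m512i v)
{
  __m256i lo = _mm512_extracti64x4_epi64 (v, 0);
  __m256i hi = _mm512_extracti64x4_epi64 (v, 1);
  __m512i r = _mm512_inserti64x4 (v, hi, 0);   /* high half into lane 0 */
  return _mm512_inserti64x4 (r, lo, 1);        /* low half into lane 1 */
}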
+ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_inserti64x4 (__m512i __W, __mmask8 __U, __m512i __A, + __m256i __B, const int __imm) +{ + return (__m512i) __builtin_ia32_inserti64x4_mask ((__v8di) __A, + (__v4di) __B, + __imm, + (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_inserti64x4 (__mmask8 __U, __m512i __A, __m256i __B, + const int __imm) +{ + return (__m512i) __builtin_ia32_inserti64x4_mask ((__v8di) __A, + (__v4di) __B, + __imm, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_insertf64x4 (__m512d __A, __m256d __B, const int __imm) +{ + return (__m512d) __builtin_ia32_insertf64x4_mask ((__v8df) __A, + (__v4df) __B, + __imm, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) -1); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_insertf64x4 (__m512d __W, __mmask8 __U, __m512d __A, + __m256d __B, const int __imm) +{ + return (__m512d) __builtin_ia32_insertf64x4_mask ((__v8df) __A, + (__v4df) __B, + __imm, + (__v8df) __W, + (__mmask8) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_insertf64x4 (__mmask8 __U, __m512d __A, __m256d __B, + const int __imm) +{ + return (__m512d) __builtin_ia32_insertf64x4_mask ((__v8df) __A, + (__v4df) __B, + __imm, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); +} +#else +#define _mm512_insertf32x4(X, Y, C) \ + ((__m512) __builtin_ia32_insertf32x4_mask ((__v16sf)(__m512) (X), \ + (__v4sf)(__m128) (Y), (int) (C), (__v16sf)(__m512) (X), (__mmask16)(-1))) + +#define _mm512_inserti32x4(X, Y, C) \ + ((__m512i) __builtin_ia32_inserti32x4_mask ((__v16si)(__m512i) (X), \ + (__v4si)(__m128i) (Y), (int) (C), (__v16si)(__m512i) (X), (__mmask16)(-1))) + +#define _mm512_insertf64x4(X, Y, C) \ + ((__m512d) __builtin_ia32_insertf64x4_mask ((__v8df)(__m512d) (X), \ + (__v4df)(__m256d) (Y), (int) (C), \ + (__v8df)(__m512d)_mm512_setzero_pd(), \ + (__mmask8)-1)) + +#define _mm512_mask_insertf64x4(W, U, X, Y, C) \ + ((__m512d) __builtin_ia32_insertf64x4_mask ((__v8df)(__m512d) (X), \ + (__v4df)(__m256d) (Y), (int) (C), \ + (__v8df)(__m512d)(W), \ + (__mmask8)(U))) + +#define _mm512_maskz_insertf64x4(U, X, Y, C) \ + ((__m512d) __builtin_ia32_insertf64x4_mask ((__v8df)(__m512d) (X), \ + (__v4df)(__m256d) (Y), (int) (C), \ + (__v8df)(__m512d)_mm512_setzero_pd(), \ + (__mmask8)(U))) + +#define _mm512_inserti64x4(X, Y, C) \ + ((__m512i) __builtin_ia32_inserti64x4_mask ((__v8di)(__m512i) (X), \ + (__v4di)(__m256i) (Y), (int) (C), \ + (__v8di)(__m512i)_mm512_setzero_si512 (), \ + (__mmask8)-1)) + +#define _mm512_mask_inserti64x4(W, U, X, Y, C) \ + ((__m512i) __builtin_ia32_inserti64x4_mask ((__v8di)(__m512i) (X), \ + (__v4di)(__m256i) (Y), (int) (C),\ + (__v8di)(__m512i)(W),\ + (__mmask8)(U))) + +#define _mm512_maskz_inserti64x4(U, X, Y, C) \ + ((__m512i) __builtin_ia32_inserti64x4_mask ((__v8di)(__m512i) (X), \ + (__v4di)(__m256i) (Y), (int) (C), \ + (__v8di)(__m512i)_mm512_setzero_si512 (), \ + (__mmask8)(U))) +#endif + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_loadu_pd (void const *__P) +{ + return (__m512d) __builtin_ia32_loadupd512_mask ((const __v8df *) __P, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) -1); 
+} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_loadu_pd (__m512d __W, __mmask8 __U, void const *__P) +{ + return (__m512d) __builtin_ia32_loadupd512_mask ((const __v8df *) __P, + (__v8df) __W, + (__mmask8) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_loadu_pd (__mmask8 __U, void const *__P) +{ + return (__m512d) __builtin_ia32_loadupd512_mask ((const __v8df *) __P, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_storeu_pd (void *__P, __m512d __A) +{ + __builtin_ia32_storeupd512_mask ((__v8df *) __P, (__v8df) __A, + (__mmask8) -1); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_storeu_pd (void *__P, __mmask8 __U, __m512d __A) +{ + __builtin_ia32_storeupd512_mask ((__v8df *) __P, (__v8df) __A, + (__mmask8) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_loadu_ps (void const *__P) +{ + return (__m512) __builtin_ia32_loadups512_mask ((const __v16sf *) __P, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) -1); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_loadu_ps (__m512 __W, __mmask16 __U, void const *__P) +{ + return (__m512) __builtin_ia32_loadups512_mask ((const __v16sf *) __P, + (__v16sf) __W, + (__mmask16) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_loadu_ps (__mmask16 __U, void const *__P) +{ + return (__m512) __builtin_ia32_loadups512_mask ((const __v16sf *) __P, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_storeu_ps (void *__P, __m512 __A) +{ + __builtin_ia32_storeups512_mask ((__v16sf *) __P, (__v16sf) __A, + (__mmask16) -1); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_storeu_ps (void *__P, __mmask16 __U, __m512 __A) +{ + __builtin_ia32_storeups512_mask ((__v16sf *) __P, (__v16sf) __A, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_loadu_epi64 (__m512i __W, __mmask8 __U, void const *__P) +{ + return (__m512i) __builtin_ia32_loaddqudi512_mask ((const __v8di *) __P, + (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_loadu_epi64 (__mmask8 __U, void const *__P) +{ + return (__m512i) __builtin_ia32_loaddqudi512_mask ((const __v8di *) __P, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_storeu_epi64 (void *__P, __mmask8 __U, __m512i __A) +{ + __builtin_ia32_storedqudi512_mask ((__v8di *) __P, (__v8di) __A, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_loadu_si512 (void const *__P) +{ + return (__m512i) __builtin_ia32_loaddqusi512_mask ((const __v16si *) __P, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm512_mask_loadu_epi32 (__m512i __W, __mmask16 __U, void const *__P) +{ + return (__m512i) __builtin_ia32_loaddqusi512_mask ((const __v16si *) __P, + (__v16si) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_loadu_epi32 (__mmask16 __U, void const *__P) +{ + return (__m512i) __builtin_ia32_loaddqusi512_mask ((const __v16si *) __P, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_storeu_si512 (void *__P, __m512i __A) +{ + __builtin_ia32_storedqusi512_mask ((__v16si *) __P, (__v16si) __A, + (__mmask16) -1); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_storeu_epi32 (void *__P, __mmask16 __U, __m512i __A) +{ + __builtin_ia32_storedqusi512_mask ((__v16si *) __P, (__v16si) __A, + (__mmask16) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_permutevar_pd (__m512d __A, __m512i __C) +{ + return (__m512d) __builtin_ia32_vpermilvarpd512_mask ((__v8df) __A, + (__v8di) __C, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) -1); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_permutevar_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512i __C) +{ + return (__m512d) __builtin_ia32_vpermilvarpd512_mask ((__v8df) __A, + (__v8di) __C, + (__v8df) __W, + (__mmask8) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_permutevar_pd (__mmask8 __U, __m512d __A, __m512i __C) +{ + return (__m512d) __builtin_ia32_vpermilvarpd512_mask ((__v8df) __A, + (__v8di) __C, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_permutevar_ps (__m512 __A, __m512i __C) +{ + return (__m512) __builtin_ia32_vpermilvarps512_mask ((__v16sf) __A, + (__v16si) __C, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) -1); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_permutevar_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512i __C) +{ + return (__m512) __builtin_ia32_vpermilvarps512_mask ((__v16sf) __A, + (__v16si) __C, + (__v16sf) __W, + (__mmask16) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_permutevar_ps (__mmask16 __U, __m512 __A, __m512i __C) +{ + return (__m512) __builtin_ia32_vpermilvarps512_mask ((__v16sf) __A, + (__v16si) __C, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_permutex2var_epi64 (__m512i __A, __m512i __I, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpermt2varq512_mask ((__v8di) __I + /* idx */ , + (__v8di) __A, + (__v8di) __B, + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_permutex2var_epi64 (__m512i __A, __mmask8 __U, __m512i __I, + __m512i __B) +{ + return (__m512i) __builtin_ia32_vpermt2varq512_mask ((__v8di) __I + /* idx */ , + (__v8di) __A, + (__v8di) __B, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask2_permutex2var_epi64 (__m512i __A, 
__m512i __I, + __mmask8 __U, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpermi2varq512_mask ((__v8di) __A, + (__v8di) __I + /* idx */ , + (__v8di) __B, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_permutex2var_epi64 (__mmask8 __U, __m512i __A, + __m512i __I, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpermt2varq512_maskz ((__v8di) __I + /* idx */ , + (__v8di) __A, + (__v8di) __B, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_permutex2var_epi32 (__m512i __A, __m512i __I, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpermt2vard512_mask ((__v16si) __I + /* idx */ , + (__v16si) __A, + (__v16si) __B, + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_permutex2var_epi32 (__m512i __A, __mmask16 __U, + __m512i __I, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpermt2vard512_mask ((__v16si) __I + /* idx */ , + (__v16si) __A, + (__v16si) __B, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask2_permutex2var_epi32 (__m512i __A, __m512i __I, + __mmask16 __U, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpermi2vard512_mask ((__v16si) __A, + (__v16si) __I + /* idx */ , + (__v16si) __B, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_permutex2var_epi32 (__mmask16 __U, __m512i __A, + __m512i __I, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpermt2vard512_maskz ((__v16si) __I + /* idx */ , + (__v16si) __A, + (__v16si) __B, + (__mmask16) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_permutex2var_pd (__m512d __A, __m512i __I, __m512d __B) +{ + return (__m512d) __builtin_ia32_vpermt2varpd512_mask ((__v8di) __I + /* idx */ , + (__v8df) __A, + (__v8df) __B, + (__mmask8) -1); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_permutex2var_pd (__m512d __A, __mmask8 __U, __m512i __I, + __m512d __B) +{ + return (__m512d) __builtin_ia32_vpermt2varpd512_mask ((__v8di) __I + /* idx */ , + (__v8df) __A, + (__v8df) __B, + (__mmask8) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask2_permutex2var_pd (__m512d __A, __m512i __I, __mmask8 __U, + __m512d __B) +{ + return (__m512d) __builtin_ia32_vpermi2varpd512_mask ((__v8df) __A, + (__v8di) __I + /* idx */ , + (__v8df) __B, + (__mmask8) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_permutex2var_pd (__mmask8 __U, __m512d __A, __m512i __I, + __m512d __B) +{ + return (__m512d) __builtin_ia32_vpermt2varpd512_maskz ((__v8di) __I + /* idx */ , + (__v8df) __A, + (__v8df) __B, + (__mmask8) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_permutex2var_ps (__m512 __A, __m512i __I, __m512 __B) +{ + return (__m512) __builtin_ia32_vpermt2varps512_mask ((__v16si) __I + /* idx */ , + (__v16sf) __A, + (__v16sf) __B, + (__mmask16) -1); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_permutex2var_ps (__m512 __A, __mmask16 __U, __m512i __I, __m512 __B) +{ + return 
(__m512) __builtin_ia32_vpermt2varps512_mask ((__v16si) __I + /* idx */ , + (__v16sf) __A, + (__v16sf) __B, + (__mmask16) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask2_permutex2var_ps (__m512 __A, __m512i __I, __mmask16 __U, + __m512 __B) +{ + return (__m512) __builtin_ia32_vpermi2varps512_mask ((__v16sf) __A, + (__v16si) __I + /* idx */ , + (__v16sf) __B, + (__mmask16) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_permutex2var_ps (__mmask16 __U, __m512 __A, __m512i __I, + __m512 __B) +{ + return (__m512) __builtin_ia32_vpermt2varps512_maskz ((__v16si) __I + /* idx */ , + (__v16sf) __A, + (__v16sf) __B, + (__mmask16) __U); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_permute_pd (__m512d __X, const int __C) +{ + return (__m512d) __builtin_ia32_vpermilpd512_mask ((__v8df) __X, __C, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) -1); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_permute_pd (__m512d __W, __mmask8 __U, __m512d __X, const int __C) +{ + return (__m512d) __builtin_ia32_vpermilpd512_mask ((__v8df) __X, __C, + (__v8df) __W, + (__mmask8) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_permute_pd (__mmask8 __U, __m512d __X, const int __C) +{ + return (__m512d) __builtin_ia32_vpermilpd512_mask ((__v8df) __X, __C, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_permute_ps (__m512 __X, const int __C) +{ + return (__m512) __builtin_ia32_vpermilps512_mask ((__v16sf) __X, __C, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) -1); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_permute_ps (__m512 __W, __mmask16 __U, __m512 __X, const int __C) +{ + return (__m512) __builtin_ia32_vpermilps512_mask ((__v16sf) __X, __C, + (__v16sf) __W, + (__mmask16) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_permute_ps (__mmask16 __U, __m512 __X, const int __C) +{ + return (__m512) __builtin_ia32_vpermilps512_mask ((__v16sf) __X, __C, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); +} +#else +#define _mm512_permute_pd(X, C) \ + ((__m512d) __builtin_ia32_vpermilpd512_mask ((__v8df)(__m512d)(X), (int)(C), \ + (__v8df)(__m512d)(X), \ + (__mmask8)(-1))) + +#define _mm512_mask_permute_pd(W, U, X, C) \ + ((__m512d) __builtin_ia32_vpermilpd512_mask ((__v8df)(__m512d)(X), (int)(C), \ + (__v8df)(__m512d)(W), \ + (__mmask8)(U))) + +#define _mm512_maskz_permute_pd(U, X, C) \ + ((__m512d) __builtin_ia32_vpermilpd512_mask ((__v8df)(__m512d)(X), (int)(C), \ + (__v8df)(__m512d)_mm512_setzero_pd(), \ + (__mmask8)(U))) + +#define _mm512_permute_ps(X, C) \ + ((__m512) __builtin_ia32_vpermilps512_mask ((__v16sf)(__m512)(X), (int)(C), \ + (__v16sf)(__m512)(X), \ + (__mmask16)(-1))) + +#define _mm512_mask_permute_ps(W, U, X, C) \ + ((__m512) __builtin_ia32_vpermilps512_mask ((__v16sf)(__m512)(X), (int)(C), \ + (__v16sf)(__m512)(W), \ + (__mmask16)(U))) + +#define _mm512_maskz_permute_ps(U, X, C) \ + ((__m512) __builtin_ia32_vpermilps512_mask ((__v16sf)(__m512)(X), (int)(C), \ + (__v16sf)(__m512)_mm512_setzero_ps(), \ + 
(__mmask16)(U))) +#endif + +#ifdef __OPTIMIZE__ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_permutex_epi64 (__m512i __X, const int __I) +{ + return (__m512i) __builtin_ia32_permdi512_mask ((__v8di) __X, __I, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) (-1)); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_permutex_epi64 (__m512i __W, __mmask8 __M, + __m512i __X, const int __I) +{ + return (__m512i) __builtin_ia32_permdi512_mask ((__v8di) __X, __I, + (__v8di) __W, + (__mmask8) __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_permutex_epi64 (__mmask8 __M, __m512i __X, const int __I) +{ + return (__m512i) __builtin_ia32_permdi512_mask ((__v8di) __X, __I, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __M); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_permutex_pd (__m512d __X, const int __M) +{ + return (__m512d) __builtin_ia32_permdf512_mask ((__v8df) __X, __M, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) -1); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_permutex_pd (__m512d __W, __mmask8 __U, __m512d __X, const int __M) +{ + return (__m512d) __builtin_ia32_permdf512_mask ((__v8df) __X, __M, + (__v8df) __W, + (__mmask8) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_permutex_pd (__mmask8 __U, __m512d __X, const int __M) +{ + return (__m512d) __builtin_ia32_permdf512_mask ((__v8df) __X, __M, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); +} +#else +#define _mm512_permutex_pd(X, M) \ + ((__m512d) __builtin_ia32_permdf512_mask ((__v8df)(__m512d)(X), (int)(M), \ + (__v8df)(__m512d)(X), (__mmask8)-1)) + +#define _mm512_mask_permutex_pd(W, U, X, M) \ + ((__m512d) __builtin_ia32_permdf512_mask ((__v8df)(__m512d)(X), (int)(M), \ + (__v8df)(__m512d)(W), (__mmask8)(U))) + +#define _mm512_maskz_permutex_pd(U, X, M) \ + ((__m512d) __builtin_ia32_permdf512_mask ((__v8df)(__m512d)(X), (int)(M), \ + (__v8df)(__m512d)_mm512_setzero_pd(),\ + (__mmask8)(U))) + +#define _mm512_permutex_epi64(X, I) \ + ((__m512i) __builtin_ia32_permdi512_mask ((__v8di)(__m512i)(X), \ + (int)(I), \ + (__v8di)(__m512i)(X), \ + (__mmask8)(-1))) + +#define _mm512_maskz_permutex_epi64(M, X, I) \ + ((__m512i) __builtin_ia32_permdi512_mask ((__v8di)(__m512i)(X), \ + (int)(I), \ + (__v8di)(__m512i) \ + (_mm512_setzero_si512 ()),\ + (__mmask8)(M))) + +#define _mm512_mask_permutex_epi64(W, M, X, I) \ + ((__m512i) __builtin_ia32_permdi512_mask ((__v8di)(__m512i)(X), \ + (int)(I), \ + (__v8di)(__m512i)(W), \ + (__mmask8)(M))) +#endif + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_permutexvar_epi64 (__mmask8 __M, __m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_permvardi512_mask ((__v8di) __X, + (__v8di) __Y, + (__v8di) + _mm512_setzero_si512 (), + __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_permutexvar_epi64 (__m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_permvardi512_mask ((__v8di) __X, + (__v8di) __Y, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm512_mask_permutexvar_epi64 (__m512i __W, __mmask8 __M, __m512i __X, + __m512i __Y) +{ + return (__m512i) __builtin_ia32_permvardi512_mask ((__v8di) __X, + (__v8di) __Y, + (__v8di) __W, + __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_permutexvar_epi32 (__mmask16 __M, __m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_permvarsi512_mask ((__v16si) __X, + (__v16si) __Y, + (__v16si) + _mm512_setzero_si512 (), + __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_permutexvar_epi32 (__m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_permvarsi512_mask ((__v16si) __X, + (__v16si) __Y, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_permutexvar_epi32 (__m512i __W, __mmask16 __M, __m512i __X, + __m512i __Y) +{ + return (__m512i) __builtin_ia32_permvarsi512_mask ((__v16si) __X, + (__v16si) __Y, + (__v16si) __W, + __M); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_permutexvar_pd (__m512i __X, __m512d __Y) +{ + return (__m512d) __builtin_ia32_permvardf512_mask ((__v8df) __Y, + (__v8di) __X, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) -1); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_permutexvar_pd (__m512d __W, __mmask8 __U, __m512i __X, __m512d __Y) +{ + return (__m512d) __builtin_ia32_permvardf512_mask ((__v8df) __Y, + (__v8di) __X, + (__v8df) __W, + (__mmask8) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_permutexvar_pd (__mmask8 __U, __m512i __X, __m512d __Y) +{ + return (__m512d) __builtin_ia32_permvardf512_mask ((__v8df) __Y, + (__v8di) __X, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_permutexvar_ps (__m512i __X, __m512 __Y) +{ + return (__m512) __builtin_ia32_permvarsf512_mask ((__v16sf) __Y, + (__v16si) __X, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) -1); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_permutexvar_ps (__m512 __W, __mmask16 __U, __m512i __X, __m512 __Y) +{ + return (__m512) __builtin_ia32_permvarsf512_mask ((__v16sf) __Y, + (__v16si) __X, + (__v16sf) __W, + (__mmask16) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_permutexvar_ps (__mmask16 __U, __m512i __X, __m512 __Y) +{ + return (__m512) __builtin_ia32_permvarsf512_mask ((__v16sf) __Y, + (__v16si) __X, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_shuffle_ps (__m512 __M, __m512 __V, const int __imm) +{ + return (__m512) __builtin_ia32_shufps512_mask ((__v16sf) __M, + (__v16sf) __V, __imm, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) -1); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_shuffle_ps (__m512 __W, __mmask16 __U, __m512 __M, + __m512 __V, const int __imm) +{ + return (__m512) __builtin_ia32_shufps512_mask ((__v16sf) __M, + (__v16sf) __V, __imm, + (__v16sf) __W, + (__mmask16) __U); +} 
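A brief sketch of the variable permutes defined above (editorial note, not part of the patch): _mm512_permutexvar_pd takes the index vector as its first operand and the data as its second, and element i of the result is data[idx[i]]. Assuming GCC with -mavx512f, a hypothetical lane-reversal helper built only from intrinsics in this header:

#include <immintrin.h>

/* Reverse the eight doubles of a __m512d: the index vector 7..0 is
   loaded with _mm512_loadu_si512 and fed to the variable permute.  */
static __m512d
reverse_pd (__m512d v)
{
  static const long long rev[8] = { 7, 6, 5, 4, 3, 2, 1, 0 };
  return _mm512_permutexvar_pd (_mm512_loadu_si512 (rev), v);
}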
+ +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_shuffle_ps (__mmask16 __U, __m512 __M, __m512 __V, const int __imm) +{ + return (__m512) __builtin_ia32_shufps512_mask ((__v16sf) __M, + (__v16sf) __V, __imm, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_shuffle_pd (__m512d __M, __m512d __V, const int __imm) +{ + return (__m512d) __builtin_ia32_shufpd512_mask ((__v8df) __M, + (__v8df) __V, __imm, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) -1); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_shuffle_pd (__m512d __W, __mmask8 __U, __m512d __M, + __m512d __V, const int __imm) +{ + return (__m512d) __builtin_ia32_shufpd512_mask ((__v8df) __M, + (__v8df) __V, __imm, + (__v8df) __W, + (__mmask8) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_shuffle_pd (__mmask8 __U, __m512d __M, __m512d __V, + const int __imm) +{ + return (__m512d) __builtin_ia32_shufpd512_mask ((__v8df) __M, + (__v8df) __V, __imm, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fixupimm_round_pd (__m512d __A, __m512d __B, __m512i __C, + const int __imm, const int __R) +{ + return (__m512d) __builtin_ia32_fixupimmpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8di) __C, + __imm, + (__mmask8) -1, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fixupimm_round_pd (__m512d __A, __mmask8 __U, __m512d __B, + __m512i __C, const int __imm, const int __R) +{ + return (__m512d) __builtin_ia32_fixupimmpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8di) __C, + __imm, + (__mmask8) __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fixupimm_round_pd (__mmask8 __U, __m512d __A, __m512d __B, + __m512i __C, const int __imm, const int __R) +{ + return (__m512d) __builtin_ia32_fixupimmpd512_maskz ((__v8df) __A, + (__v8df) __B, + (__v8di) __C, + __imm, + (__mmask8) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fixupimm_round_ps (__m512 __A, __m512 __B, __m512i __C, + const int __imm, const int __R) +{ + return (__m512) __builtin_ia32_fixupimmps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16si) __C, + __imm, + (__mmask16) -1, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fixupimm_round_ps (__m512 __A, __mmask16 __U, __m512 __B, + __m512i __C, const int __imm, const int __R) +{ + return (__m512) __builtin_ia32_fixupimmps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16si) __C, + __imm, + (__mmask16) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fixupimm_round_ps (__mmask16 __U, __m512 __A, __m512 __B, + __m512i __C, const int __imm, const int __R) +{ + return (__m512) __builtin_ia32_fixupimmps512_maskz ((__v16sf) __A, + (__v16sf) __B, + (__v16si) __C, + __imm, + (__mmask16) __U, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fixupimm_round_sd (__m128d __A, __m128d __B, __m128i __C, + const int __imm, const 
int __R) +{ + return (__m128d) __builtin_ia32_fixupimmsd_mask ((__v2df) __A, + (__v2df) __B, + (__v2di) __C, __imm, + (__mmask8) -1, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fixupimm_round_sd (__m128d __A, __mmask8 __U, __m128d __B, + __m128i __C, const int __imm, const int __R) +{ + return (__m128d) __builtin_ia32_fixupimmsd_mask ((__v2df) __A, + (__v2df) __B, + (__v2di) __C, __imm, + (__mmask8) __U, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fixupimm_round_sd (__mmask8 __U, __m128d __A, __m128d __B, + __m128i __C, const int __imm, const int __R) +{ + return (__m128d) __builtin_ia32_fixupimmsd_maskz ((__v2df) __A, + (__v2df) __B, + (__v2di) __C, + __imm, + (__mmask8) __U, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fixupimm_round_ss (__m128 __A, __m128 __B, __m128i __C, + const int __imm, const int __R) +{ + return (__m128) __builtin_ia32_fixupimmss_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4si) __C, __imm, + (__mmask8) -1, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fixupimm_round_ss (__m128 __A, __mmask8 __U, __m128 __B, + __m128i __C, const int __imm, const int __R) +{ + return (__m128) __builtin_ia32_fixupimmss_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4si) __C, __imm, + (__mmask8) __U, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fixupimm_round_ss (__mmask8 __U, __m128 __A, __m128 __B, + __m128i __C, const int __imm, const int __R) +{ + return (__m128) __builtin_ia32_fixupimmss_maskz ((__v4sf) __A, + (__v4sf) __B, + (__v4si) __C, __imm, + (__mmask8) __U, __R); +} + +#else +#define _mm512_shuffle_pd(X, Y, C) \ + ((__m512d)__builtin_ia32_shufpd512_mask ((__v8df)(__m512d)(X), \ + (__v8df)(__m512d)(Y), (int)(C),\ + (__v8df)(__m512d)_mm512_setzero_pd(),\ + (__mmask8)-1)) + +#define _mm512_mask_shuffle_pd(W, U, X, Y, C) \ + ((__m512d)__builtin_ia32_shufpd512_mask ((__v8df)(__m512d)(X), \ + (__v8df)(__m512d)(Y), (int)(C),\ + (__v8df)(__m512d)(W),\ + (__mmask8)(U))) + +#define _mm512_maskz_shuffle_pd(U, X, Y, C) \ + ((__m512d)__builtin_ia32_shufpd512_mask ((__v8df)(__m512d)(X), \ + (__v8df)(__m512d)(Y), (int)(C),\ + (__v8df)(__m512d)_mm512_setzero_pd(),\ + (__mmask8)(U))) + +#define _mm512_shuffle_ps(X, Y, C) \ + ((__m512)__builtin_ia32_shufps512_mask ((__v16sf)(__m512)(X), \ + (__v16sf)(__m512)(Y), (int)(C),\ + (__v16sf)(__m512)_mm512_setzero_ps(),\ + (__mmask16)-1)) + +#define _mm512_mask_shuffle_ps(W, U, X, Y, C) \ + ((__m512)__builtin_ia32_shufps512_mask ((__v16sf)(__m512)(X), \ + (__v16sf)(__m512)(Y), (int)(C),\ + (__v16sf)(__m512)(W),\ + (__mmask16)(U))) + +#define _mm512_maskz_shuffle_ps(U, X, Y, C) \ + ((__m512)__builtin_ia32_shufps512_mask ((__v16sf)(__m512)(X), \ + (__v16sf)(__m512)(Y), (int)(C),\ + (__v16sf)(__m512)_mm512_setzero_ps(),\ + (__mmask16)(U))) + +#define _mm512_fixupimm_round_pd(X, Y, Z, C, R) \ + ((__m512d)__builtin_ia32_fixupimmpd512_mask ((__v8df)(__m512d)(X), \ + (__v8df)(__m512d)(Y), (__v8di)(__m512i)(Z), (int)(C), \ + (__mmask8)(-1), (R))) + +#define _mm512_mask_fixupimm_round_pd(X, U, Y, Z, C, R) \ + ((__m512d)__builtin_ia32_fixupimmpd512_mask ((__v8df)(__m512d)(X), \ + (__v8df)(__m512d)(Y), (__v8di)(__m512i)(Z), (int)(C), \ + (__mmask8)(U), (R))) + +#define _mm512_maskz_fixupimm_round_pd(U, X, Y, Z, C, R) \ + 
((__m512d)__builtin_ia32_fixupimmpd512_maskz ((__v8df)(__m512d)(X), \ + (__v8df)(__m512d)(Y), (__v8di)(__m512i)(Z), (int)(C), \ + (__mmask8)(U), (R))) + +#define _mm512_fixupimm_round_ps(X, Y, Z, C, R) \ + ((__m512)__builtin_ia32_fixupimmps512_mask ((__v16sf)(__m512)(X), \ + (__v16sf)(__m512)(Y), (__v16si)(__m512i)(Z), (int)(C), \ + (__mmask16)(-1), (R))) + +#define _mm512_mask_fixupimm_round_ps(X, U, Y, Z, C, R) \ + ((__m512)__builtin_ia32_fixupimmps512_mask ((__v16sf)(__m512)(X), \ + (__v16sf)(__m512)(Y), (__v16si)(__m512i)(Z), (int)(C), \ + (__mmask16)(U), (R))) + +#define _mm512_maskz_fixupimm_round_ps(U, X, Y, Z, C, R) \ + ((__m512)__builtin_ia32_fixupimmps512_maskz ((__v16sf)(__m512)(X), \ + (__v16sf)(__m512)(Y), (__v16si)(__m512i)(Z), (int)(C), \ + (__mmask16)(U), (R))) + +#define _mm_fixupimm_round_sd(X, Y, Z, C, R) \ + ((__m128d)__builtin_ia32_fixupimmsd_mask ((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), (int)(C), \ + (__mmask8)(-1), (R))) + +#define _mm_mask_fixupimm_round_sd(X, U, Y, Z, C, R) \ + ((__m128d)__builtin_ia32_fixupimmsd_mask ((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), (int)(C), \ + (__mmask8)(U), (R))) + +#define _mm_maskz_fixupimm_round_sd(U, X, Y, Z, C, R) \ + ((__m128d)__builtin_ia32_fixupimmsd_maskz ((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), (int)(C), \ + (__mmask8)(U), (R))) + +#define _mm_fixupimm_round_ss(X, Y, Z, C, R) \ + ((__m128)__builtin_ia32_fixupimmss_mask ((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), (int)(C), \ + (__mmask8)(-1), (R))) + +#define _mm_mask_fixupimm_round_ss(X, U, Y, Z, C, R) \ + ((__m128)__builtin_ia32_fixupimmss_mask ((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), (int)(C), \ + (__mmask8)(U), (R))) + +#define _mm_maskz_fixupimm_round_ss(U, X, Y, Z, C, R) \ + ((__m128)__builtin_ia32_fixupimmss_maskz ((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), (int)(C), \ + (__mmask8)(U), (R))) +#endif + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_movehdup_ps (__m512 __A) +{ + return (__m512) __builtin_ia32_movshdup512_mask ((__v16sf) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) -1); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_movehdup_ps (__m512 __W, __mmask16 __U, __m512 __A) +{ + return (__m512) __builtin_ia32_movshdup512_mask ((__v16sf) __A, + (__v16sf) __W, + (__mmask16) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_movehdup_ps (__mmask16 __U, __m512 __A) +{ + return (__m512) __builtin_ia32_movshdup512_mask ((__v16sf) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_moveldup_ps (__m512 __A) +{ + return (__m512) __builtin_ia32_movsldup512_mask ((__v16sf) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) -1); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_moveldup_ps (__m512 __W, __mmask16 __U, __m512 __A) +{ + return (__m512) __builtin_ia32_movsldup512_mask ((__v16sf) __A, + (__v16sf) __W, + (__mmask16) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_moveldup_ps (__mmask16 __U, __m512 __A) +{ + return (__m512) __builtin_ia32_movsldup512_mask 
((__v16sf) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_or_si512 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pord512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_or_epi32 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pord512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_or_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pord512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_or_epi32 (__mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pord512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_or_epi64 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_porq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_or_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_porq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_or_epi64 (__mmask8 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_porq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_xor_si512 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pxord512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_xor_epi32 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pxord512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_xor_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pxord512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_xor_epi32 (__mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pxord512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_xor_epi64 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pxorq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_setzero_si512 
(),
+        (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_xor_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pxorq512_mask ((__v8di) __A,
+        (__v8di) __B,
+        (__v8di) __W,
+        (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_xor_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pxorq512_mask ((__v8di) __A,
+        (__v8di) __B,
+        (__v8di)
+        _mm512_setzero_si512 (),
+        (__mmask8) __U);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_rol_epi32 (__m512i __A, const int __B)
+{
+  return (__m512i) __builtin_ia32_prold512_mask ((__v16si) __A, __B,
+        (__v16si)
+        _mm512_setzero_si512 (),
+        (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_rol_epi32 (__m512i __W, __mmask16 __U, __m512i __A, const int __B)
+{
+  return (__m512i) __builtin_ia32_prold512_mask ((__v16si) __A, __B,
+        (__v16si) __W,
+        (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_rol_epi32 (__mmask16 __U, __m512i __A, const int __B)
+{
+  return (__m512i) __builtin_ia32_prold512_mask ((__v16si) __A, __B,
+        (__v16si)
+        _mm512_setzero_si512 (),
+        (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_ror_epi32 (__m512i __A, int __B)
+{
+  return (__m512i) __builtin_ia32_prord512_mask ((__v16si) __A, __B,
+        (__v16si)
+        _mm512_setzero_si512 (),
+        (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_ror_epi32 (__m512i __W, __mmask16 __U, __m512i __A, int __B)
+{
+  return (__m512i) __builtin_ia32_prord512_mask ((__v16si) __A, __B,
+        (__v16si) __W,
+        (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_ror_epi32 (__mmask16 __U, __m512i __A, int __B)
+{
+  return (__m512i) __builtin_ia32_prord512_mask ((__v16si) __A, __B,
+        (__v16si)
+        _mm512_setzero_si512 (),
+        (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_rol_epi64 (__m512i __A, const int __B)
+{
+  return (__m512i) __builtin_ia32_prolq512_mask ((__v8di) __A, __B,
+        (__v8di)
+        _mm512_setzero_si512 (),
+        (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_rol_epi64 (__m512i __W, __mmask8 __U, __m512i __A, const int __B)
+{
+  return (__m512i) __builtin_ia32_prolq512_mask ((__v8di) __A, __B,
+        (__v8di) __W,
+        (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_rol_epi64 (__mmask8 __U, __m512i __A, const int __B)
+{
+  return (__m512i) __builtin_ia32_prolq512_mask ((__v8di) __A, __B,
+        (__v8di)
+        _mm512_setzero_si512 (),
+        (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_ror_epi64 (__m512i __A, int __B)
+{
+  return (__m512i) __builtin_ia32_prorq512_mask ((__v8di) __A, __B,
+        (__v8di)
+        _mm512_setzero_si512 (),
+        (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__,
__artificial__)) +_mm512_mask_ror_epi64 (__m512i __W, __mmask8 __U, __m512i __A, int __B) +{ + return (__m512i) __builtin_ia32_prorq512_mask ((__v8di) __A, __B, + (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_ror_epi64 (__mmask8 __U, __m512i __A, int __B) +{ + return (__m512i) __builtin_ia32_prorq512_mask ((__v8di) __A, __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} + +#else +#define _mm512_rol_epi32(A, B) \ + ((__m512i)__builtin_ia32_prold512_mask ((__v16si)(__m512i)(A), \ + (int)(B), \ + (__v16si)_mm512_setzero_si512 (), \ + (__mmask16)(-1))) +#define _mm512_mask_rol_epi32(W, U, A, B) \ + ((__m512i)__builtin_ia32_prold512_mask ((__v16si)(__m512i)(A), \ + (int)(B), \ + (__v16si)(__m512i)(W), \ + (__mmask16)(U))) +#define _mm512_maskz_rol_epi32(U, A, B) \ + ((__m512i)__builtin_ia32_prold512_mask ((__v16si)(__m512i)(A), \ + (int)(B), \ + (__v16si)_mm512_setzero_si512 (), \ + (__mmask16)(U))) +#define _mm512_ror_epi32(A, B) \ + ((__m512i)__builtin_ia32_prord512_mask ((__v16si)(__m512i)(A), \ + (int)(B), \ + (__v16si)_mm512_setzero_si512 (), \ + (__mmask16)(-1))) +#define _mm512_mask_ror_epi32(W, U, A, B) \ + ((__m512i)__builtin_ia32_prord512_mask ((__v16si)(__m512i)(A), \ + (int)(B), \ + (__v16si)(__m512i)(W), \ + (__mmask16)(U))) +#define _mm512_maskz_ror_epi32(U, A, B) \ + ((__m512i)__builtin_ia32_prord512_mask ((__v16si)(__m512i)(A), \ + (int)(B), \ + (__v16si)_mm512_setzero_si512 (), \ + (__mmask16)(U))) +#define _mm512_rol_epi64(A, B) \ + ((__m512i)__builtin_ia32_prolq512_mask ((__v8di)(__m512i)(A), \ + (int)(B), \ + (__v8di)_mm512_setzero_si512 (), \ + (__mmask8)(-1))) +#define _mm512_mask_rol_epi64(W, U, A, B) \ + ((__m512i)__builtin_ia32_prolq512_mask ((__v8di)(__m512i)(A), \ + (int)(B), \ + (__v8di)(__m512i)(W), \ + (__mmask8)(U))) +#define _mm512_maskz_rol_epi64(U, A, B) \ + ((__m512i)__builtin_ia32_prolq512_mask ((__v8di)(__m512i)(A), \ + (int)(B), \ + (__v8di)_mm512_setzero_si512 (), \ + (__mmask8)(U))) + +#define _mm512_ror_epi64(A, B) \ + ((__m512i)__builtin_ia32_prorq512_mask ((__v8di)(__m512i)(A), \ + (int)(B), \ + (__v8di)_mm512_setzero_si512 (), \ + (__mmask8)(-1))) +#define _mm512_mask_ror_epi64(W, U, A, B) \ + ((__m512i)__builtin_ia32_prorq512_mask ((__v8di)(__m512i)(A), \ + (int)(B), \ + (__v8di)(__m512i)(W), \ + (__mmask8)(U))) +#define _mm512_maskz_ror_epi64(U, A, B) \ + ((__m512i)__builtin_ia32_prorq512_mask ((__v8di)(__m512i)(A), \ + (int)(B), \ + (__v8di)_mm512_setzero_si512 (), \ + (__mmask8)(U))) +#endif + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_and_si512 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pandd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_and_epi32 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pandd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_and_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pandd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, 
__artificial__)) +_mm512_maskz_and_epi32 (__mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pandd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_and_epi64 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pandq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_and_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pandq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) __W, __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_and_epi64 (__mmask8 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pandq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_setzero_pd (), + __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_andnot_si512 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pandnd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_andnot_epi32 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pandnd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_andnot_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pandnd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_andnot_epi32 (__mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pandnd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_andnot_epi64 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pandnq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_andnot_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pandnq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) __W, __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_andnot_epi64 (__mmask8 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pandnq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_setzero_pd (), + __U); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_test_epi32_mask (__m512i __A, __m512i __B) +{ + return (__mmask16) __builtin_ia32_ptestmd512 ((__v16si) __A, + (__v16si) __B, + (__mmask16) -1); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_test_epi32_mask (__mmask16 __U, __m512i __A, __m512i 
__B) +{ + return (__mmask16) __builtin_ia32_ptestmd512 ((__v16si) __A, + (__v16si) __B, __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_test_epi64_mask (__m512i __A, __m512i __B) +{ + return (__mmask8) __builtin_ia32_ptestmq512 ((__v8di) __A, + (__v8di) __B, + (__mmask8) -1); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_test_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B) +{ + return (__mmask8) __builtin_ia32_ptestmq512 ((__v8di) __A, (__v8di) __B, __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_unpackhi_epi32 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_punpckhdq512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_unpackhi_epi32 (__m512i __W, __mmask16 __U, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_punpckhdq512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_unpackhi_epi32 (__mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_punpckhdq512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_unpackhi_epi64 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_punpckhqdq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_unpackhi_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_punpckhqdq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_unpackhi_epi64 (__mmask8 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_punpckhqdq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_unpacklo_epi32 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_punpckldq512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_unpacklo_epi32 (__m512i __W, __mmask16 __U, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_punpckldq512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_unpacklo_epi32 (__mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_punpckldq512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_unpacklo_epi64 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_punpcklqdq512_mask 
((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_unpacklo_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_punpcklqdq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_unpacklo_epi64 (__mmask8 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_punpcklqdq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} + +#ifdef __x86_64__ +#ifdef __OPTIMIZE__ +extern __inline unsigned long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundss_u64 (__m128 __A, const int __R) +{ + return (unsigned long long) __builtin_ia32_vcvtss2usi64 ((__v4sf) __A, __R); +} + +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundss_si64 (__m128 __A, const int __R) +{ + return (long long) __builtin_ia32_vcvtss2si64 ((__v4sf) __A, __R); +} + +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundss_i64 (__m128 __A, const int __R) +{ + return (long long) __builtin_ia32_vcvtss2si64 ((__v4sf) __A, __R); +} + +extern __inline unsigned long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtt_roundss_u64 (__m128 __A, const int __R) +{ + return (unsigned long long) __builtin_ia32_vcvttss2usi64 ((__v4sf) __A, __R); +} + +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtt_roundss_i64 (__m128 __A, const int __R) +{ + return (long long) __builtin_ia32_vcvttss2si64 ((__v4sf) __A, __R); +} + +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtt_roundss_si64 (__m128 __A, const int __R) +{ + return (long long) __builtin_ia32_vcvttss2si64 ((__v4sf) __A, __R); +} +#else +#define _mm_cvt_roundss_u64(A, B) \ + ((unsigned long long)__builtin_ia32_vcvtss2usi64(A, B)) + +#define _mm_cvt_roundss_si64(A, B) \ + ((long long)__builtin_ia32_vcvtss2si64(A, B)) + +#define _mm_cvt_roundss_i64(A, B) \ + ((long long)__builtin_ia32_vcvtss2si64(A, B)) + +#define _mm_cvtt_roundss_u64(A, B) \ + ((unsigned long long)__builtin_ia32_vcvttss2usi64(A, B)) + +#define _mm_cvtt_roundss_i64(A, B) \ + ((long long)__builtin_ia32_vcvttss2si64(A, B)) + +#define _mm_cvtt_roundss_si64(A, B) \ + ((long long)__builtin_ia32_vcvttss2si64(A, B)) +#endif +#endif + +#ifdef __OPTIMIZE__ +extern __inline unsigned +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundss_u32 (__m128 __A, const int __R) +{ + return (unsigned) __builtin_ia32_vcvtss2usi32 ((__v4sf) __A, __R); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundss_si32 (__m128 __A, const int __R) +{ + return (int) __builtin_ia32_vcvtss2si32 ((__v4sf) __A, __R); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundss_i32 (__m128 __A, const int __R) +{ + return (int) __builtin_ia32_vcvtss2si32 ((__v4sf) __A, __R); +} + +extern __inline unsigned +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtt_roundss_u32 (__m128 __A, const int __R) +{ + return (unsigned) 
__builtin_ia32_vcvttss2usi32 ((__v4sf) __A, __R); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtt_roundss_i32 (__m128 __A, const int __R) +{ + return (int) __builtin_ia32_vcvttss2si32 ((__v4sf) __A, __R); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtt_roundss_si32 (__m128 __A, const int __R) +{ + return (int) __builtin_ia32_vcvttss2si32 ((__v4sf) __A, __R); +} +#else +#define _mm_cvt_roundss_u32(A, B) \ + ((unsigned)__builtin_ia32_vcvtss2usi32(A, B)) + +#define _mm_cvt_roundss_si32(A, B) \ + ((int)__builtin_ia32_vcvtss2si32(A, B)) + +#define _mm_cvt_roundss_i32(A, B) \ + ((int)__builtin_ia32_vcvtss2si32(A, B)) + +#define _mm_cvtt_roundss_u32(A, B) \ + ((unsigned)__builtin_ia32_vcvttss2usi32(A, B)) + +#define _mm_cvtt_roundss_si32(A, B) \ + ((int)__builtin_ia32_vcvttss2si32(A, B)) + +#define _mm_cvtt_roundss_i32(A, B) \ + ((int)__builtin_ia32_vcvttss2si32(A, B)) +#endif + +#ifdef __x86_64__ +#ifdef __OPTIMIZE__ +extern __inline unsigned long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundsd_u64 (__m128d __A, const int __R) +{ + return (unsigned long long) __builtin_ia32_vcvtsd2usi64 ((__v2df) __A, __R); +} + +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundsd_si64 (__m128d __A, const int __R) +{ + return (long long) __builtin_ia32_vcvtsd2si64 ((__v2df) __A, __R); +} + +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundsd_i64 (__m128d __A, const int __R) +{ + return (long long) __builtin_ia32_vcvtsd2si64 ((__v2df) __A, __R); +} + +extern __inline unsigned long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtt_roundsd_u64 (__m128d __A, const int __R) +{ + return (unsigned long long) __builtin_ia32_vcvttsd2usi64 ((__v2df) __A, __R); +} + +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtt_roundsd_si64 (__m128d __A, const int __R) +{ + return (long long) __builtin_ia32_vcvttsd2si64 ((__v2df) __A, __R); +} + +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtt_roundsd_i64 (__m128d __A, const int __R) +{ + return (long long) __builtin_ia32_vcvttsd2si64 ((__v2df) __A, __R); +} +#else +#define _mm_cvt_roundsd_u64(A, B) \ + ((unsigned long long)__builtin_ia32_vcvtsd2usi64(A, B)) + +#define _mm_cvt_roundsd_si64(A, B) \ + ((long long)__builtin_ia32_vcvtsd2si64(A, B)) + +#define _mm_cvt_roundsd_i64(A, B) \ + ((long long)__builtin_ia32_vcvtsd2si64(A, B)) + +#define _mm_cvtt_roundsd_u64(A, B) \ + ((unsigned long long)__builtin_ia32_vcvttsd2usi64(A, B)) + +#define _mm_cvtt_roundsd_si64(A, B) \ + ((long long)__builtin_ia32_vcvttsd2si64(A, B)) + +#define _mm_cvtt_roundsd_i64(A, B) \ + ((long long)__builtin_ia32_vcvttsd2si64(A, B)) +#endif +#endif + +#ifdef __OPTIMIZE__ +extern __inline unsigned +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundsd_u32 (__m128d __A, const int __R) +{ + return (unsigned) __builtin_ia32_vcvtsd2usi32 ((__v2df) __A, __R); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundsd_si32 (__m128d __A, const int __R) +{ + return (int) __builtin_ia32_vcvtsd2si32 ((__v2df) __A, __R); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm_cvt_roundsd_i32 (__m128d __A, const int __R) +{ + return (int) __builtin_ia32_vcvtsd2si32 ((__v2df) __A, __R); +} + +extern __inline unsigned +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtt_roundsd_u32 (__m128d __A, const int __R) +{ + return (unsigned) __builtin_ia32_vcvttsd2usi32 ((__v2df) __A, __R); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtt_roundsd_i32 (__m128d __A, const int __R) +{ + return (int) __builtin_ia32_vcvttsd2si32 ((__v2df) __A, __R); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtt_roundsd_si32 (__m128d __A, const int __R) +{ + return (int) __builtin_ia32_vcvttsd2si32 ((__v2df) __A, __R); +} +#else +#define _mm_cvt_roundsd_u32(A, B) \ + ((unsigned)__builtin_ia32_vcvtsd2usi32(A, B)) + +#define _mm_cvt_roundsd_si32(A, B) \ + ((int)__builtin_ia32_vcvtsd2si32(A, B)) + +#define _mm_cvt_roundsd_i32(A, B) \ + ((int)__builtin_ia32_vcvtsd2si32(A, B)) + +#define _mm_cvtt_roundsd_u32(A, B) \ + ((unsigned)__builtin_ia32_vcvttsd2usi32(A, B)) + +#define _mm_cvtt_roundsd_si32(A, B) \ + ((int)__builtin_ia32_vcvttsd2si32(A, B)) + +#define _mm_cvtt_roundsd_i32(A, B) \ + ((int)__builtin_ia32_vcvttsd2si32(A, B)) +#endif + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_movedup_pd (__m512d __A) +{ + return (__m512d) __builtin_ia32_movddup512_mask ((__v8df) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) -1); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_movedup_pd (__m512d __W, __mmask8 __U, __m512d __A) +{ + return (__m512d) __builtin_ia32_movddup512_mask ((__v8df) __A, + (__v8df) __W, + (__mmask8) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_movedup_pd (__mmask8 __U, __m512d __A) +{ + return (__m512d) __builtin_ia32_movddup512_mask ((__v8df) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_unpacklo_pd (__m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_unpcklpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) -1); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_unpacklo_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_unpcklpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __W, + (__mmask8) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_unpacklo_pd (__mmask8 __U, __m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_unpcklpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_unpackhi_pd (__m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_unpckhpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) -1); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_unpackhi_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_unpckhpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __W, + (__mmask8) __U); +} + 
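An illustrative usage sketch for the scalar rounding conversions and the 512-bit double unpack intrinsics defined above; it assumes an AVX-512F target built with -mavx512f, pulls the declarations in through <immintrin.h>, and is only a sketch of how callers might use these entry points.

/* Convert a scalar double with an explicit rounding mode and
   interleave the even-indexed doubles of two zmm vectors.  */
#include <immintrin.h>
#include <stdio.h>

int
main (void)
{
  __m128d x = _mm_set_sd (2.5);
  /* The embedded rounding mode overrides MXCSR; NO_EXC suppresses
     floating-point exceptions.  */
  int up   = _mm_cvt_roundsd_i32 (x, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
  int down = _mm_cvt_roundsd_i32 (x, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);

  __m512d a = _mm512_set1_pd (1.0);
  __m512d b = _mm512_set1_pd (2.0);
  __m512d lo = _mm512_unpacklo_pd (a, b);	/* 1, 2, 1, 2, ...  */

  double out[8];
  _mm512_storeu_pd (out, lo);
  printf ("%d %d %f %f\n", up, down, out[0], out[1]);
  return 0;
}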
+extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_unpackhi_pd (__mmask8 __U, __m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_unpckhpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_unpackhi_ps (__m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_unpckhps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) -1); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_unpackhi_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_unpckhps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __W, + (__mmask16) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_unpackhi_ps (__mmask16 __U, __m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_unpckhps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundps_pd (__m256 __A, const int __R) +{ + return (__m512d) __builtin_ia32_cvtps2pd512_mask ((__v8sf) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) -1, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundps_pd (__m512d __W, __mmask8 __U, __m256 __A, + const int __R) +{ + return (__m512d) __builtin_ia32_cvtps2pd512_mask ((__v8sf) __A, + (__v8df) __W, + (__mmask8) __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundps_pd (__mmask8 __U, __m256 __A, const int __R) +{ + return (__m512d) __builtin_ia32_cvtps2pd512_mask ((__v8sf) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundph_ps (__m256i __A, const int __R) +{ + return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) -1, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundph_ps (__m512 __W, __mmask16 __U, __m256i __A, + const int __R) +{ + return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A, + (__v16sf) __W, + (__mmask16) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundph_ps (__mmask16 __U, __m256i __A, const int __R) +{ + return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, __R); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundps_ph (__m512 __A, const int __I) +{ + return (__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf) __A, + __I, + (__v16hi) + _mm256_setzero_si256 (), + -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtps_ph (__m512 __A, const int __I) +{ + return (__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf) __A, + __I, + (__v16hi) + _mm256_setzero_si256 (), + -1); +} + +extern __inline __m256i +__attribute__ 
((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundps_ph (__m256i __U, __mmask16 __W, __m512 __A, + const int __I) +{ + return (__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf) __A, + __I, + (__v16hi) __U, + (__mmask16) __W); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtps_ph (__m256i __U, __mmask16 __W, __m512 __A, const int __I) +{ + return (__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf) __A, + __I, + (__v16hi) __U, + (__mmask16) __W); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundps_ph (__mmask16 __W, __m512 __A, const int __I) +{ + return (__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf) __A, + __I, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) __W); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtps_ph (__mmask16 __W, __m512 __A, const int __I) +{ + return (__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf) __A, + __I, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) __W); +} +#else +#define _mm512_cvt_roundps_pd(A, B) \ + (__m512d)__builtin_ia32_cvtps2pd512_mask(A, (__v8df)_mm512_setzero_pd(), -1, B) + +#define _mm512_mask_cvt_roundps_pd(W, U, A, B) \ + (__m512d)__builtin_ia32_cvtps2pd512_mask(A, (__v8df)(W), U, B) + +#define _mm512_maskz_cvt_roundps_pd(U, A, B) \ + (__m512d)__builtin_ia32_cvtps2pd512_mask(A, (__v8df)_mm512_setzero_pd(), U, B) + +#define _mm512_cvt_roundph_ps(A, B) \ + (__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(A), (__v16sf)_mm512_setzero_ps(), -1, B) + +#define _mm512_mask_cvt_roundph_ps(W, U, A, B) \ + (__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(A), (__v16sf)(W), U, B) + +#define _mm512_maskz_cvt_roundph_ps(U, A, B) \ + (__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(A), (__v16sf)_mm512_setzero_ps(), U, B) + +#define _mm512_cvt_roundps_ph(A, I) \ + ((__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf)(__m512) A, (int) (I),\ + (__v16hi)_mm256_setzero_si256 (), -1)) +#define _mm512_cvtps_ph(A, I) \ + ((__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf)(__m512) A, (int) (I),\ + (__v16hi)_mm256_setzero_si256 (), -1)) +#define _mm512_mask_cvt_roundps_ph(U, W, A, I) \ + ((__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf)(__m512) A, (int) (I),\ + (__v16hi)(__m256i)(U), (__mmask16) (W))) +#define _mm512_mask_cvtps_ph(U, W, A, I) \ + ((__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf)(__m512) A, (int) (I),\ + (__v16hi)(__m256i)(U), (__mmask16) (W))) +#define _mm512_maskz_cvt_roundps_ph(W, A, I) \ + ((__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf)(__m512) A, (int) (I),\ + (__v16hi)_mm256_setzero_si256 (), (__mmask16) (W))) +#define _mm512_maskz_cvtps_ph(W, A, I) \ + ((__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf)(__m512) A, (int) (I),\ + (__v16hi)_mm256_setzero_si256 (), (__mmask16) (W))) +#endif + +#ifdef __OPTIMIZE__ +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundpd_ps (__m512d __A, const int __R) +{ + return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) -1, __R); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundpd_ps (__m256 __W, __mmask8 __U, __m512d __A, + const int __R) +{ + return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A, + (__v8sf) __W, + (__mmask8) __U, __R); +} + 
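A minimal sketch, assuming an AVX-512F target built with -mavx512f, of the float/half and double-to-float conversions defined above; the helper names are illustrative and the rounding arguments are the usual _MM_FROUND_* combinations rather than anything required by this header.

#include <immintrin.h>

/* Narrow 16 floats to half precision and widen them back.  */
__m512
roundtrip_ph (__m512 v)
{
  __m256i half = _mm512_cvtps_ph (v, _MM_FROUND_TO_NEAREST_INT);
  return _mm512_cvt_roundph_ps (half, _MM_FROUND_CUR_DIRECTION);
}

/* Narrow 8 doubles to 8 floats using round-to-nearest-even with
   suppressed exceptions instead of the current MXCSR mode.  */
__m256
narrow_pd (__m512d v)
{
  return _mm512_cvt_roundpd_ps (v, _MM_FROUND_TO_NEAREST_INT
				   | _MM_FROUND_NO_EXC);
}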
+extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundpd_ps (__mmask8 __U, __m512d __A, const int __R) +{ + return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundsd_ss (__m128 __A, __m128d __B, const int __R) +{ + return (__m128) __builtin_ia32_cvtsd2ss_round ((__v4sf) __A, + (__v2df) __B, + __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundss_sd (__m128d __A, __m128 __B, const int __R) +{ + return (__m128d) __builtin_ia32_cvtss2sd_round ((__v2df) __A, + (__v4sf) __B, + __R); +} +#else +#define _mm512_cvt_roundpd_ps(A, B) \ + (__m256)__builtin_ia32_cvtpd2ps512_mask(A, (__v8sf)_mm256_setzero_ps(), -1, B) + +#define _mm512_mask_cvt_roundpd_ps(W, U, A, B) \ + (__m256)__builtin_ia32_cvtpd2ps512_mask(A, (__v8sf)(W), U, B) + +#define _mm512_maskz_cvt_roundpd_ps(U, A, B) \ + (__m256)__builtin_ia32_cvtpd2ps512_mask(A, (__v8sf)_mm256_setzero_ps(), U, B) + +#define _mm_cvt_roundsd_ss(A, B, C) \ + (__m128)__builtin_ia32_cvtsd2ss_round(A, B, C) + +#define _mm_cvt_roundss_sd(A, B, C) \ + (__m128d)__builtin_ia32_cvtss2sd_round(A, B, C) +#endif + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_stream_si512 (__m512i * __P, __m512i __A) +{ + __builtin_ia32_movntdq512 ((__v8di *) __P, (__v8di) __A); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_stream_ps (float *__P, __m512 __A) +{ + __builtin_ia32_movntps512 (__P, (__v16sf) __A); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_stream_pd (double *__P, __m512d __A) +{ + __builtin_ia32_movntpd512 (__P, (__v8df) __A); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_getexp_round_ss (__m128 __A, __m128 __B, const int __R) +{ + return (__m128) __builtin_ia32_getexpss128_round ((__v4sf) __A, + (__v4sf) __B, + __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_getexp_round_sd (__m128d __A, __m128d __B, const int __R) +{ + return (__m128d) __builtin_ia32_getexpsd128_round ((__v2df) __A, + (__v2df) __B, + __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_getexp_round_ps (__m512 __A, const int __R) +{ + return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) -1, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_getexp_round_ps (__m512 __W, __mmask16 __U, __m512 __A, + const int __R) +{ + return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A, + (__v16sf) __W, + (__mmask16) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_getexp_round_ps (__mmask16 __U, __m512 __A, const int __R) +{ + return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_getexp_round_pd (__m512d __A, const int __R) +{ + return (__m512d) 
__builtin_ia32_getexppd512_mask ((__v8df) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) -1, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_getexp_round_pd (__m512d __W, __mmask8 __U, __m512d __A, + const int __R) +{ + return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A, + (__v8df) __W, + (__mmask8) __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_getexp_round_pd (__mmask8 __U, __m512d __A, const int __R) +{ + return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, __R); +} + +/* Constants for mantissa extraction */ +typedef enum +{ + _MM_MANT_NORM_1_2, /* interval [1, 2) */ + _MM_MANT_NORM_p5_2, /* interval [0.5, 2) */ + _MM_MANT_NORM_p5_1, /* interval [0.5, 1) */ + _MM_MANT_NORM_p75_1p5 /* interval [0.75, 1.5) */ +} _MM_MANTISSA_NORM_ENUM; + +typedef enum +{ + _MM_MANT_SIGN_src, /* sign = sign(SRC) */ + _MM_MANT_SIGN_zero, /* sign = 0 */ + _MM_MANT_SIGN_nan /* DEST = NaN if sign(SRC) = 1 */ +} _MM_MANTISSA_SIGN_ENUM; + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_getmant_round_pd (__m512d __A, _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C, const int __R) +{ + return (__m512d) __builtin_ia32_getmantpd512_mask ((__v8df) __A, + (__C << 2) | __B, + _mm512_setzero_pd (), + (__mmask8) -1, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_getmant_round_pd (__m512d __W, __mmask8 __U, __m512d __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C, const int __R) +{ + return (__m512d) __builtin_ia32_getmantpd512_mask ((__v8df) __A, + (__C << 2) | __B, + (__v8df) __W, __U, + __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_getmant_round_pd (__mmask8 __U, __m512d __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C, const int __R) +{ + return (__m512d) __builtin_ia32_getmantpd512_mask ((__v8df) __A, + (__C << 2) | __B, + (__v8df) + _mm512_setzero_pd (), + __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_getmant_round_ps (__m512 __A, _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C, const int __R) +{ + return (__m512) __builtin_ia32_getmantps512_mask ((__v16sf) __A, + (__C << 2) | __B, + _mm512_setzero_ps (), + (__mmask16) -1, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_getmant_round_ps (__m512 __W, __mmask16 __U, __m512 __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C, const int __R) +{ + return (__m512) __builtin_ia32_getmantps512_mask ((__v16sf) __A, + (__C << 2) | __B, + (__v16sf) __W, __U, + __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_getmant_round_ps (__mmask16 __U, __m512 __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C, const int __R) +{ + return (__m512) __builtin_ia32_getmantps512_mask ((__v16sf) __A, + (__C << 2) | __B, + (__v16sf) + _mm512_setzero_ps (), + __U, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_getmant_round_sd (__m128d __A, __m128d __B, + _MM_MANTISSA_NORM_ENUM __C, + _MM_MANTISSA_SIGN_ENUM __D, const int __R) +{ + return 
(__m128d) __builtin_ia32_getmantsd_round ((__v2df) __A, + (__v2df) __B, + (__D << 2) | __C, + __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_getmant_round_ss (__m128 __A, __m128 __B, + _MM_MANTISSA_NORM_ENUM __C, + _MM_MANTISSA_SIGN_ENUM __D, const int __R) +{ + return (__m128) __builtin_ia32_getmantss_round ((__v4sf) __A, + (__v4sf) __B, + (__D << 2) | __C, + __R); +} + +#else +#define _mm512_getmant_round_pd(X, B, C, R) \ + ((__m512d)__builtin_ia32_getmantpd512_mask ((__v8df)(__m512d)(X), \ + (int)(((C)<<2) | (B)), \ + (__v8df)(__m512d)_mm512_setzero_pd(), \ + (__mmask8)-1,\ + (R))) + +#define _mm512_mask_getmant_round_pd(W, U, X, B, C, R) \ + ((__m512d)__builtin_ia32_getmantpd512_mask ((__v8df)(__m512d)(X), \ + (int)(((C)<<2) | (B)), \ + (__v8df)(__m512d)(W), \ + (__mmask8)(U),\ + (R))) + +#define _mm512_maskz_getmant_round_pd(U, X, B, C, R) \ + ((__m512d)__builtin_ia32_getmantpd512_mask ((__v8df)(__m512d)(X), \ + (int)(((C)<<2) | (B)), \ + (__v8df)(__m512d)_mm512_setzero_pd(), \ + (__mmask8)(U),\ + (R))) +#define _mm512_getmant_round_ps(X, B, C, R) \ + ((__m512)__builtin_ia32_getmantps512_mask ((__v16sf)(__m512)(X), \ + (int)(((C)<<2) | (B)), \ + (__v16sf)(__m512)_mm512_setzero_ps(), \ + (__mmask16)-1,\ + (R))) + +#define _mm512_mask_getmant_round_ps(W, U, X, B, C, R) \ + ((__m512)__builtin_ia32_getmantps512_mask ((__v16sf)(__m512)(X), \ + (int)(((C)<<2) | (B)), \ + (__v16sf)(__m512)(W), \ + (__mmask16)(U),\ + (R))) + +#define _mm512_maskz_getmant_round_ps(U, X, B, C, R) \ + ((__m512)__builtin_ia32_getmantps512_mask ((__v16sf)(__m512)(X), \ + (int)(((C)<<2) | (B)), \ + (__v16sf)(__m512)_mm512_setzero_ps(), \ + (__mmask16)(U),\ + (R))) +#define _mm_getmant_round_sd(X, Y, C, D, R) \ + ((__m128d)__builtin_ia32_getmantsd_round ((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), \ + (int)(((D)<<2) | (C)), \ + (R))) + +#define _mm_getmant_round_ss(X, Y, C, D, R) \ + ((__m128)__builtin_ia32_getmantss_round ((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), \ + (int)(((D)<<2) | (C)), \ + (R))) + +#define _mm_getexp_round_ss(A, B, R) \ + ((__m128)__builtin_ia32_getexpss128_round((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), R)) + +#define _mm_getexp_round_sd(A, B, R) \ + ((__m128d)__builtin_ia32_getexpsd128_round((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), R)) + +#define _mm512_getexp_round_ps(A, R) \ + ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)_mm512_setzero_ps(), (__mmask16)-1, R)) + +#define _mm512_mask_getexp_round_ps(W, U, A, R) \ + ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(W), (__mmask16)(U), R)) + +#define _mm512_maskz_getexp_round_ps(U, A, R) \ + ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), R)) + +#define _mm512_getexp_round_pd(A, R) \ + ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \ + (__v8df)_mm512_setzero_pd(), (__mmask8)-1, R)) + +#define _mm512_mask_getexp_round_pd(W, U, A, R) \ + ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(W), (__mmask8)(U), R)) + +#define _mm512_maskz_getexp_round_pd(U, A, R) \ + ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \ + (__v8df)_mm512_setzero_pd(), (__mmask8)(U), R)) +#endif + +#ifdef __OPTIMIZE__ +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_roundscale_round_ps (__m512 __A, const int __imm, const int __R) +{ + return (__m512) 
__builtin_ia32_rndscaleps_mask ((__v16sf) __A, __imm, + (__v16sf) __A, -1, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_roundscale_round_ps (__m512 __A, __mmask16 __B, __m512 __C, + const int __imm, const int __R) +{ + return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __C, __imm, + (__v16sf) __A, + (__mmask16) __B, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_roundscale_round_ps (__mmask16 __A, __m512 __B, + const int __imm, const int __R) +{ + return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __B, + __imm, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __A, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_roundscale_round_pd (__m512d __A, const int __imm, const int __R) +{ + return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A, __imm, + (__v8df) __A, -1, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_roundscale_round_pd (__m512d __A, __mmask8 __B, + __m512d __C, const int __imm, const int __R) +{ + return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __C, __imm, + (__v8df) __A, + (__mmask8) __B, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_roundscale_round_pd (__mmask8 __A, __m512d __B, + const int __imm, const int __R) +{ + return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __B, + __imm, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __A, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_roundscale_round_ss (__m128 __A, __m128 __B, const int __imm, const int __R) +{ + return (__m128) __builtin_ia32_rndscaless_round ((__v4sf) __A, + (__v4sf) __B, __imm, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_roundscale_round_sd (__m128d __A, __m128d __B, const int __imm, + const int __R) +{ + return (__m128d) __builtin_ia32_rndscalesd_round ((__v2df) __A, + (__v2df) __B, __imm, __R); +} + +#else +#define _mm512_roundscale_round_ps(A, B, R) \ + ((__m512) __builtin_ia32_rndscaleps_mask ((__v16sf)(__m512)(A), (int)(B),\ + (__v16sf)(__m512)(A), (__mmask16)(-1), R)) +#define _mm512_mask_roundscale_round_ps(A, B, C, D, R) \ + ((__m512) __builtin_ia32_rndscaleps_mask ((__v16sf)(__m512)(C), \ + (int)(D), \ + (__v16sf)(__m512)(A), \ + (__mmask16)(B), R)) +#define _mm512_maskz_roundscale_round_ps(A, B, C, R) \ + ((__m512) __builtin_ia32_rndscaleps_mask ((__v16sf)(__m512)(B), \ + (int)(C), \ + (__v16sf)_mm512_setzero_ps(),\ + (__mmask16)(A), R)) +#define _mm512_roundscale_round_pd(A, B, R) \ + ((__m512d) __builtin_ia32_rndscalepd_mask ((__v8df)(__m512d)(A), (int)(B),\ + (__v8df)(__m512d)(A), (__mmask8)(-1), R)) +#define _mm512_mask_roundscale_round_pd(A, B, C, D, R) \ + ((__m512d) __builtin_ia32_rndscalepd_mask ((__v8df)(__m512d)(C), \ + (int)(D), \ + (__v8df)(__m512d)(A), \ + (__mmask8)(B), R)) +#define _mm512_maskz_roundscale_round_pd(A, B, C, R) \ + ((__m512d) __builtin_ia32_rndscalepd_mask ((__v8df)(__m512d)(B), \ + (int)(C), \ + (__v8df)_mm512_setzero_pd(),\ + (__mmask8)(A), R)) +#define _mm_roundscale_round_ss(A, B, C, R) \ + ((__m128) __builtin_ia32_rndscaless_round ((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), (int)(C), R)) +#define _mm_roundscale_round_sd(A, B, C, R) \ + ((__m128d) 
__builtin_ia32_rndscalesd_round ((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), (int)(C), R)) +#endif + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_floor_ps (__m512 __A) +{ + return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A, + _MM_FROUND_FLOOR, + (__v16sf) __A, -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_floor_pd (__m512d __A) +{ + return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A, + _MM_FROUND_FLOOR, + (__v8df) __A, -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_ceil_ps (__m512 __A) +{ + return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A, + _MM_FROUND_CEIL, + (__v16sf) __A, -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_ceil_pd (__m512d __A) +{ + return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A, + _MM_FROUND_CEIL, + (__v8df) __A, -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_floor_ps (__m512 __W, __mmask16 __U, __m512 __A) +{ + return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A, + _MM_FROUND_FLOOR, + (__v16sf) __W, __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_floor_pd (__m512d __W, __mmask8 __U, __m512d __A) +{ + return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A, + _MM_FROUND_FLOOR, + (__v8df) __W, __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_ceil_ps (__m512 __W, __mmask16 __U, __m512 __A) +{ + return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A, + _MM_FROUND_CEIL, + (__v16sf) __W, __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_ceil_pd (__m512d __W, __mmask8 __U, __m512d __A) +{ + return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A, + _MM_FROUND_CEIL, + (__v8df) __W, __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_floor_ps (__mmask16 __U, __m512 __A) +{ + return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A, + _MM_FROUND_FLOOR, + (__v16sf) + _mm512_setzero_ps (), + __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_floor_pd (__mmask8 __U, __m512d __A) +{ + return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A, + _MM_FROUND_FLOOR, + (__v8df) + _mm512_setzero_pd (), + __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_ceil_ps (__mmask16 __U, __m512 __A) +{ + return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A, + _MM_FROUND_CEIL, + (__v16sf) + _mm512_setzero_ps (), + __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_ceil_pd (__mmask8 __U, __m512d __A) +{ + return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A, + _MM_FROUND_CEIL, + (__v8df) + _mm512_setzero_pd (), + 
__U, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_floor_round_ps (__m512 __A, const int __R) +{ + return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A, + _MM_FROUND_FLOOR, + (__v16sf) __A, -1, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_floor_round_pd (__m512d __A, const int __R) +{ + return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A, + _MM_FROUND_FLOOR, + (__v8df) __A, -1, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_ceil_round_ps (__m512 __A, const int __R) +{ + return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A, + _MM_FROUND_CEIL, + (__v16sf) __A, -1, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_ceil_round_pd (__m512d __A, const int __R) +{ + return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A, + _MM_FROUND_CEIL, + (__v8df) __A, -1, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_floor_round_ps (__m512 __W, __mmask16 __U, __m512 __A, + const int __R) +{ + return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A, + _MM_FROUND_FLOOR, + (__v16sf) __W, __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_floor_round_pd (__m512d __W, __mmask8 __U, __m512d __A, + const int __R) +{ + return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A, + _MM_FROUND_FLOOR, + (__v8df) __W, __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_ceil_round_ps (__m512 __W, __mmask16 __U, __m512 __A, const int __R) +{ + return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A, + _MM_FROUND_CEIL, + (__v16sf) __W, __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_ceil_round_pd (__m512d __W, __mmask8 __U, __m512d __A, + const int __R) +{ + return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A, + _MM_FROUND_CEIL, + (__v8df) __W, __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_floor_round_ps (__mmask16 __U, __m512 __A, const int __R) +{ + return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A, + _MM_FROUND_FLOOR, + (__v16sf) + _mm512_setzero_ps (), + __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_floor_round_pd (__mmask8 __U, __m512d __A, const int __R) +{ + return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A, + _MM_FROUND_FLOOR, + (__v8df) + _mm512_setzero_pd (), + __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_ceil_round_ps (__mmask16 __U, __m512 __A, const int __R) +{ + return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A, + _MM_FROUND_CEIL, + (__v16sf) + _mm512_setzero_ps (), + __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_ceil_round_pd (__mmask8 __U, __m512d __A, const int __R) +{ + return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A, + _MM_FROUND_CEIL, + (__v8df) + _mm512_setzero_pd (), + __U, __R); +} + +extern __inline __m512i 
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_alignr_epi32 (__m512i __A, __m512i __B, const int __imm) +{ + return (__m512i) __builtin_ia32_alignd512_mask ((__v16si) __A, + (__v16si) __B, __imm, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_alignr_epi32 (__m512i __W, __mmask16 __U, __m512i __A, + __m512i __B, const int __imm) +{ + return (__m512i) __builtin_ia32_alignd512_mask ((__v16si) __A, + (__v16si) __B, __imm, + (__v16si) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_alignr_epi32 (__mmask16 __U, __m512i __A, __m512i __B, + const int __imm) +{ + return (__m512i) __builtin_ia32_alignd512_mask ((__v16si) __A, + (__v16si) __B, __imm, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_alignr_epi64 (__m512i __A, __m512i __B, const int __imm) +{ + return (__m512i) __builtin_ia32_alignq512_mask ((__v8di) __A, + (__v8di) __B, __imm, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_alignr_epi64 (__m512i __W, __mmask8 __U, __m512i __A, + __m512i __B, const int __imm) +{ + return (__m512i) __builtin_ia32_alignq512_mask ((__v8di) __A, + (__v8di) __B, __imm, + (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_alignr_epi64 (__mmask8 __U, __m512i __A, __m512i __B, + const int __imm) +{ + return (__m512i) __builtin_ia32_alignq512_mask ((__v8di) __A, + (__v8di) __B, __imm, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} +#else +#define _mm512_floor_round_ps(A, R) \ + ((__m512) __builtin_ia32_rndscaleps_mask ((__v16sf)(__m512)(A), \ + _MM_FROUND_FLOOR, \ + (__v16sf)(__m512)(A), \ + (__mmask16)(-1), R)) +#define _mm512_mask_floor_round_ps(A, B, C, R) \ + ((__m512) __builtin_ia32_rndscaleps_mask ((__v16sf)(__m512)(C), \ + _MM_FROUND_FLOOR, \ + (__v16sf)(__m512)(A), \ + (__mmask16)(B), R)) +#define _mm512_maskz_floor_round_ps(A, B, R) \ + ((__m512) __builtin_ia32_rndscaleps_mask ((__v16sf)(__m512)(B), \ + _MM_FROUND_FLOOR, \ + (__v16sf)_mm512_setzero_ps(),\ + (__mmask16)(A), R)) +#define _mm512_floor_round_pd(A, R) \ + ((__m512d) __builtin_ia32_rndscalepd_mask ((__v8df)(__m512d)(A), \ + _MM_FROUND_FLOOR, \ + (__v8df)(__m512d)(A), \ + (__mmask8)(-1), R)) +#define _mm512_mask_floor_round_pd(A, B, C, R) \ + ((__m512d) __builtin_ia32_rndscalepd_mask ((__v8df)(__m512d)(C), \ + _MM_FROUND_FLOOR, \ + (__v8df)(__m512d)(A), \ + (__mmask8)(B), R)) +#define _mm512_maskz_floor_round_pd(A, B, R) \ + ((__m512d) __builtin_ia32_rndscalepd_mask ((__v8df)(__m512d)(B), \ + _MM_FROUND_FLOOR, \ + (__v8df)_mm512_setzero_pd(),\ + (__mmask8)(A), R)) +#define _mm512_ceil_round_ps(A, R) \ + ((__m512) __builtin_ia32_rndscaleps_mask ((__v16sf)(__m512)(A), \ + _MM_FROUND_CEIL, \ + (__v16sf)(__m512)(A), \ + (__mmask16)(-1), R)) +#define _mm512_mask_ceil_round_ps(A, B, C, R) \ + ((__m512) __builtin_ia32_rndscaleps_mask ((__v16sf)(__m512)(C), \ + _MM_FROUND_CEIL, \ + (__v16sf)(__m512)(A), \ + (__mmask16)(B), R)) +#define _mm512_maskz_ceil_round_ps(A, B, R) \ + ((__m512) __builtin_ia32_rndscaleps_mask ((__v16sf)(__m512)(B), \ + _MM_FROUND_CEIL, \ + 
(__v16sf)_mm512_setzero_ps(),\ + (__mmask16)(A), R)) +#define _mm512_ceil_round_pd(A, R) \ + ((__m512d) __builtin_ia32_rndscalepd_mask ((__v8df)(__m512d)(A), \ + _MM_FROUND_CEIL, \ + (__v8df)(__m512d)(A), \ + (__mmask8)(-1), R)) +#define _mm512_mask_ceil_round_pd(A, B, C, R) \ + ((__m512d) __builtin_ia32_rndscalepd_mask ((__v8df)(__m512d)(C), \ + _MM_FROUND_CEIL, \ + (__v8df)(__m512d)(A), \ + (__mmask8)(B), R)) +#define _mm512_maskz_ceil_round_pd(A, B, R) \ + ((__m512d) __builtin_ia32_rndscalepd_mask ((__v8df)(__m512d)(B), \ + _MM_FROUND_CEIL, \ + (__v8df)_mm512_setzero_pd(),\ + (__mmask8)(A), R)) + +#define _mm512_alignr_epi32(X, Y, C) \ + ((__m512i)__builtin_ia32_alignd512_mask ((__v16si)(__m512i)(X), \ + (__v16si)(__m512i)(Y), (int)(C), (__v16si)(__m512i)(X), \ + (__mmask16)-1)) + +#define _mm512_mask_alignr_epi32(W, U, X, Y, C) \ + ((__m512i)__builtin_ia32_alignd512_mask ((__v16si)(__m512i)(X), \ + (__v16si)(__m512i)(Y), (int)(C), (__v16si)(__m512i)(W), \ + (__mmask16)(U))) + +#define _mm512_maskz_alignr_epi32(U, X, Y, C) \ + ((__m512i)__builtin_ia32_alignd512_mask ((__v16si)(__m512i)(X), \ + (__v16si)(__m512i)(Y), (int)(C), (__v16si)(__m512i)_mm512_setzero_si512 (),\ + (__mmask16)(U))) + +#define _mm512_alignr_epi64(X, Y, C) \ + ((__m512i)__builtin_ia32_alignq512_mask ((__v8di)(__m512i)(X), \ + (__v8di)(__m512i)(Y), (int)(C), (__v8di)(__m512i)(X), (__mmask8)-1)) + +#define _mm512_mask_alignr_epi64(W, U, X, Y, C) \ + ((__m512i)__builtin_ia32_alignq512_mask ((__v8di)(__m512i)(X), \ + (__v8di)(__m512i)(Y), (int)(C), (__v8di)(__m512i)(W), (__mmask8)(U))) + +#define _mm512_maskz_alignr_epi64(U, X, Y, C) \ + ((__m512i)__builtin_ia32_alignq512_mask ((__v8di)(__m512i)(X), \ + (__v8di)(__m512i)(Y), (int)(C), (__v8di)(__m512i)_mm512_setzero_si512 (),\ + (__mmask8)(U))) +#endif + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpeq_epi32_mask (__m512i __A, __m512i __B) +{ + return (__mmask16) __builtin_ia32_pcmpeqd512_mask ((__v16si) __A, + (__v16si) __B, + (__mmask16) -1); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpeq_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B) +{ + return (__mmask16) __builtin_ia32_pcmpeqd512_mask ((__v16si) __A, + (__v16si) __B, __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpeq_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B) +{ + return (__mmask8) __builtin_ia32_pcmpeqq512_mask ((__v8di) __A, + (__v8di) __B, __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpeq_epi64_mask (__m512i __A, __m512i __B) +{ + return (__mmask8) __builtin_ia32_pcmpeqq512_mask ((__v8di) __A, + (__v8di) __B, + (__mmask8) -1); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpgt_epi32_mask (__m512i __A, __m512i __B) +{ + return (__mmask16) __builtin_ia32_pcmpgtd512_mask ((__v16si) __A, + (__v16si) __B, + (__mmask16) -1); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpgt_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B) +{ + return (__mmask16) __builtin_ia32_pcmpgtd512_mask ((__v16si) __A, + (__v16si) __B, __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpgt_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B) +{ + return 
(__mmask8) __builtin_ia32_pcmpgtq512_mask ((__v8di) __A, + (__v8di) __B, __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpgt_epi64_mask (__m512i __A, __m512i __B) +{ + return (__mmask8) __builtin_ia32_pcmpgtq512_mask ((__v8di) __A, + (__v8di) __B, + (__mmask8) -1); +} + +#define _MM_CMPINT_EQ 0x0 +#define _MM_CMPINT_LT 0x1 +#define _MM_CMPINT_LE 0x2 +#define _MM_CMPINT_UNUSED 0x3 +#define _MM_CMPINT_NE 0x4 +#define _MM_CMPINT_NLT 0x5 +#define _MM_CMPINT_GE 0x5 +#define _MM_CMPINT_NLE 0x6 +#define _MM_CMPINT_GT 0x6 + +#ifdef __OPTIMIZE__ +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmp_epi64_mask (__m512i __X, __m512i __Y, const int __P) +{ + return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X, + (__v8di) __Y, __P, + (__mmask8) -1); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmp_epi32_mask (__m512i __X, __m512i __Y, const int __P) +{ + return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X, + (__v16si) __Y, __P, + (__mmask16) -1); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmp_epu64_mask (__m512i __X, __m512i __Y, const int __P) +{ + return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X, + (__v8di) __Y, __P, + (__mmask8) -1); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmp_epu32_mask (__m512i __X, __m512i __Y, const int __P) +{ + return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X, + (__v16si) __Y, __P, + (__mmask16) -1); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmp_round_pd_mask (__m512d __X, __m512d __Y, const int __P, + const int __R) +{ + return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X, + (__v8df) __Y, __P, + (__mmask8) -1, __R); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmp_round_ps_mask (__m512 __X, __m512 __Y, const int __P, const int __R) +{ + return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X, + (__v16sf) __Y, __P, + (__mmask16) -1, __R); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmp_epi64_mask (__mmask8 __U, __m512i __X, __m512i __Y, + const int __P) +{ + return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X, + (__v8di) __Y, __P, + (__mmask8) __U); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmp_epi32_mask (__mmask16 __U, __m512i __X, __m512i __Y, + const int __P) +{ + return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X, + (__v16si) __Y, __P, + (__mmask16) __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmp_epu64_mask (__mmask8 __U, __m512i __X, __m512i __Y, + const int __P) +{ + return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X, + (__v8di) __Y, __P, + (__mmask8) __U); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmp_epu32_mask (__mmask16 __U, __m512i __X, __m512i __Y, + const int __P) +{ + return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X, + (__v16si) __Y, __P, + (__mmask16) __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, 
__always_inline__, __artificial__)) +_mm512_mask_cmp_round_pd_mask (__mmask8 __U, __m512d __X, __m512d __Y, + const int __P, const int __R) +{ + return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X, + (__v8df) __Y, __P, + (__mmask8) __U, __R); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmp_round_ps_mask (__mmask16 __U, __m512 __X, __m512 __Y, + const int __P, const int __R) +{ + return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X, + (__v16sf) __Y, __P, + (__mmask16) __U, __R); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmp_round_sd_mask (__m128d __X, __m128d __Y, const int __P, const int __R) +{ + return (__mmask8) __builtin_ia32_cmpsd_mask ((__v2df) __X, + (__v2df) __Y, __P, + (__mmask8) -1, __R); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmp_round_sd_mask (__mmask8 __M, __m128d __X, __m128d __Y, + const int __P, const int __R) +{ + return (__mmask8) __builtin_ia32_cmpsd_mask ((__v2df) __X, + (__v2df) __Y, __P, + (__mmask8) __M, __R); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmp_round_ss_mask (__m128 __X, __m128 __Y, const int __P, const int __R) +{ + return (__mmask8) __builtin_ia32_cmpss_mask ((__v4sf) __X, + (__v4sf) __Y, __P, + (__mmask8) -1, __R); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmp_round_ss_mask (__mmask8 __M, __m128 __X, __m128 __Y, + const int __P, const int __R) +{ + return (__mmask8) __builtin_ia32_cmpss_mask ((__v4sf) __X, + (__v4sf) __Y, __P, + (__mmask8) __M, __R); +} + +#else +#define _mm512_cmp_epi64_mask(X, Y, P) \ + ((__mmask8) __builtin_ia32_cmpq512_mask ((__v8di)(__m512i)(X), \ + (__v8di)(__m512i)(Y), (int)(P),\ + (__mmask8)-1)) + +#define _mm512_cmp_epi32_mask(X, Y, P) \ + ((__mmask8) __builtin_ia32_cmpd512_mask ((__v16si)(__m512i)(X), \ + (__v16si)(__m512i)(Y), (int)(P),\ + (__mmask16)-1)) + +#define _mm512_cmp_epu64_mask(X, Y, P) \ + ((__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di)(__m512i)(X), \ + (__v8di)(__m512i)(Y), (int)(P),\ + (__mmask8)-1)) + +#define _mm512_cmp_epu32_mask(X, Y, P) \ + ((__mmask8) __builtin_ia32_ucmpd512_mask ((__v16si)(__m512i)(X), \ + (__v16si)(__m512i)(Y), (int)(P),\ + (__mmask16)-1)) + +#define _mm512_cmp_round_pd_mask(X, Y, P, R) \ + ((__mmask8) __builtin_ia32_cmppd512_mask ((__v8df)(__m512d)(X), \ + (__v8df)(__m512d)(Y), (int)(P),\ + (__mmask8)-1, R)) + +#define _mm512_cmp_round_ps_mask(X, Y, P, R) \ + ((__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf)(__m512)(X), \ + (__v16sf)(__m512)(Y), (int)(P),\ + (__mmask16)-1, R)) + +#define _mm512_mask_cmp_epi64_mask(M, X, Y, P) \ + ((__mmask8) __builtin_ia32_cmpq512_mask ((__v8di)(__m512i)(X), \ + (__v8di)(__m512i)(Y), (int)(P),\ + (__mmask8)M)) + +#define _mm512_mask_cmp_epi32_mask(M, X, Y, P) \ + ((__mmask8) __builtin_ia32_cmpd512_mask ((__v16si)(__m512i)(X), \ + (__v16si)(__m512i)(Y), (int)(P),\ + (__mmask16)M)) + +#define _mm512_mask_cmp_epu64_mask(M, X, Y, P) \ + ((__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di)(__m512i)(X), \ + (__v8di)(__m512i)(Y), (int)(P),\ + (__mmask8)M)) + +#define _mm512_mask_cmp_epu32_mask(M, X, Y, P) \ + ((__mmask8) __builtin_ia32_ucmpd512_mask ((__v16si)(__m512i)(X), \ + (__v16si)(__m512i)(Y), (int)(P),\ + (__mmask16)M)) + +#define _mm512_mask_cmp_round_pd_mask(M, X, Y, P, R) \ + ((__mmask8) 
__builtin_ia32_cmppd512_mask ((__v8df)(__m512d)(X), \ + (__v8df)(__m512d)(Y), (int)(P),\ + (__mmask8)M, R)) + +#define _mm512_mask_cmp_round_ps_mask(M, X, Y, P, R) \ + ((__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf)(__m512)(X), \ + (__v16sf)(__m512)(Y), (int)(P),\ + (__mmask16)M, R)) + +#define _mm_cmp_round_sd_mask(X, Y, P, R) \ + ((__mmask8) __builtin_ia32_cmpsd_mask ((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), (int)(P),\ + (__mmask8)-1, R)) + +#define _mm_mask_cmp_round_sd_mask(M, X, Y, P, R) \ + ((__mmask8) __builtin_ia32_cmpsd_mask ((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), (int)(P),\ + (M), R)) + +#define _mm_cmp_round_ss_mask(X, Y, P, R) \ + ((__mmask8) __builtin_ia32_cmpss_mask ((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), (int)(P), \ + (__mmask8)-1, R)) + +#define _mm_mask_cmp_round_ss_mask(M, X, Y, P, R) \ + ((__mmask8) __builtin_ia32_cmpss_mask ((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), (int)(P), \ + (M), R)) +#endif + +#ifdef __OPTIMIZE__ +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_i32gather_ps (__m512i __index, float const *__addr, int __scale) +{ + __m512 v1_old = _mm512_setzero_ps (); + __mmask16 mask = 0xFFFF; + + return (__m512) __builtin_ia32_gathersiv16sf ((__v16sf) v1_old, + __addr, + (__v16si) __index, + mask, __scale); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_i32gather_ps (__m512 v1_old, __mmask16 __mask, + __m512i __index, float const *__addr, int __scale) +{ + return (__m512) __builtin_ia32_gathersiv16sf ((__v16sf) v1_old, + __addr, + (__v16si) __index, + __mask, __scale); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_i32gather_pd (__m256i __index, double const *__addr, int __scale) +{ + __m512d v1_old = _mm512_setzero_pd (); + __mmask8 mask = 0xFF; + + return (__m512d) __builtin_ia32_gathersiv8df ((__v8df) v1_old, + __addr, + (__v8si) __index, mask, + __scale); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_i32gather_pd (__m512d __v1_old, __mmask8 __mask, + __m256i __index, double const *__addr, int __scale) +{ + return (__m512d) __builtin_ia32_gathersiv8df ((__v8df) __v1_old, + __addr, + (__v8si) __index, + __mask, __scale); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_i64gather_ps (__m512i __index, float const *__addr, int __scale) +{ + __m256 v1_old = _mm256_setzero_ps (); + __mmask8 mask = 0xFF; + + return (__m256) __builtin_ia32_gatherdiv16sf ((__v8sf) v1_old, + __addr, + (__v8di) __index, mask, + __scale); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_i64gather_ps (__m256 __v1_old, __mmask8 __mask, + __m512i __index, float const *__addr, int __scale) +{ + return (__m256) __builtin_ia32_gatherdiv16sf ((__v8sf) __v1_old, + __addr, + (__v8di) __index, + __mask, __scale); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_i64gather_pd (__m512i __index, double const *__addr, int __scale) +{ + __m512d v1_old = _mm512_setzero_pd (); + __mmask8 mask = 0xFF; + + return (__m512d) __builtin_ia32_gatherdiv8df ((__v8df) v1_old, + __addr, + (__v8di) __index, mask, + __scale); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_i64gather_pd (__m512d __v1_old, __mmask8 
__mask, + __m512i __index, double const *__addr, int __scale) +{ + return (__m512d) __builtin_ia32_gatherdiv8df ((__v8df) __v1_old, + __addr, + (__v8di) __index, + __mask, __scale); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_i32gather_epi32 (__m512i __index, int const *__addr, int __scale) +{ + __m512i v1_old = _mm512_setzero_si512 (); + __mmask16 mask = 0xFFFF; + + return (__m512i) __builtin_ia32_gathersiv16si ((__v16si) v1_old, + __addr, + (__v16si) __index, + mask, __scale); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_i32gather_epi32 (__m512i __v1_old, __mmask16 __mask, + __m512i __index, int const *__addr, int __scale) +{ + return (__m512i) __builtin_ia32_gathersiv16si ((__v16si) __v1_old, + __addr, + (__v16si) __index, + __mask, __scale); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_i32gather_epi64 (__m256i __index, long long const *__addr, int __scale) +{ + __m512i v1_old = _mm512_setzero_si512 (); + __mmask8 mask = 0xFF; + + return (__m512i) __builtin_ia32_gathersiv8di ((__v8di) v1_old, + __addr, + (__v8si) __index, mask, + __scale); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_i32gather_epi64 (__m512i __v1_old, __mmask8 __mask, + __m256i __index, long long const *__addr, + int __scale) +{ + return (__m512i) __builtin_ia32_gathersiv8di ((__v8di) __v1_old, + __addr, + (__v8si) __index, + __mask, __scale); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_i64gather_epi32 (__m512i __index, int const *__addr, int __scale) +{ + __m256i v1_old = _mm256_setzero_si256 (); + __mmask8 mask = 0xFF; + + return (__m256i) __builtin_ia32_gatherdiv16si ((__v8si) v1_old, + __addr, + (__v8di) __index, + mask, __scale); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_i64gather_epi32 (__m256i __v1_old, __mmask8 __mask, + __m512i __index, int const *__addr, int __scale) +{ + return (__m256i) __builtin_ia32_gatherdiv16si ((__v8si) __v1_old, + __addr, + (__v8di) __index, + __mask, __scale); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_i64gather_epi64 (__m512i __index, long long const *__addr, int __scale) +{ + __m512i v1_old = _mm512_setzero_si512 (); + __mmask8 mask = 0xFF; + + return (__m512i) __builtin_ia32_gatherdiv8di ((__v8di) v1_old, + __addr, + (__v8di) __index, mask, + __scale); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_i64gather_epi64 (__m512i __v1_old, __mmask8 __mask, + __m512i __index, long long const *__addr, + int __scale) +{ + return (__m512i) __builtin_ia32_gatherdiv8di ((__v8di) __v1_old, + __addr, + (__v8di) __index, + __mask, __scale); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_i32scatter_ps (float *__addr, __m512i __index, __m512 __v1, int __scale) +{ + __builtin_ia32_scattersiv16sf (__addr, (__mmask16) 0xFFFF, + (__v16si) __index, (__v16sf) __v1, __scale); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_i32scatter_ps (float *__addr, __mmask16 __mask, + __m512i __index, __m512 __v1, int __scale) +{ + __builtin_ia32_scattersiv16sf (__addr, __mask, (__v16si) 
__index, + (__v16sf) __v1, __scale); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_i32scatter_pd (double *__addr, __m256i __index, __m512d __v1, + int __scale) +{ + __builtin_ia32_scattersiv8df (__addr, (__mmask8) 0xFF, + (__v8si) __index, (__v8df) __v1, __scale); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_i32scatter_pd (double *__addr, __mmask8 __mask, + __m256i __index, __m512d __v1, int __scale) +{ + __builtin_ia32_scattersiv8df (__addr, __mask, (__v8si) __index, + (__v8df) __v1, __scale); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_i64scatter_ps (float *__addr, __m512i __index, __m256 __v1, int __scale) +{ + __builtin_ia32_scatterdiv16sf (__addr, (__mmask8) 0xFF, + (__v8di) __index, (__v8sf) __v1, __scale); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_i64scatter_ps (float *__addr, __mmask8 __mask, + __m512i __index, __m256 __v1, int __scale) +{ + __builtin_ia32_scatterdiv16sf (__addr, __mask, (__v8di) __index, + (__v8sf) __v1, __scale); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_i64scatter_pd (double *__addr, __m512i __index, __m512d __v1, + int __scale) +{ + __builtin_ia32_scatterdiv8df (__addr, (__mmask8) 0xFF, + (__v8di) __index, (__v8df) __v1, __scale); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_i64scatter_pd (double *__addr, __mmask8 __mask, + __m512i __index, __m512d __v1, int __scale) +{ + __builtin_ia32_scatterdiv8df (__addr, __mask, (__v8di) __index, + (__v8df) __v1, __scale); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_i32scatter_epi32 (int *__addr, __m512i __index, + __m512i __v1, int __scale) +{ + __builtin_ia32_scattersiv16si (__addr, (__mmask16) 0xFFFF, + (__v16si) __index, (__v16si) __v1, __scale); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_i32scatter_epi32 (int *__addr, __mmask16 __mask, + __m512i __index, __m512i __v1, int __scale) +{ + __builtin_ia32_scattersiv16si (__addr, __mask, (__v16si) __index, + (__v16si) __v1, __scale); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_i32scatter_epi64 (long long *__addr, __m256i __index, + __m512i __v1, int __scale) +{ + __builtin_ia32_scattersiv8di (__addr, (__mmask8) 0xFF, + (__v8si) __index, (__v8di) __v1, __scale); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_i32scatter_epi64 (long long *__addr, __mmask8 __mask, + __m256i __index, __m512i __v1, int __scale) +{ + __builtin_ia32_scattersiv8di (__addr, __mask, (__v8si) __index, + (__v8di) __v1, __scale); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_i64scatter_epi32 (int *__addr, __m512i __index, + __m256i __v1, int __scale) +{ + __builtin_ia32_scatterdiv16si (__addr, (__mmask8) 0xFF, + (__v8di) __index, (__v8si) __v1, __scale); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_i64scatter_epi32 (int *__addr, __mmask8 __mask, + __m512i __index, __m256i __v1, int __scale) +{ + __builtin_ia32_scatterdiv16si (__addr, __mask, (__v8di) __index, + (__v8si) 
__v1, __scale); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_i64scatter_epi64 (long long *__addr, __m512i __index, + __m512i __v1, int __scale) +{ + __builtin_ia32_scatterdiv8di (__addr, (__mmask8) 0xFF, + (__v8di) __index, (__v8di) __v1, __scale); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_i64scatter_epi64 (long long *__addr, __mmask8 __mask, + __m512i __index, __m512i __v1, int __scale) +{ + __builtin_ia32_scatterdiv8di (__addr, __mask, (__v8di) __index, + (__v8di) __v1, __scale); +} +#else +#define _mm512_i32gather_ps(INDEX, ADDR, SCALE) \ + (__m512) __builtin_ia32_gathersiv16sf ((__v16sf)_mm512_setzero_ps(), \ + (float const *)ADDR, \ + (__v16si)(__m512i)INDEX, \ + (__mmask16)0xFFFF, (int)SCALE) + +#define _mm512_mask_i32gather_ps(V1OLD, MASK, INDEX, ADDR, SCALE) \ + (__m512) __builtin_ia32_gathersiv16sf ((__v16sf)(__m512)V1OLD, \ + (float const *)ADDR, \ + (__v16si)(__m512i)INDEX, \ + (__mmask16)MASK, (int)SCALE) + +#define _mm512_i32gather_pd(INDEX, ADDR, SCALE) \ + (__m512d) __builtin_ia32_gathersiv8df ((__v8df)_mm512_setzero_pd(), \ + (double const *)ADDR, \ + (__v8si)(__m256i)INDEX, \ + (__mmask8)0xFF, (int)SCALE) + +#define _mm512_mask_i32gather_pd(V1OLD, MASK, INDEX, ADDR, SCALE) \ + (__m512d) __builtin_ia32_gathersiv8df ((__v8df)(__m512d)V1OLD, \ + (double const *)ADDR, \ + (__v8si)(__m256i)INDEX, \ + (__mmask8)MASK, (int)SCALE) + +#define _mm512_i64gather_ps(INDEX, ADDR, SCALE) \ + (__m256) __builtin_ia32_gatherdiv16sf ((__v8sf)_mm256_setzero_ps(), \ + (float const *)ADDR, \ + (__v8di)(__m512i)INDEX, \ + (__mmask8)0xFF, (int)SCALE) + +#define _mm512_mask_i64gather_ps(V1OLD, MASK, INDEX, ADDR, SCALE) \ + (__m256) __builtin_ia32_gatherdiv16sf ((__v8sf)(__m256)V1OLD, \ + (float const *)ADDR, \ + (__v8di)(__m512i)INDEX, \ + (__mmask8)MASK, (int)SCALE) + +#define _mm512_i64gather_pd(INDEX, ADDR, SCALE) \ + (__m512d) __builtin_ia32_gatherdiv8df ((__v8df)_mm512_setzero_pd(), \ + (double const *)ADDR, \ + (__v8di)(__m512i)INDEX, \ + (__mmask8)0xFF, (int)SCALE) + +#define _mm512_mask_i64gather_pd(V1OLD, MASK, INDEX, ADDR, SCALE) \ + (__m512d) __builtin_ia32_gatherdiv8df ((__v8df)(__m512d)V1OLD, \ + (double const *)ADDR, \ + (__v8di)(__m512i)INDEX, \ + (__mmask8)MASK, (int)SCALE) + +#define _mm512_i32gather_epi32(INDEX, ADDR, SCALE) \ + (__m512i) __builtin_ia32_gathersiv16si ((__v16si)_mm512_setzero_si512 (), \ + (int const *)ADDR, \ + (__v16si)(__m512i)INDEX, \ + (__mmask16)0xFFFF, (int)SCALE) + +#define _mm512_mask_i32gather_epi32(V1OLD, MASK, INDEX, ADDR, SCALE) \ + (__m512i) __builtin_ia32_gathersiv16si ((__v16si)(__m512i)V1OLD, \ + (int const *)ADDR, \ + (__v16si)(__m512i)INDEX, \ + (__mmask16)MASK, (int)SCALE) + +#define _mm512_i32gather_epi64(INDEX, ADDR, SCALE) \ + (__m512i) __builtin_ia32_gathersiv8di ((__v8di)_mm512_setzero_si512 (), \ + (long long const *)ADDR, \ + (__v8si)(__m256i)INDEX, \ + (__mmask8)0xFF, (int)SCALE) + +#define _mm512_mask_i32gather_epi64(V1OLD, MASK, INDEX, ADDR, SCALE) \ + (__m512i) __builtin_ia32_gathersiv8di ((__v8di)(__m512i)V1OLD, \ + (long long const *)ADDR, \ + (__v8si)(__m256i)INDEX, \ + (__mmask8)MASK, (int)SCALE) + +#define _mm512_i64gather_epi32(INDEX, ADDR, SCALE) \ + (__m256i) __builtin_ia32_gatherdiv16si ((__v8si)_mm256_setzero_si256(), \ + (int const *)ADDR, \ + (__v8di)(__m512i)INDEX, \ + (__mmask8)0xFF, (int)SCALE) + +#define _mm512_mask_i64gather_epi32(V1OLD, MASK, INDEX, ADDR, SCALE) \ + (__m256i) 
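The gather and scatter intrinsics above all follow the same shape: an index vector, a base address, a compile-time scale, and (in the _mask_ forms) a write mask. A small usage sketch, my own code rather than part of the patch, assuming an -mavx512f target and indices that stay inside the arrays:

#include <immintrin.h>

/* Load src[idx[i]] into all 16 lanes, then store only the even lanes
   back to dst[idx[i]]; the scale of 4 is sizeof (float).  */
void
gather_scatter_demo (float *dst, float const *src, __m512i idx)
{
  __m512 v = _mm512_i32gather_ps (idx, src, 4);
  __mmask16 even_lanes = 0x5555;
  _mm512_mask_i32scatter_ps (dst, even_lanes, idx, v, 4);
}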
__builtin_ia32_gatherdiv16si ((__v8si)(__m256i)V1OLD, \ + (int const *)ADDR, \ + (__v8di)(__m512i)INDEX, \ + (__mmask8)MASK, (int)SCALE) + +#define _mm512_i64gather_epi64(INDEX, ADDR, SCALE) \ + (__m512i) __builtin_ia32_gatherdiv8di ((__v8di)_mm512_setzero_si512 (), \ + (long long const *)ADDR, \ + (__v8di)(__m512i)INDEX, \ + (__mmask8)0xFF, (int)SCALE) + +#define _mm512_mask_i64gather_epi64(V1OLD, MASK, INDEX, ADDR, SCALE) \ + (__m512i) __builtin_ia32_gatherdiv8di ((__v8di)(__m512i)V1OLD, \ + (long long const *)ADDR, \ + (__v8di)(__m512i)INDEX, \ + (__mmask8)MASK, (int)SCALE) + +#define _mm512_i32scatter_ps(ADDR, INDEX, V1, SCALE) \ + __builtin_ia32_scattersiv16sf ((float *)ADDR, (__mmask16)0xFFFF, \ + (__v16si)(__m512i)INDEX, \ + (__v16sf)(__m512)V1, (int)SCALE) + +#define _mm512_mask_i32scatter_ps(ADDR, MASK, INDEX, V1, SCALE) \ + __builtin_ia32_scattersiv16sf ((float *)ADDR, (__mmask16)MASK, \ + (__v16si)(__m512i)INDEX, \ + (__v16sf)(__m512)V1, (int)SCALE) + +#define _mm512_i32scatter_pd(ADDR, INDEX, V1, SCALE) \ + __builtin_ia32_scattersiv8df ((double *)ADDR, (__mmask8)0xFF, \ + (__v8si)(__m256i)INDEX, \ + (__v8df)(__m512d)V1, (int)SCALE) + +#define _mm512_mask_i32scatter_pd(ADDR, MASK, INDEX, V1, SCALE) \ + __builtin_ia32_scattersiv8df ((double *)ADDR, (__mmask8)MASK, \ + (__v8si)(__m256i)INDEX, \ + (__v8df)(__m512d)V1, (int)SCALE) + +#define _mm512_i64scatter_ps(ADDR, INDEX, V1, SCALE) \ + __builtin_ia32_scatterdiv16sf ((float *)ADDR, (__mmask8)0xFF, \ + (__v8di)(__m512i)INDEX, \ + (__v8sf)(__m256)V1, (int)SCALE) + +#define _mm512_mask_i64scatter_ps(ADDR, MASK, INDEX, V1, SCALE) \ + __builtin_ia32_scatterdiv16sf ((float *)ADDR, (__mmask16)MASK, \ + (__v8di)(__m512i)INDEX, \ + (__v8sf)(__m256)V1, (int)SCALE) + +#define _mm512_i64scatter_pd(ADDR, INDEX, V1, SCALE) \ + __builtin_ia32_scatterdiv8df ((double *)ADDR, (__mmask8)0xFF, \ + (__v8di)(__m512i)INDEX, \ + (__v8df)(__m512d)V1, (int)SCALE) + +#define _mm512_mask_i64scatter_pd(ADDR, MASK, INDEX, V1, SCALE) \ + __builtin_ia32_scatterdiv8df ((double *)ADDR, (__mmask8)MASK, \ + (__v8di)(__m512i)INDEX, \ + (__v8df)(__m512d)V1, (int)SCALE) + +#define _mm512_i32scatter_epi32(ADDR, INDEX, V1, SCALE) \ + __builtin_ia32_scattersiv16si ((int *)ADDR, (__mmask16)0xFFFF, \ + (__v16si)(__m512i)INDEX, \ + (__v16si)(__m512i)V1, (int)SCALE) + +#define _mm512_mask_i32scatter_epi32(ADDR, MASK, INDEX, V1, SCALE) \ + __builtin_ia32_scattersiv16si ((int *)ADDR, (__mmask16)MASK, \ + (__v16si)(__m512i)INDEX, \ + (__v16si)(__m512i)V1, (int)SCALE) + +#define _mm512_i32scatter_epi64(ADDR, INDEX, V1, SCALE) \ + __builtin_ia32_scattersiv8di ((long long *)ADDR, (__mmask8)0xFF, \ + (__v8si)(__m256i)INDEX, \ + (__v8di)(__m512i)V1, (int)SCALE) + +#define _mm512_mask_i32scatter_epi64(ADDR, MASK, INDEX, V1, SCALE) \ + __builtin_ia32_scattersiv8di ((long long *)ADDR, (__mmask8)MASK, \ + (__v8si)(__m256i)INDEX, \ + (__v8di)(__m512i)V1, (int)SCALE) + +#define _mm512_i64scatter_epi32(ADDR, INDEX, V1, SCALE) \ + __builtin_ia32_scatterdiv16si ((int *)ADDR, (__mmask8)0xFF, \ + (__v8di)(__m512i)INDEX, \ + (__v8si)(__m256i)V1, (int)SCALE) + +#define _mm512_mask_i64scatter_epi32(ADDR, MASK, INDEX, V1, SCALE) \ + __builtin_ia32_scatterdiv16si ((int *)ADDR, (__mmask8)MASK, \ + (__v8di)(__m512i)INDEX, \ + (__v8si)(__m256i)V1, (int)SCALE) + +#define _mm512_i64scatter_epi64(ADDR, INDEX, V1, SCALE) \ + __builtin_ia32_scatterdiv8di ((long long *)ADDR, (__mmask8)0xFF, \ + (__v8di)(__m512i)INDEX, \ + (__v8di)(__m512i)V1, (int)SCALE) + +#define _mm512_mask_i64scatter_epi64(ADDR, MASK, 
INDEX, V1, SCALE) \ + __builtin_ia32_scatterdiv8di ((long long *)ADDR, (__mmask8)MASK, \ + (__v8di)(__m512i)INDEX, \ + (__v8di)(__m512i)V1, (int)SCALE) +#endif + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_compress_pd (__m512d __W, __mmask8 __U, __m512d __A) +{ + return (__m512d) __builtin_ia32_compressdf512_mask ((__v8df) __A, + (__v8df) __W, + (__mmask8) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_compress_pd (__mmask8 __U, __m512d __A) +{ + return (__m512d) __builtin_ia32_compressdf512_mask ((__v8df) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_compressstoreu_pd (void *__P, __mmask8 __U, __m512d __A) +{ + __builtin_ia32_compressstoredf512_mask ((__v8df *) __P, (__v8df) __A, + (__mmask8) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_compress_ps (__m512 __W, __mmask16 __U, __m512 __A) +{ + return (__m512) __builtin_ia32_compresssf512_mask ((__v16sf) __A, + (__v16sf) __W, + (__mmask16) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_compress_ps (__mmask16 __U, __m512 __A) +{ + return (__m512) __builtin_ia32_compresssf512_mask ((__v16sf) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_compressstoreu_ps (void *__P, __mmask16 __U, __m512 __A) +{ + __builtin_ia32_compressstoresf512_mask ((__v16sf *) __P, (__v16sf) __A, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_compress_epi64 (__m512i __W, __mmask8 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_compressdi512_mask ((__v8di) __A, + (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_compress_epi64 (__mmask8 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_compressdi512_mask ((__v8di) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_compressstoreu_epi64 (void *__P, __mmask8 __U, __m512i __A) +{ + __builtin_ia32_compressstoredi512_mask ((__v8di *) __P, (__v8di) __A, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_compress_epi32 (__m512i __W, __mmask16 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_compresssi512_mask ((__v16si) __A, + (__v16si) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_compress_epi32 (__mmask16 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_compresssi512_mask ((__v16si) __A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_compressstoreu_epi32 (void *__P, __mmask16 __U, __m512i __A) +{ + __builtin_ia32_compressstoresi512_mask ((__v16si *) __P, (__v16si) __A, + (__mmask16) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_expand_pd 
(__m512d __W, __mmask8 __U, __m512d __A) +{ + return (__m512d) __builtin_ia32_expanddf512_mask ((__v8df) __A, + (__v8df) __W, + (__mmask8) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_expand_pd (__mmask8 __U, __m512d __A) +{ + return (__m512d) __builtin_ia32_expanddf512_maskz ((__v8df) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_expandloadu_pd (__m512d __W, __mmask8 __U, void const *__P) +{ + return (__m512d) __builtin_ia32_expandloaddf512_mask ((const __v8df *) __P, + (__v8df) __W, + (__mmask8) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_expandloadu_pd (__mmask8 __U, void const *__P) +{ + return (__m512d) __builtin_ia32_expandloaddf512_maskz ((const __v8df *) __P, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_expand_ps (__m512 __W, __mmask16 __U, __m512 __A) +{ + return (__m512) __builtin_ia32_expandsf512_mask ((__v16sf) __A, + (__v16sf) __W, + (__mmask16) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_expand_ps (__mmask16 __U, __m512 __A) +{ + return (__m512) __builtin_ia32_expandsf512_maskz ((__v16sf) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_expandloadu_ps (__m512 __W, __mmask16 __U, void const *__P) +{ + return (__m512) __builtin_ia32_expandloadsf512_mask ((const __v16sf *) __P, + (__v16sf) __W, + (__mmask16) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_expandloadu_ps (__mmask16 __U, void const *__P) +{ + return (__m512) __builtin_ia32_expandloadsf512_maskz ((const __v16sf *) __P, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_expand_epi64 (__m512i __W, __mmask8 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_expanddi512_mask ((__v8di) __A, + (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_expand_epi64 (__mmask8 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_expanddi512_maskz ((__v8di) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_expandloadu_epi64 (__m512i __W, __mmask8 __U, void const *__P) +{ + return (__m512i) __builtin_ia32_expandloaddi512_mask ((const __v8di *) __P, + (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_expandloadu_epi64 (__mmask8 __U, void const *__P) +{ + return (__m512i) + __builtin_ia32_expandloaddi512_maskz ((const __v8di *) __P, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_expand_epi32 (__m512i __W, __mmask16 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_expandsi512_mask ((__v16si) __A, + (__v16si) __W, + (__mmask16) __U); 
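The compress and expand families above are inverses of each other: a masked compress (or compress-store) packs the active lanes contiguously, while a masked expand (or expand-load) reads contiguous elements and places them back into the lanes selected by the mask. A sketch of that round trip, my code under the assumptions of -mavx512f, a buf of at least eight doubles, and _mm512_cmp_pd_mask being defined elsewhere in this header:

#include <immintrin.h>

__m512d
pack_and_restore (__m512d x, double *buf)
{
  /* Select the non-negative lanes of x.  */
  __mmask8 nonneg = _mm512_cmp_pd_mask (x, _mm512_setzero_pd (), _CMP_GE_OQ);

  /* Store just those lanes, contiguously, starting at buf.  */
  _mm512_mask_compressstoreu_pd (buf, nonneg, x);

  /* Read them back into their original lane positions, zeroing the rest.  */
  return _mm512_maskz_expandloadu_pd (nonneg, buf);
}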
+} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_expand_epi32 (__mmask16 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_expandsi512_maskz ((__v16si) __A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_expandloadu_epi32 (__m512i __W, __mmask16 __U, void const *__P) +{ + return (__m512i) __builtin_ia32_expandloadsi512_mask ((const __v16si *) __P, + (__v16si) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_expandloadu_epi32 (__mmask16 __U, void const *__P) +{ + return (__m512i) __builtin_ia32_expandloadsi512_maskz ((const __v16si *) __P, + (__v16si) + _mm512_setzero_si512 + (), (__mmask16) __U); +} + +/* Mask arithmetic operations */ +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_kand (__mmask16 __A, __mmask16 __B) +{ + return (__mmask16) __builtin_ia32_kandhi ((__mmask16) __A, (__mmask16) __B); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_kandn (__mmask16 __A, __mmask16 __B) +{ + return (__mmask16) __builtin_ia32_kandnhi ((__mmask16) __A, (__mmask16) __B); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_kor (__mmask16 __A, __mmask16 __B) +{ + return (__mmask16) __builtin_ia32_korhi ((__mmask16) __A, (__mmask16) __B); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_kortestz (__mmask16 __A, __mmask16 __B) +{ + return (__mmask16) __builtin_ia32_kortestzhi ((__mmask16) __A, + (__mmask16) __B); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_kortestc (__mmask16 __A, __mmask16 __B) +{ + return (__mmask16) __builtin_ia32_kortestchi ((__mmask16) __A, + (__mmask16) __B); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_kxnor (__mmask16 __A, __mmask16 __B) +{ + return (__mmask16) __builtin_ia32_kxnorhi ((__mmask16) __A, (__mmask16) __B); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_kxor (__mmask16 __A, __mmask16 __B) +{ + return (__mmask16) __builtin_ia32_kxorhi ((__mmask16) __A, (__mmask16) __B); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_knot (__mmask16 __A) +{ + return (__mmask16) __builtin_ia32_knothi ((__mmask16) __A); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_kunpackb (__mmask16 __A, __mmask16 __B) +{ + return (__mmask16) __builtin_ia32_kunpckhi ((__mmask16) __A, (__mmask16) __B); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_inserti32x4 (__mmask16 __B, __m512i __C, __m128i __D, + const int __imm) +{ + return (__m512i) __builtin_ia32_inserti32x4_mask ((__v16si) __C, + (__v4si) __D, + __imm, + (__v16si) + _mm512_setzero_si512 (), + __B); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_insertf32x4 (__mmask16 __B, __m512 __C, __m128 __D, + const int __imm) +{ + return (__m512) __builtin_ia32_insertf32x4_mask ((__v16sf) __C, + 
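The mask arithmetic block above wraps the 16-bit k-register logic instructions. A brief sketch of how comparison masks combine with them; this is my code, assumes -mavx512f, and relies on _mm512_cmp_ps_mask from elsewhere in this header:

#include <immintrin.h>

/* Returns nonzero when no lane of v lies inside [lo, hi]:
   _mm512_kortestz (a, b) is nonzero exactly when (a | b) == 0.  */
int
all_outside_band (__m512 v, __m512 lo, __m512 hi)
{
  __mmask16 below = _mm512_cmp_ps_mask (v, lo, _CMP_LT_OQ);
  __mmask16 above = _mm512_cmp_ps_mask (v, hi, _CMP_GT_OQ);
  __mmask16 inside = _mm512_knot (_mm512_kor (below, above));
  return _mm512_kortestz (inside, inside);
}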
(__v4sf) __D, + __imm, + (__v16sf) + _mm512_setzero_ps (), __B); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_inserti32x4 (__m512i __A, __mmask16 __B, __m512i __C, + __m128i __D, const int __imm) +{ + return (__m512i) __builtin_ia32_inserti32x4_mask ((__v16si) __C, + (__v4si) __D, + __imm, + (__v16si) __A, + __B); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_insertf32x4 (__m512 __A, __mmask16 __B, __m512 __C, + __m128 __D, const int __imm) +{ + return (__m512) __builtin_ia32_insertf32x4_mask ((__v16sf) __C, + (__v4sf) __D, + __imm, + (__v16sf) __A, __B); +} +#else +#define _mm512_maskz_insertf32x4(A, X, Y, C) \ + ((__m512) __builtin_ia32_insertf32x4_mask ((__v16sf)(__m512) (X), \ + (__v4sf)(__m128) (Y), (int) (C), (__v16sf)_mm512_setzero_ps(), \ + (__mmask8)(A))) + +#define _mm512_maskz_inserti32x4(A, X, Y, C) \ + ((__m512i) __builtin_ia32_inserti32x4_mask ((__v16si)(__m512i) (X), \ + (__v4si)(__m128i) (Y), (int) (C), (__v16si)_mm512_setzero_si512 (), \ + (__mmask8)(A))) + +#define _mm512_mask_insertf32x4(A, B, X, Y, C) \ + ((__m512) __builtin_ia32_insertf32x4_mask ((__v16sf)(__m512) (X), \ + (__v4sf)(__m128) (Y), (int) (C), (__v16sf)(__m512) (A), \ + (__mmask8)(B))) + +#define _mm512_mask_inserti32x4(A, B, X, Y, C) \ + ((__m512i) __builtin_ia32_inserti32x4_mask ((__v16si)(__m512i) (X), \ + (__v4si)(__m128i) (Y), (int) (C), (__v16si)(__m512i) (A), \ + (__mmask8)(B))) +#endif + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_max_epi64 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmaxsq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_max_epi64 (__mmask8 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmaxsq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_setzero_si512 (), + __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_max_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmaxsq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) __W, __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_min_epi64 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pminsq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_min_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pminsq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) __W, __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_min_epi64 (__mmask8 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pminsq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_setzero_si512 (), + __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_max_epu64 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmaxuq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) -1); +} + +extern __inline __m512i 
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_max_epu64 (__mmask8 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmaxuq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_setzero_si512 (), + __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_max_epu64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmaxuq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) __W, __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_min_epu64 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pminuq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_min_epu64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pminuq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) __W, __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_min_epu64 (__mmask8 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pminuq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_setzero_si512 (), + __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_max_epi32 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmaxsd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_max_epi32 (__mmask16 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmaxsd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_max_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmaxsd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) __W, __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_min_epi32 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pminsd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_min_epi32 (__mmask16 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pminsd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_min_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pminsd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) __W, __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_max_epu32 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmaxud512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_max_epu32 
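As elsewhere in the header, each integer max/min comes in a plain form, a _mask_ form that writes through the first operand in inactive lanes, and a _maskz_ form that zeroes them. A usage sketch that clamps only selected lanes (my code; assumes -mavx512f):

#include <immintrin.h>

/* Clamp v into [lo, hi] in the lanes selected by m; the remaining
   lanes pass v through unchanged.  */
__m512i
clamp_selected_epi32 (__m512i v, __m512i lo, __m512i hi, __mmask16 m)
{
  __m512i t = _mm512_mask_max_epi32 (v, m, v, lo);
  return _mm512_mask_min_epi32 (v, m, t, hi);
}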
(__mmask16 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmaxud512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_max_epu32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmaxud512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) __W, __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_min_epu32 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pminud512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_min_epu32 (__mmask16 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pminud512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_min_epu32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pminud512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) __W, __M); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_unpacklo_ps (__m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_unpcklps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) -1); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_unpacklo_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_unpcklps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __W, + (__mmask16) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_unpacklo_ps (__mmask16 __U, __m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_unpcklps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_max_round_sd (__m128d __A, __m128d __B, const int __R) +{ + return (__m128d) __builtin_ia32_maxsd_round ((__v2df) __A, + (__v2df) __B, + __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_max_round_ss (__m128 __A, __m128 __B, const int __R) +{ + return (__m128) __builtin_ia32_maxss_round ((__v4sf) __A, + (__v4sf) __B, + __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_round_sd (__m128d __A, __m128d __B, const int __R) +{ + return (__m128d) __builtin_ia32_minsd_round ((__v2df) __A, + (__v2df) __B, + __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_round_ss (__m128 __A, __m128 __B, const int __R) +{ + return (__m128) __builtin_ia32_minss_round ((__v4sf) __A, + (__v4sf) __B, + __R); +} + +#else +#define _mm_max_round_sd(A, B, C) \ + (__m128d)__builtin_ia32_addsd_round(A, B, C) + +#define _mm_max_round_ss(A, B, C) \ + (__m128)__builtin_ia32_addss_round(A, B, C) + +#define _mm_min_round_sd(A, B, C) \ + (__m128d)__builtin_ia32_subsd_round(A, B, C) + +#define _mm_min_round_ss(A, B, C) \ + 
(__m128)__builtin_ia32_subss_round(A, B, C) +#endif + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_blend_pd (__mmask8 __U, __m512d __A, __m512d __W) +{ + return (__m512d) __builtin_ia32_blendmpd_512_mask ((__v8df) __A, + (__v8df) __W, + (__mmask8) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_blend_ps (__mmask16 __U, __m512 __A, __m512 __W) +{ + return (__m512) __builtin_ia32_blendmps_512_mask ((__v16sf) __A, + (__v16sf) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_blend_epi64 (__mmask8 __U, __m512i __A, __m512i __W) +{ + return (__m512i) __builtin_ia32_blendmq_512_mask ((__v8di) __A, + (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_blend_epi32 (__mmask16 __U, __m512i __A, __m512i __W) +{ + return (__m512i) __builtin_ia32_blendmd_512_mask ((__v16si) __A, + (__v16si) __W, + (__mmask16) __U); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fmadd_round_sd (__m128d __W, __m128d __A, __m128d __B, const int __R) +{ + return (__m128d) __builtin_ia32_vfmaddsd3_round ((__v2df) __W, + (__v2df) __A, + (__v2df) __B, + __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fmadd_round_ss (__m128 __W, __m128 __A, __m128 __B, const int __R) +{ + return (__m128) __builtin_ia32_vfmaddss3_round ((__v4sf) __W, + (__v4sf) __A, + (__v4sf) __B, + __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fmsub_round_sd (__m128d __W, __m128d __A, __m128d __B, const int __R) +{ + return (__m128d) __builtin_ia32_vfmaddsd3_round ((__v2df) __W, + (__v2df) __A, + -(__v2df) __B, + __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fmsub_round_ss (__m128 __W, __m128 __A, __m128 __B, const int __R) +{ + return (__m128) __builtin_ia32_vfmaddss3_round ((__v4sf) __W, + (__v4sf) __A, + -(__v4sf) __B, + __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fnmadd_round_sd (__m128d __W, __m128d __A, __m128d __B, const int __R) +{ + return (__m128d) __builtin_ia32_vfmaddsd3_round ((__v2df) __W, + -(__v2df) __A, + (__v2df) __B, + __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fnmadd_round_ss (__m128 __W, __m128 __A, __m128 __B, const int __R) +{ + return (__m128) __builtin_ia32_vfmaddss3_round ((__v4sf) __W, + -(__v4sf) __A, + (__v4sf) __B, + __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fnmsub_round_sd (__m128d __W, __m128d __A, __m128d __B, const int __R) +{ + return (__m128d) __builtin_ia32_vfmaddsd3_round ((__v2df) __W, + -(__v2df) __A, + -(__v2df) __B, + __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fnmsub_round_ss (__m128 __W, __m128 __A, __m128 __B, const int __R) +{ + return (__m128) __builtin_ia32_vfmaddss3_round ((__v4sf) __W, + -(__v4sf) __A, + -(__v4sf) __B, + __R); +} +#else +#define _mm_fmadd_round_sd(A, B, C, R) \ + (__m128d)__builtin_ia32_vfmaddsd3_round(A, B, C, R) + +#define _mm_fmadd_round_ss(A, B, C, R) \ + 
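_mm512_mask_blend_pd and its ps/epi32/epi64 siblings defined above select per lane between their two vector operands, with a set mask bit choosing the second one. A minimal sketch (my code; assumes -mavx512f):

#include <immintrin.h>

/* Lane i of the result is b[i] where bit i of take_b is set, else a[i].  */
__m512d
select_pd (__mmask8 take_b, __m512d a, __m512d b)
{
  return _mm512_mask_blend_pd (take_b, a, b);
}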
(__m128)__builtin_ia32_vfmaddss3_round(A, B, C, R) + +#define _mm_fmsub_round_sd(A, B, C, R) \ + (__m128d)__builtin_ia32_vfmaddsd3_round(A, B, -(C), R) + +#define _mm_fmsub_round_ss(A, B, C, R) \ + (__m128)__builtin_ia32_vfmaddss3_round(A, B, -(C), R) + +#define _mm_fnmadd_round_sd(A, B, C, R) \ + (__m128d)__builtin_ia32_vfmaddsd3_round(A, -(B), C, R) + +#define _mm_fnmadd_round_ss(A, B, C, R) \ + (__m128)__builtin_ia32_vfmaddss3_round(A, -(B), C, R) + +#define _mm_fnmsub_round_sd(A, B, C, R) \ + (__m128d)__builtin_ia32_vfmaddsd3_round(A, -(B), -(C), R) + +#define _mm_fnmsub_round_ss(A, B, C, R) \ + (__m128)__builtin_ia32_vfmaddss3_round(A, -(B), -(C), R) +#endif + +#ifdef __OPTIMIZE__ +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comi_round_ss (__m128 __A, __m128 __B, const int __P, const int __R) +{ + return __builtin_ia32_vcomiss ((__v4sf) __A, (__v4sf) __B, __P, __R); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comi_round_sd (__m128d __A, __m128d __B, const int __P, const int __R) +{ + return __builtin_ia32_vcomisd ((__v2df) __A, (__v2df) __B, __P, __R); +} +#else +#define _mm_comi_round_ss(A, B, C, D)\ +__builtin_ia32_vcomiss(A, B, C, D) +#define _mm_comi_round_sd(A, B, C, D)\ +__builtin_ia32_vcomisd(A, B, C, D) +#endif + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sqrt_pd (__m512d __A) +{ + return (__m512d) __builtin_ia32_sqrtpd512_mask ((__v8df) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_sqrt_pd (__m512d __W, __mmask8 __U, __m512d __A) +{ + return (__m512d) __builtin_ia32_sqrtpd512_mask ((__v8df) __A, + (__v8df) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_sqrt_pd (__mmask8 __U, __m512d __A) +{ + return (__m512d) __builtin_ia32_sqrtpd512_mask ((__v8df) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sqrt_ps (__m512 __A) +{ + return (__m512) __builtin_ia32_sqrtps512_mask ((__v16sf) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_sqrt_ps (__m512 __W, __mmask16 __U, __m512 __A) +{ + return (__m512) __builtin_ia32_sqrtps512_mask ((__v16sf) __A, + (__v16sf) __W, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_sqrt_ps (__mmask16 __U, __m512 __A) +{ + return (__m512) __builtin_ia32_sqrtps512_mask ((__v16sf) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_add_pd (__m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_add_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d 
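Most packed operations from here on take an explicit rounding argument and are instantiated with _MM_FROUND_CUR_DIRECTION, i.e. the current MXCSR rounding mode. A sketch using the masked square root above to avoid NaNs from negative inputs; this is my code, assumes -mavx512f, and uses _mm512_cmp_pd_mask from elsewhere in this header:

#include <immintrin.h>

/* Square root of the non-negative lanes of x; other lanes become 0.0
   and, being masked off, do not take part in the computation.  */
__m512d
safe_sqrt_pd (__m512d x)
{
  __mmask8 ok = _mm512_cmp_pd_mask (x, _mm512_setzero_pd (), _CMP_GE_OQ);
  return _mm512_maskz_sqrt_pd (ok, x);
}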
__B) +{ + return (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_add_pd (__mmask8 __U, __m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_add_ps (__m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_add_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __W, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_add_ps (__mmask16 __U, __m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sub_pd (__m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_sub_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_sub_pd (__mmask8 __U, __m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sub_ps (__m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_sub_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __W, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_sub_ps (__mmask16 __U, __m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mul_pd (__m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + 
_mm512_setzero_pd (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_mul_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_mul_pd (__mmask8 __U, __m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mul_ps (__m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_mul_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __W, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_mul_ps (__mmask16 __U, __m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_div_pd (__m512d __M, __m512d __V) +{ + return (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __M, + (__v8df) __V, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_div_pd (__m512d __W, __mmask8 __U, __m512d __M, __m512d __V) +{ + return (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __M, + (__v8df) __V, + (__v8df) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_div_pd (__mmask8 __U, __m512d __M, __m512d __V) +{ + return (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __M, + (__v8df) __V, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_div_ps (__m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_div_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __W, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_div_ps (__mmask16 __U, __m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); 
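The masked forms of the packed arithmetic above make per-lane guards cheap; for instance, a division that leaves lanes with a zero divisor untouched (my code; assumes -mavx512f and _mm512_cmp_ps_mask from elsewhere in this header):

#include <immintrin.h>

/* num / den where den is nonzero; lanes with a zero divisor keep num.  */
__m512
div_nonzero_ps (__m512 num, __m512 den)
{
  __mmask16 nz = _mm512_cmp_ps_mask (den, _mm512_setzero_ps (), _CMP_NEQ_OQ);
  return _mm512_mask_div_ps (num, nz, num, den);
}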
+} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_max_pd (__m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_maxpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_max_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_maxpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_max_pd (__mmask8 __U, __m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_maxpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_max_ps (__m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_maxps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_max_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_maxps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __W, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_max_ps (__mmask16 __U, __m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_maxps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_min_pd (__m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_minpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_min_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_minpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_min_pd (__mmask8 __U, __m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_minpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_min_ps (__m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_minps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_min_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_minps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __W, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, 
__artificial__)) +_mm512_maskz_min_ps (__mmask16 __U, __m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_minps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_scalef_pd (__m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_scalef_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_scalef_pd (__mmask8 __U, __m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_scalef_ps (__m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_scalef_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __W, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_scalef_ps (__mmask16 __U, __m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_scalef_sd (__m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_scalefsd_round ((__v2df) __A, + (__v2df) __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_scalef_ss (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_scalefss_round ((__v4sf) __A, + (__v4sf) __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fmadd_pd (__m512d __A, __m512d __B, __m512d __C) +{ + return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fmadd_pd (__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) +{ + return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fmadd_pd (__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) +{ + return (__m512d) __builtin_ia32_vfmaddpd512_mask3 ((__v8df) __A, + 
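_mm512_scalef_pd and _mm512_scalef_ps above wrap VSCALEF*, which computes, per lane, the first operand times two raised to the floor of the second, so with integral exponents it behaves like a vectorized ldexp. A tiny sketch (my code; assumes -mavx512f):

#include <immintrin.h>

/* For integral exps[i] this matches ldexp (a[i], (int) exps[i]).  */
__m512d
ldexp_pd (__m512d a, __m512d exps)
{
  return _mm512_scalef_pd (a, exps);
}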
(__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fmadd_pd (__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) +{ + return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fmadd_ps (__m512 __A, __m512 __B, __m512 __C) +{ + return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fmadd_ps (__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) +{ + return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fmadd_ps (__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) +{ + return (__m512) __builtin_ia32_vfmaddps512_mask3 ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fmadd_ps (__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) +{ + return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fmsub_pd (__m512d __A, __m512d __B, __m512d __C) +{ + return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, + (__v8df) __B, + -(__v8df) __C, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fmsub_pd (__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) +{ + return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, + (__v8df) __B, + -(__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fmsub_pd (__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) +{ + return (__m512d) __builtin_ia32_vfmsubpd512_mask3 ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fmsub_pd (__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) +{ + return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A, + (__v8df) __B, + -(__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fmsub_ps (__m512 __A, __m512 __B, __m512 __C) +{ + return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, + (__v16sf) __B, + -(__v16sf) __C, + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fmsub_ps (__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) +{ + return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, + (__v16sf) __B, + -(__v16sf) __C, + (__mmask16) __U, 
+ _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fmsub_ps (__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) +{ + return (__m512) __builtin_ia32_vfmsubps512_mask3 ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fmsub_ps (__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) +{ + return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A, + (__v16sf) __B, + -(__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fmaddsub_pd (__m512d __A, __m512d __B, __m512d __C) +{ + return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fmaddsub_pd (__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) +{ + return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fmaddsub_pd (__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) +{ + return (__m512d) __builtin_ia32_vfmaddsubpd512_mask3 ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fmaddsub_pd (__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) +{ + return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fmaddsub_ps (__m512 __A, __m512 __B, __m512 __C) +{ + return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fmaddsub_ps (__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) +{ + return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fmaddsub_ps (__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) +{ + return (__m512) __builtin_ia32_vfmaddsubps512_mask3 ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fmaddsub_ps (__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) +{ + return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fmsubadd_pd (__m512d __A, __m512d __B, __m512d __C) +{ + return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A, + (__v8df) __B, + -(__v8df) __C, + 
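The FMA intrinsics above come in three masking flavours: _mask_ keeps the first operand in inactive lanes, _mask3_ keeps the third, and _maskz_ zeroes them; the fmaddsub/fmsubadd variants additionally alternate the sign of the third operand between even and odd lanes. A side-by-side sketch (my code; assumes -mavx512f):

#include <immintrin.h>

void
fmadd_masking_demo (__m512d a, __m512d b, __m512d c, __mmask8 m,
                    __m512d *r1, __m512d *r2, __m512d *r3)
{
  *r1 = _mm512_mask_fmadd_pd (a, m, b, c);   /* inactive lanes keep a  */
  *r2 = _mm512_mask3_fmadd_pd (a, b, c, m);  /* inactive lanes keep c  */
  *r3 = _mm512_maskz_fmadd_pd (m, a, b, c);  /* inactive lanes are 0.0 */
}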
(__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fmsubadd_pd (__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) +{ + return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A, + (__v8df) __B, + -(__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fmsubadd_pd (__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) +{ + return (__m512d) __builtin_ia32_vfmsubaddpd512_mask3 ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fmsubadd_pd (__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) +{ + return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A, + (__v8df) __B, + -(__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fmsubadd_ps (__m512 __A, __m512 __B, __m512 __C) +{ + return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A, + (__v16sf) __B, + -(__v16sf) __C, + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fmsubadd_ps (__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) +{ + return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A, + (__v16sf) __B, + -(__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fmsubadd_ps (__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) +{ + return (__m512) __builtin_ia32_vfmsubaddps512_mask3 ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fmsubadd_ps (__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) +{ + return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A, + (__v16sf) __B, + -(__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fnmadd_pd (__m512d __A, __m512d __B, __m512d __C) +{ + return (__m512d) __builtin_ia32_vfmaddpd512_mask (-(__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fnmadd_pd (__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) +{ + return (__m512d) __builtin_ia32_vfnmaddpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fnmadd_pd (__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) +{ + return (__m512d) __builtin_ia32_vfmaddpd512_mask3 (-(__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fnmadd_pd (__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) +{ + return (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A, + (__v8df) 
__B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fnmadd_ps (__m512 __A, __m512 __B, __m512 __C) +{ + return (__m512) __builtin_ia32_vfmaddps512_mask (-(__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fnmadd_ps (__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) +{ + return (__m512) __builtin_ia32_vfnmaddps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fnmadd_ps (__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) +{ + return (__m512) __builtin_ia32_vfmaddps512_mask3 (-(__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fnmadd_ps (__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) +{ + return (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fnmsub_pd (__m512d __A, __m512d __B, __m512d __C) +{ + return (__m512d) __builtin_ia32_vfmaddpd512_mask (-(__v8df) __A, + (__v8df) __B, + -(__v8df) __C, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fnmsub_pd (__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) +{ + return (__m512d) __builtin_ia32_vfnmsubpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fnmsub_pd (__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) +{ + return (__m512d) __builtin_ia32_vfnmsubpd512_mask3 ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fnmsub_pd (__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) +{ + return (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A, + (__v8df) __B, + -(__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fnmsub_ps (__m512 __A, __m512 __B, __m512 __C) +{ + return (__m512) __builtin_ia32_vfmaddps512_mask (-(__v16sf) __A, + (__v16sf) __B, + -(__v16sf) __C, + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fnmsub_ps (__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) +{ + return (__m512) __builtin_ia32_vfnmsubps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fnmsub_ps (__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) +{ + return (__m512) __builtin_ia32_vfnmsubps512_mask3 ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + 
(__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fnmsub_ps (__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) +{ + return (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A, + (__v16sf) __B, + -(__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvttpd_epi32 (__m512d __A) +{ + return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvttpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A) +{ + return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A, + (__v8si) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvttpd_epi32 (__mmask8 __U, __m512d __A) +{ + return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvttpd_epu32 (__m512d __A) +{ + return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvttpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A) +{ + return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A, + (__v8si) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvttpd_epu32 (__mmask8 __U, __m512d __A) +{ + return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtpd_epi32 (__m512d __A) +{ + return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A) +{ + return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A, + (__v8si) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtpd_epi32 (__mmask8 __U, __m512d __A) +{ + return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtpd_epu32 (__m512d __A) +{ + return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A) +{ + return (__m256i) 
__builtin_ia32_cvtpd2udq512_mask ((__v8df) __A, + (__v8si) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtpd_epu32 (__mmask8 __U, __m512d __A) +{ + return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvttps_epi32 (__m512 __A) +{ + return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvttps_epi32 (__m512i __W, __mmask16 __U, __m512 __A) +{ + return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A, + (__v16si) __W, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvttps_epi32 (__mmask16 __U, __m512 __A) +{ + return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvttps_epu32 (__m512 __A) +{ + return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvttps_epu32 (__m512i __W, __mmask16 __U, __m512 __A) +{ + return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A, + (__v16si) __W, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvttps_epu32 (__mmask16 __U, __m512 __A) +{ + return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtps_epi32 (__m512 __A) +{ + return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtps_epi32 (__m512i __W, __mmask16 __U, __m512 __A) +{ + return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A, + (__v16si) __W, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtps_epi32 (__mmask16 __U, __m512 __A) +{ + return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtps_epu32 (__m512 __A) +{ + return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtps_epu32 (__m512i __W, 
__mmask16 __U, __m512 __A) +{ + return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A, + (__v16si) __W, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtps_epu32 (__mmask16 __U, __m512 __A) +{ + return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __x86_64__ +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtu64_ss (__m128 __A, unsigned long long __B) +{ + return (__m128) __builtin_ia32_cvtusi2ss64 ((__v4sf) __A, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtu64_sd (__m128d __A, unsigned long long __B) +{ + return (__m128d) __builtin_ia32_cvtusi2sd64 ((__v2df) __A, __B, + _MM_FROUND_CUR_DIRECTION); +} +#endif + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtu32_ss (__m128 __A, unsigned __B) +{ + return (__m128) __builtin_ia32_cvtusi2ss32 ((__v4sf) __A, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtepi32_ps (__m512i __A) +{ + return (__m512) __builtin_ia32_cvtdq2ps512_mask ((__v16si) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepi32_ps (__m512 __W, __mmask16 __U, __m512i __A) +{ + return (__m512) __builtin_ia32_cvtdq2ps512_mask ((__v16si) __A, + (__v16sf) __W, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtepi32_ps (__mmask16 __U, __m512i __A) +{ + return (__m512) __builtin_ia32_cvtdq2ps512_mask ((__v16si) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtepu32_ps (__m512i __A) +{ + return (__m512) __builtin_ia32_cvtudq2ps512_mask ((__v16si) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepu32_ps (__m512 __W, __mmask16 __U, __m512i __A) +{ + return (__m512) __builtin_ia32_cvtudq2ps512_mask ((__v16si) __A, + (__v16sf) __W, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtepu32_ps (__mmask16 __U, __m512i __A) +{ + return (__m512) __builtin_ia32_cvtudq2ps512_mask ((__v16si) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fixupimm_pd (__m512d __A, __m512d __B, __m512i __C, const int __imm) +{ + return (__m512d) __builtin_ia32_fixupimmpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8di) __C, + __imm, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fixupimm_pd (__m512d __A, __mmask8 __U, __m512d __B, + __m512i __C, const 
int __imm) +{ + return (__m512d) __builtin_ia32_fixupimmpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8di) __C, + __imm, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fixupimm_pd (__mmask8 __U, __m512d __A, __m512d __B, + __m512i __C, const int __imm) +{ + return (__m512d) __builtin_ia32_fixupimmpd512_maskz ((__v8df) __A, + (__v8df) __B, + (__v8di) __C, + __imm, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fixupimm_ps (__m512 __A, __m512 __B, __m512i __C, const int __imm) +{ + return (__m512) __builtin_ia32_fixupimmps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16si) __C, + __imm, + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fixupimm_ps (__m512 __A, __mmask16 __U, __m512 __B, + __m512i __C, const int __imm) +{ + return (__m512) __builtin_ia32_fixupimmps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16si) __C, + __imm, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fixupimm_ps (__mmask16 __U, __m512 __A, __m512 __B, + __m512i __C, const int __imm) +{ + return (__m512) __builtin_ia32_fixupimmps512_maskz ((__v16sf) __A, + (__v16sf) __B, + (__v16si) __C, + __imm, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fixupimm_sd (__m128d __A, __m128d __B, __m128i __C, const int __imm) +{ + return (__m128d) __builtin_ia32_fixupimmsd_mask ((__v2df) __A, + (__v2df) __B, + (__v2di) __C, __imm, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fixupimm_sd (__m128d __A, __mmask8 __U, __m128d __B, + __m128i __C, const int __imm) +{ + return (__m128d) __builtin_ia32_fixupimmsd_mask ((__v2df) __A, + (__v2df) __B, + (__v2di) __C, __imm, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fixupimm_sd (__mmask8 __U, __m128d __A, __m128d __B, + __m128i __C, const int __imm) +{ + return (__m128d) __builtin_ia32_fixupimmsd_maskz ((__v2df) __A, + (__v2df) __B, + (__v2di) __C, + __imm, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fixupimm_ss (__m128 __A, __m128 __B, __m128i __C, const int __imm) +{ + return (__m128) __builtin_ia32_fixupimmss_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4si) __C, __imm, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fixupimm_ss (__m128 __A, __mmask8 __U, __m128 __B, + __m128i __C, const int __imm) +{ + return (__m128) __builtin_ia32_fixupimmss_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4si) __C, __imm, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fixupimm_ss (__mmask8 __U, __m128 __A, __m128 __B, + __m128i __C, const int __imm) +{ + return (__m128) __builtin_ia32_fixupimmss_maskz ((__v4sf) __A, + (__v4sf) __B, + 
(__v4si) __C, __imm, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +#else +#define _mm512_fixupimm_pd(X, Y, Z, C) \ + ((__m512d)__builtin_ia32_fixupimmpd512_mask ((__v8df)(__m512d)(X), \ + (__v8df)(__m512d)(Y), (__v8di)(__m512i)(Z), (int)(C), \ + (__mmask8)(-1), _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_fixupimm_pd(X, U, Y, Z, C) \ + ((__m512d)__builtin_ia32_fixupimmpd512_mask ((__v8df)(__m512d)(X), \ + (__v8df)(__m512d)(Y), (__v8di)(__m512i)(Z), (int)(C), \ + (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_maskz_fixupimm_pd(U, X, Y, Z, C) \ + ((__m512d)__builtin_ia32_fixupimmpd512_maskz ((__v8df)(__m512d)(X), \ + (__v8df)(__m512d)(Y), (__v8di)(__m512i)(Z), (int)(C), \ + (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_fixupimm_ps(X, Y, Z, C) \ + ((__m512)__builtin_ia32_fixupimmps512_mask ((__v16sf)(__m512)(X), \ + (__v16sf)(__m512)(Y), (__v16si)(__m512i)(Z), (int)(C), \ + (__mmask16)(-1), _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_fixupimm_ps(X, U, Y, Z, C) \ + ((__m512)__builtin_ia32_fixupimmps512_mask ((__v16sf)(__m512)(X), \ + (__v16sf)(__m512)(Y), (__v16si)(__m512i)(Z), (int)(C), \ + (__mmask16)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_maskz_fixupimm_ps(U, X, Y, Z, C) \ + ((__m512)__builtin_ia32_fixupimmps512_maskz ((__v16sf)(__m512)(X), \ + (__v16sf)(__m512)(Y), (__v16si)(__m512i)(Z), (int)(C), \ + (__mmask16)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm_fixupimm_sd(X, Y, Z, C) \ + ((__m128d)__builtin_ia32_fixupimmsd_mask ((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), (int)(C), \ + (__mmask8)(-1), _MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_fixupimm_sd(X, U, Y, Z, C) \ + ((__m128d)__builtin_ia32_fixupimmsd_mask ((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), (int)(C), \ + (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm_maskz_fixupimm_sd(U, X, Y, Z, C) \ + ((__m128d)__builtin_ia32_fixupimmsd_maskz ((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), (int)(C), \ + (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm_fixupimm_ss(X, Y, Z, C) \ + ((__m128)__builtin_ia32_fixupimmss_mask ((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), (int)(C), \ + (__mmask8)(-1), _MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_fixupimm_ss(X, U, Y, Z, C) \ + ((__m128)__builtin_ia32_fixupimmss_mask ((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), (int)(C), \ + (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm_maskz_fixupimm_ss(U, X, Y, Z, C) \ + ((__m128)__builtin_ia32_fixupimmss_maskz ((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), (int)(C), \ + (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) +#endif + +#ifdef __x86_64__ +extern __inline unsigned long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtss_u64 (__m128 __A) +{ + return (unsigned long long) __builtin_ia32_vcvtss2usi64 ((__v4sf) + __A, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline unsigned long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttss_u64 (__m128 __A) +{ + return (unsigned long long) __builtin_ia32_vcvttss2usi64 ((__v4sf) + __A, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttss_i64 (__m128 __A) +{ + return (long long) __builtin_ia32_vcvttss2si64 ((__v4sf) __A, + _MM_FROUND_CUR_DIRECTION); +} +#endif /* __x86_64__ */ + +extern __inline unsigned +__attribute__ ((__gnu_inline__, 
__always_inline__, __artificial__)) +_mm_cvtss_u32 (__m128 __A) +{ + return (unsigned) __builtin_ia32_vcvtss2usi32 ((__v4sf) __A, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline unsigned +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttss_u32 (__m128 __A) +{ + return (unsigned) __builtin_ia32_vcvttss2usi32 ((__v4sf) __A, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttss_i32 (__m128 __A) +{ + return (int) __builtin_ia32_vcvttss2si32 ((__v4sf) __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __x86_64__ +extern __inline unsigned long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsd_u64 (__m128d __A) +{ + return (unsigned long long) __builtin_ia32_vcvtsd2usi64 ((__v2df) + __A, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline unsigned long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttsd_u64 (__m128d __A) +{ + return (unsigned long long) __builtin_ia32_vcvttsd2usi64 ((__v2df) + __A, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttsd_i64 (__m128d __A) +{ + return (long long) __builtin_ia32_vcvttsd2si64 ((__v2df) __A, + _MM_FROUND_CUR_DIRECTION); +} +#endif /* __x86_64__ */ + +extern __inline unsigned +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsd_u32 (__m128d __A) +{ + return (unsigned) __builtin_ia32_vcvtsd2usi32 ((__v2df) __A, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline unsigned +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttsd_u32 (__m128d __A) +{ + return (unsigned) __builtin_ia32_vcvttsd2usi32 ((__v2df) __A, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttsd_i32 (__m128d __A) +{ + return (int) __builtin_ia32_vcvttsd2si32 ((__v2df) __A, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtps_pd (__m256 __A) +{ + return (__m512d) __builtin_ia32_cvtps2pd512_mask ((__v8sf) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtps_pd (__m512d __W, __mmask8 __U, __m256 __A) +{ + return (__m512d) __builtin_ia32_cvtps2pd512_mask ((__v8sf) __A, + (__v8df) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtps_pd (__mmask8 __U, __m256 __A) +{ + return (__m512d) __builtin_ia32_cvtps2pd512_mask ((__v8sf) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtph_ps (__m256i __A) +{ + return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtph_ps (__m512 __W, __mmask16 __U, __m256i __A) +{ + return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A, + (__v16sf) __W, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, 
__always_inline__, __artificial__)) +_mm512_maskz_cvtph_ps (__mmask16 __U, __m256i __A) +{ + return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtpd_ps (__m512d __A) +{ + return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtpd_ps (__m256 __W, __mmask8 __U, __m512d __A) +{ + return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A, + (__v8sf) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtpd_ps (__mmask8 __U, __m512d __A) +{ + return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_getexp_ps (__m512 __A) +{ + return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_getexp_ps (__m512 __W, __mmask16 __U, __m512 __A) +{ + return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A, + (__v16sf) __W, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_getexp_ps (__mmask16 __U, __m512 __A) +{ + return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_getexp_pd (__m512d __A) +{ + return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_getexp_pd (__m512d __W, __mmask8 __U, __m512d __A) +{ + return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A, + (__v8df) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_getexp_pd (__mmask8 __U, __m512d __A) +{ + return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_getexp_ss (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_getexpss128_round ((__v4sf) __A, + (__v4sf) __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_getexp_sd (__m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_getexpsd128_round ((__v2df) __A, + (__v2df) __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_getmant_pd (__m512d __A, _MM_MANTISSA_NORM_ENUM 
__B, + _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m512d) __builtin_ia32_getmantpd512_mask ((__v8df) __A, + (__C << 2) | __B, + _mm512_setzero_pd (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_getmant_pd (__m512d __W, __mmask8 __U, __m512d __A, + _MM_MANTISSA_NORM_ENUM __B, _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m512d) __builtin_ia32_getmantpd512_mask ((__v8df) __A, + (__C << 2) | __B, + (__v8df) __W, __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_getmant_pd (__mmask8 __U, __m512d __A, + _MM_MANTISSA_NORM_ENUM __B, _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m512d) __builtin_ia32_getmantpd512_mask ((__v8df) __A, + (__C << 2) | __B, + (__v8df) + _mm512_setzero_pd (), + __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_getmant_ps (__m512 __A, _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m512) __builtin_ia32_getmantps512_mask ((__v16sf) __A, + (__C << 2) | __B, + _mm512_setzero_ps (), + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_getmant_ps (__m512 __W, __mmask16 __U, __m512 __A, + _MM_MANTISSA_NORM_ENUM __B, _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m512) __builtin_ia32_getmantps512_mask ((__v16sf) __A, + (__C << 2) | __B, + (__v16sf) __W, __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_getmant_ps (__mmask16 __U, __m512 __A, + _MM_MANTISSA_NORM_ENUM __B, _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m512) __builtin_ia32_getmantps512_mask ((__v16sf) __A, + (__C << 2) | __B, + (__v16sf) + _mm512_setzero_ps (), + __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_getmant_sd (__m128d __A, __m128d __B, _MM_MANTISSA_NORM_ENUM __C, + _MM_MANTISSA_SIGN_ENUM __D) +{ + return (__m128d) __builtin_ia32_getmantsd_round ((__v2df) __A, + (__v2df) __B, + (__D << 2) | __C, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_getmant_ss (__m128 __A, __m128 __B, _MM_MANTISSA_NORM_ENUM __C, + _MM_MANTISSA_SIGN_ENUM __D) +{ + return (__m128) __builtin_ia32_getmantss_round ((__v4sf) __A, + (__v4sf) __B, + (__D << 2) | __C, + _MM_FROUND_CUR_DIRECTION); +} + +#else +#define _mm512_getmant_pd(X, B, C) \ + ((__m512d)__builtin_ia32_getmantpd512_mask ((__v8df)(__m512d)(X), \ + (int)(((C)<<2) | (B)), \ + (__v8df)(__m512d)_mm512_setzero_pd(), \ + (__mmask8)-1,\ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_getmant_pd(W, U, X, B, C) \ + ((__m512d)__builtin_ia32_getmantpd512_mask ((__v8df)(__m512d)(X), \ + (int)(((C)<<2) | (B)), \ + (__v8df)(__m512d)(W), \ + (__mmask8)(U),\ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_maskz_getmant_pd(U, X, B, C) \ + ((__m512d)__builtin_ia32_getmantpd512_mask ((__v8df)(__m512d)(X), \ + (int)(((C)<<2) | (B)), \ + (__v8df)(__m512d)_mm512_setzero_pd(), \ + (__mmask8)(U),\ + _MM_FROUND_CUR_DIRECTION)) +#define _mm512_getmant_ps(X, B, C) \ + ((__m512)__builtin_ia32_getmantps512_mask ((__v16sf)(__m512)(X), \ + (int)(((C)<<2) | (B)), \ + (__v16sf)(__m512)_mm512_setzero_ps(), \ + 
(__mmask16)-1,\ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_getmant_ps(W, U, X, B, C) \ + ((__m512)__builtin_ia32_getmantps512_mask ((__v16sf)(__m512)(X), \ + (int)(((C)<<2) | (B)), \ + (__v16sf)(__m512)(W), \ + (__mmask16)(U),\ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_maskz_getmant_ps(U, X, B, C) \ + ((__m512)__builtin_ia32_getmantps512_mask ((__v16sf)(__m512)(X), \ + (int)(((C)<<2) | (B)), \ + (__v16sf)(__m512)_mm512_setzero_ps(), \ + (__mmask16)(U),\ + _MM_FROUND_CUR_DIRECTION)) +#define _mm_getmant_sd(X, Y, C, D) \ + ((__m128d)__builtin_ia32_getmantsd_round ((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), \ + (int)(((D)<<2) | (C)), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_getmant_ss(X, Y, C, D) \ + ((__m128)__builtin_ia32_getmantss_round ((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), \ + (int)(((D)<<2) | (C)), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_getexp_ss(A, B) \ + ((__m128)__builtin_ia32_getexpss128_mask((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_getexp_sd(A, B) \ + ((__m128d)__builtin_ia32_getexpsd128_mask((__v2df)(__m128d)(A), (__v2df)(__m128d)(B),\ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_getexp_ps(A) \ + ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)_mm512_setzero_ps(), (__mmask16)-1, _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_getexp_ps(W, U, A) \ + ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(W), (__mmask16)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_maskz_getexp_ps(U, A) \ + ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_getexp_pd(A) \ + ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \ + (__v8df)_mm512_setzero_pd(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_getexp_pd(W, U, A) \ + ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_maskz_getexp_pd(U, A) \ + ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \ + (__v8df)_mm512_setzero_pd(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) +#endif + +#ifdef __OPTIMIZE__ +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_roundscale_ps (__m512 __A, const int __imm) +{ + return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A, __imm, + (__v16sf) __A, -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_roundscale_ps (__m512 __A, __mmask16 __B, __m512 __C, + const int __imm) +{ + return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __C, __imm, + (__v16sf) __A, + (__mmask16) __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_roundscale_ps (__mmask16 __A, __m512 __B, const int __imm) +{ + return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __B, + __imm, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __A, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_roundscale_pd (__m512d __A, const int __imm) +{ + return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A, __imm, + (__v8df) __A, -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, 
__always_inline__, __artificial__)) +_mm512_mask_roundscale_pd (__m512d __A, __mmask8 __B, __m512d __C, + const int __imm) +{ + return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __C, __imm, + (__v8df) __A, + (__mmask8) __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_roundscale_pd (__mmask8 __A, __m512d __B, const int __imm) +{ + return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __B, + __imm, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __A, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_roundscale_ss (__m128 __A, __m128 __B, const int __imm) +{ + return (__m128) __builtin_ia32_rndscaless_round ((__v4sf) __A, + (__v4sf) __B, __imm, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_roundscale_sd (__m128d __A, __m128d __B, const int __imm) +{ + return (__m128d) __builtin_ia32_rndscalesd_round ((__v2df) __A, + (__v2df) __B, __imm, + _MM_FROUND_CUR_DIRECTION); +} + +#else +#define _mm512_roundscale_ps(A, B) \ + ((__m512) __builtin_ia32_rndscaleps_mask ((__v16sf)(__m512)(A), (int)(B),\ + (__v16sf)(__m512)(A), (__mmask16)(-1), _MM_FROUND_CUR_DIRECTION)) +#define _mm512_mask_roundscale_ps(A, B, C, D) \ + ((__m512) __builtin_ia32_rndscaleps_mask ((__v16sf)(__m512)(C), \ + (int)(D), \ + (__v16sf)(__m512)(A), \ + (__mmask16)(B), _MM_FROUND_CUR_DIRECTION)) +#define _mm512_maskz_roundscale_ps(A, B, C) \ + ((__m512) __builtin_ia32_rndscaleps_mask ((__v16sf)(__m512)(B), \ + (int)(C), \ + (__v16sf)_mm512_setzero_ps(),\ + (__mmask16)(A), _MM_FROUND_CUR_DIRECTION)) +#define _mm512_roundscale_pd(A, B) \ + ((__m512d) __builtin_ia32_rndscalepd_mask ((__v8df)(__m512d)(A), (int)(B),\ + (__v8df)(__m512d)(A), (__mmask8)(-1), _MM_FROUND_CUR_DIRECTION)) +#define _mm512_mask_roundscale_pd(A, B, C, D) \ + ((__m512d) __builtin_ia32_rndscalepd_mask ((__v8df)(__m512d)(C), \ + (int)(D), \ + (__v8df)(__m512d)(A), \ + (__mmask8)(B), _MM_FROUND_CUR_DIRECTION)) +#define _mm512_maskz_roundscale_pd(A, B, C) \ + ((__m512d) __builtin_ia32_rndscalepd_mask ((__v8df)(__m512d)(B), \ + (int)(C), \ + (__v8df)_mm512_setzero_pd(),\ + (__mmask8)(A), _MM_FROUND_CUR_DIRECTION)) +#define _mm_roundscale_ss(A, B, C) \ + ((__m128) __builtin_ia32_rndscaless_round ((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), (int)(C), _MM_FROUND_CUR_DIRECTION)) +#define _mm_roundscale_sd(A, B, C) \ + ((__m128d) __builtin_ia32_rndscalesd_round ((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), (int)(C), _MM_FROUND_CUR_DIRECTION)) +#endif + +#ifdef __OPTIMIZE__ +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmp_pd_mask (__m512d __X, __m512d __Y, const int __P) +{ + return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X, + (__v8df) __Y, __P, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmp_ps_mask (__m512 __X, __m512 __Y, const int __P) +{ + return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X, + (__v16sf) __Y, __P, + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmp_ps_mask (__mmask16 __U, __m512 __X, __m512 __Y, const int __P) +{ + return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X, 
+ (__v16sf) __Y, __P, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmp_pd_mask (__mmask8 __U, __m512d __X, __m512d __Y, const int __P) +{ + return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X, + (__v8df) __Y, __P, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmp_sd_mask (__m128d __X, __m128d __Y, const int __P) +{ + return (__mmask8) __builtin_ia32_cmpsd_mask ((__v2df) __X, + (__v2df) __Y, __P, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmp_sd_mask (__mmask8 __M, __m128d __X, __m128d __Y, const int __P) +{ + return (__mmask8) __builtin_ia32_cmpsd_mask ((__v2df) __X, + (__v2df) __Y, __P, + (__mmask8) __M, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmp_ss_mask (__m128 __X, __m128 __Y, const int __P) +{ + return (__mmask8) __builtin_ia32_cmpss_mask ((__v4sf) __X, + (__v4sf) __Y, __P, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmp_ss_mask (__mmask8 __M, __m128 __X, __m128 __Y, const int __P) +{ + return (__mmask8) __builtin_ia32_cmpss_mask ((__v4sf) __X, + (__v4sf) __Y, __P, + (__mmask8) __M, + _MM_FROUND_CUR_DIRECTION); +} + +#else +#define _mm512_cmp_pd_mask(X, Y, P) \ + ((__mmask8) __builtin_ia32_cmppd512_mask ((__v8df)(__m512d)(X), \ + (__v8df)(__m512d)(Y), (int)(P),\ + (__mmask8)-1,_MM_FROUND_CUR_DIRECTION)) + +#define _mm512_cmp_ps_mask(X, Y, P) \ + ((__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf)(__m512)(X), \ + (__v16sf)(__m512)(Y), (int)(P),\ + (__mmask16)-1,_MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_cmp_pd_mask(M, X, Y, P) \ + ((__mmask8) __builtin_ia32_cmppd512_mask ((__v8df)(__m512d)(X), \ + (__v8df)(__m512d)(Y), (int)(P),\ + (__mmask8)M, _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_cmp_ps_mask(M, X, Y, P) \ + ((__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf)(__m512)(X), \ + (__v16sf)(__m512)(Y), (int)(P),\ + (__mmask16)M,_MM_FROUND_CUR_DIRECTION)) + +#define _mm_cmp_sd_mask(X, Y, P) \ + ((__mmask8) __builtin_ia32_cmpsd_mask ((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), (int)(P),\ + (__mmask8)-1,_MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_cmp_sd_mask(M, X, Y, P) \ + ((__mmask8) __builtin_ia32_cmpsd_mask ((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), (int)(P),\ + M,_MM_FROUND_CUR_DIRECTION)) + +#define _mm_cmp_ss_mask(X, Y, P) \ + ((__mmask8) __builtin_ia32_cmpss_mask ((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), (int)(P), \ + (__mmask8)-1,_MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_cmp_ss_mask(M, X, Y, P) \ + ((__mmask8) __builtin_ia32_cmpss_mask ((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), (int)(P), \ + M,_MM_FROUND_CUR_DIRECTION)) +#endif + +#ifdef __DISABLE_AVX512F__ +#undef __DISABLE_AVX512F__ +#pragma GCC pop_options +#endif /* __DISABLE_AVX512F__ */ + +#endif /* _AVX512FINTRIN_H_INCLUDED */ diff --git a/gcc/config/i386/avx512pfintrin.h b/gcc/config/i386/avx512pfintrin.h new file mode 100644 index 00000000000..b8c011032c6 --- /dev/null +++ b/gcc/config/i386/avx512pfintrin.h @@ -0,0 +1,129 @@ +/* Copyright (C) 2013-2014 Free Software Foundation, Inc. + + This file is part of GCC. 
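As a minimal usage sketch (not part of the patch itself), the _mm512_cmp_pd_mask intrinsic added above by avx512fintrin.h can be exercised as follows; this assumes the translation unit is compiled with -mavx512f and relies on the _CMP_LT_OS predicate macro that <immintrin.h> already provides via the AVX headers:

#include <immintrin.h>
#include <stdio.h>

int
main (void)
{
  /* Hypothetical example, not taken from the patch: compare eight
     doubles lane-wise and count how many lanes satisfy a < b.  The
     result of the comparison is an 8-bit lane mask (__mmask8).  */
  __m512d a = _mm512_set1_pd (1.0);
  __m512d b = _mm512_set1_pd (2.0);
  __mmask8 m = _mm512_cmp_pd_mask (a, b, _CMP_LT_OS);
  printf ("%d lanes compare true\n", __builtin_popcount (m));
  return 0;
}

The masked variants (_mm512_mask_cmp_pd_mask and friends) take an additional input mask and only evaluate the comparison for lanes whose mask bit is set, which is the pattern used throughout the new AVX-512F header.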
+ + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef _IMMINTRIN_H_INCLUDED +#error "Never use <avx512pfintrin.h> directly; include <immintrin.h> instead." +#endif + +#ifndef _AVX512PFINTRIN_H_INCLUDED +#define _AVX512PFINTRIN_H_INCLUDED + +#ifndef __AVX512PF__ +#pragma GCC push_options +#pragma GCC target("avx512pf") +#define __DISABLE_AVX512PF__ +#endif /* __AVX512PF__ */ + +/* Internal data types for implementing the intrinsics. */ +typedef long long __v8di __attribute__ ((__vector_size__ (64))); +typedef int __v16si __attribute__ ((__vector_size__ (64))); + +/* The Intel API is flexible enough that we must allow aliasing with other + vector types, and their scalar components. */ +typedef long long __m512i __attribute__ ((__vector_size__ (64), __may_alias__)); + +typedef unsigned char __mmask8; +typedef unsigned short __mmask16; + +#ifdef __OPTIMIZE__ +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_prefetch_i32gather_ps (__m512i index, __mmask16 mask, + int const *addr, int scale, int hint) +{ + __builtin_ia32_gatherpfdps (mask, (__v16si) index, addr, scale, hint); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_prefetch_i64gather_ps (__m512i index, __mmask8 mask, + int const *addr, int scale, int hint) +{ + __builtin_ia32_gatherpfqps (mask, (__v8di) index, addr, scale, hint); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_prefetch_i32scatter_ps (int const *addr, __m512i index, int scale, + int hint) +{ + __builtin_ia32_scatterpfdps ((__mmask16) 0xFFFF, (__v16si) index, addr, scale, + hint); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_prefetch_i32scatter_ps (int const *addr, __mmask16 mask, + __m512i index, int scale, int hint) +{ + __builtin_ia32_scatterpfdps (mask, (__v16si) index, addr, scale, hint); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_prefetch_i64scatter_ps (int const *addr, __m512i index, int scale, + int hint) +{ + __builtin_ia32_scatterpfqps ((__mmask8) 0xFF, (__v8di) index, addr, scale, + hint); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_prefetch_i64scatter_ps (int const *addr, __mmask16 mask, + __m512i index, int scale, int hint) +{ + __builtin_ia32_scatterpfqps (mask, (__v8di) index, addr, scale, hint); +} +#else +#define _mm512_mask_prefetch_i32gather_ps(INDEX, MASK, ADDR, SCALE, HINT) \ + __builtin_ia32_gatherpfdps ((__mmask16)MASK, (__v16si)(__m512i)INDEX, \ + (int 
const *)ADDR, (int)SCALE, (int)HINT) + +#define _mm512_mask_prefetch_i64gather_ps(INDEX, MASK, ADDR, SCALE, HINT) \ + __builtin_ia32_gatherpfqps ((__mmask8)MASK, (__v8di)(__m512i)INDEX, \ + (int const *)ADDR, (int)SCALE, (int)HINT) + +#define _mm512_prefetch_i32scatter_ps(ADDR, INDEX, SCALE, HINT) \ + __builtin_ia32_scatterpfdps ((__mmask16)0xFFFF, (__v16si)(__m512i)INDEX, \ + (int const *)ADDR, (int)SCALE, (int)HINT) + +#define _mm512_mask_prefetch_i32scatter_ps(ADDR, MASK, INDEX, SCALE, HINT) \ + __builtin_ia32_scatterpfdps ((__mmask16)MASK, (__v16si)(__m512i)INDEX, \ + (int const *)ADDR, (int)SCALE, (int)HINT) + +#define _mm512_prefetch_i64scatter_ps(ADDR, INDEX, SCALE, HINT) \ + __builtin_ia32_scatterpfqps ((__mmask8)0xFF, (__v8di)(__m512i)INDEX, \ + (int const *)ADDR, (int)SCALE, (int)HINT) + +#define _mm512_mask_prefetch_i64scatter_ps(ADDR, MASK, INDEX, SCALE, HINT) \ + __builtin_ia32_scatterpfqps ((__mmask8)MASK, (__v8di)(__m512i)INDEX, \ + (int const *)ADDR, (int)SCALE, (int)HINT) +#endif + +#ifdef __DISABLE_AVX512PF__ +#undef __DISABLE_AVX512PF__ +#pragma GCC pop_options +#endif /* __DISABLE_AVX512PF__ */ + +#endif /* _AVX512PFINTRIN_H_INCLUDED */ diff --git a/gcc/config/i386/avxintrin.h b/gcc/config/i386/avxintrin.h index 7f2109a7299..f960b76d073 100644 --- a/gcc/config/i386/avxintrin.h +++ b/gcc/config/i386/avxintrin.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2008-2013 Free Software Foundation, Inc. +/* Copyright (C) 2008-2014 Free Software Foundation, Inc. This file is part of GCC. diff --git a/gcc/config/i386/avxmath.h b/gcc/config/i386/avxmath.h index cc9f966844b..e444993a0eb 100644 --- a/gcc/config/i386/avxmath.h +++ b/gcc/config/i386/avxmath.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2010-2013 Free Software Foundation, Inc. +/* Copyright (C) 2010-2014 Free Software Foundation, Inc. This file is part of GCC. diff --git a/gcc/config/i386/bdver1.md b/gcc/config/i386/bdver1.md index c32787acecd..578bb3b212a 100644 --- a/gcc/config/i386/bdver1.md +++ b/gcc/config/i386/bdver1.md @@ -1,4 +1,4 @@ -;; Copyright (C) 2010-2013 Free Software Foundation, Inc. +;; Copyright (C) 2010-2014 Free Software Foundation, Inc. ;; ;; This file is part of GCC. ;; diff --git a/gcc/config/i386/bdver3.md b/gcc/config/i386/bdver3.md index 019e9291b1b..4c85ade3176 100644 --- a/gcc/config/i386/bdver3.md +++ b/gcc/config/i386/bdver3.md @@ -1,4 +1,4 @@ -;; Copyright (C) 2012-2013 Free Software Foundation, Inc. +;; Copyright (C) 2012-2014 Free Software Foundation, Inc. ;; ;; This file is part of GCC. ;; diff --git a/gcc/config/i386/biarch64.h b/gcc/config/i386/biarch64.h index 5b5e86b03b7..c2c81640117 100644 --- a/gcc/config/i386/biarch64.h +++ b/gcc/config/i386/biarch64.h @@ -1,7 +1,7 @@ /* Make configure files to produce biarch compiler defaulting to 64bit mode. This file must be included very first, while the OS specific file later to overwrite otherwise wrong defaults. - Copyright (C) 2001-2013 Free Software Foundation, Inc. + Copyright (C) 2001-2014 Free Software Foundation, Inc. Contributed by Bo Thorsen <bo@suse.de>. This file is part of GCC. diff --git a/gcc/config/i386/biarchx32.h b/gcc/config/i386/biarchx32.h index 1c919753b0d..941b93b3d4c 100644 --- a/gcc/config/i386/biarchx32.h +++ b/gcc/config/i386/biarchx32.h @@ -1,7 +1,7 @@ /* Make configure files to produce biarch compiler defaulting to x32 mode. This file must be included very first, while the OS specific file later to overwrite otherwise wrong defaults. - Copyright (C) 2012-2013 Free Software Foundation, Inc. 
+ Copyright (C) 2012-2014 Free Software Foundation, Inc. This file is part of GCC. diff --git a/gcc/config/i386/bmi2intrin.h b/gcc/config/i386/bmi2intrin.h index 0c6cb9616c8..ff962962e95 100644 --- a/gcc/config/i386/bmi2intrin.h +++ b/gcc/config/i386/bmi2intrin.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2013 Free Software Foundation, Inc. +/* Copyright (C) 2011-2014 Free Software Foundation, Inc. This file is part of GCC. diff --git a/gcc/config/i386/bmiintrin.h b/gcc/config/i386/bmiintrin.h index 281ebaaf4f2..b86adf179cf 100644 --- a/gcc/config/i386/bmiintrin.h +++ b/gcc/config/i386/bmiintrin.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2010-2013 Free Software Foundation, Inc. +/* Copyright (C) 2010-2014 Free Software Foundation, Inc. This file is part of GCC. diff --git a/gcc/config/i386/bmmintrin.h b/gcc/config/i386/bmmintrin.h index 9d68cecf326..24cf26e8522 100644 --- a/gcc/config/i386/bmmintrin.h +++ b/gcc/config/i386/bmmintrin.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2013 Free Software Foundation, Inc. +/* Copyright (C) 2007-2014 Free Software Foundation, Inc. This file is part of GCC. diff --git a/gcc/config/i386/bsd.h b/gcc/config/i386/bsd.h index 0d2a2537b46..54715a85330 100644 --- a/gcc/config/i386/bsd.h +++ b/gcc/config/i386/bsd.h @@ -1,7 +1,7 @@ /* Definitions for BSD assembler syntax for Intel 386 (actually AT&T syntax for insns and operands, adapted to BSD conventions for symbol names and debugging.) - Copyright (C) 1988-2013 Free Software Foundation, Inc. + Copyright (C) 1988-2014 Free Software Foundation, Inc. This file is part of GCC. diff --git a/gcc/config/i386/btver2.md b/gcc/config/i386/btver2.md index e872d602f58..06a97cb9531 100644 --- a/gcc/config/i386/btver2.md +++ b/gcc/config/i386/btver2.md @@ -1,4 +1,4 @@ -;; Copyright (C) 2012, Free Software Foundation, Inc. +;; Copyright (C) 2012-2014 Free Software Foundation, Inc. ;; ;; This file is part of GCC. ;; diff --git a/gcc/config/i386/constraints.md b/gcc/config/i386/constraints.md index 0bc53aabcb4..0d61c87a4e9 100644 --- a/gcc/config/i386/constraints.md +++ b/gcc/config/i386/constraints.md @@ -1,5 +1,5 @@ ;; Constraint definitions for IA-32 and x86-64. -;; Copyright (C) 2006-2013 Free Software Foundation, Inc. +;; Copyright (C) 2006-2014 Free Software Foundation, Inc. ;; ;; This file is part of GCC. ;; diff --git a/gcc/config/i386/core2.md b/gcc/config/i386/core2.md index 2e42675bbcb..53df9eede83 100644 --- a/gcc/config/i386/core2.md +++ b/gcc/config/i386/core2.md @@ -1,5 +1,5 @@ ;; Scheduling for Core 2 and derived processors. -;; Copyright (C) 2004-2013 Free Software Foundation, Inc. +;; Copyright (C) 2004-2014 Free Software Foundation, Inc. ;; ;; This file is part of GCC. ;; @@ -102,12 +102,12 @@ ;; on decoder 0, and say that it takes a little while before the result ;; is available. (define_insn_reservation "c2_complex_insn" 6 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (eq_attr "type" "other,multi,str")) "c2_decoder0") (define_insn_reservation "c2_call" 1 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (eq_attr "type" "call,callv")) "c2_decoder0") @@ -115,50 +115,50 @@ ;; imovx always decodes to one uop, and also doesn't use the integer ;; units if it has memory operands. 
(define_insn_reservation "c2_imov" 1 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "none") (eq_attr "type" "imov,imovx"))) "c2_decodern,(c2_p0|c2_p1|c2_p5)") (define_insn_reservation "c2_imov_load" 4 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "load") (eq_attr "type" "imov,imovx"))) "c2_decodern,c2_p2") (define_insn_reservation "c2_imov_store" 1 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "store") (eq_attr "type" "imov"))) "c2_decodern,c2_p4+c2_p3") (define_insn_reservation "c2_icmov" 2 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "none") (eq_attr "type" "icmov"))) "c2_decoder0,(c2_p0|c2_p1|c2_p5)*2") (define_insn_reservation "c2_icmov_load" 2 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "load") (eq_attr "type" "icmov"))) "c2_decoder0,c2_p2,(c2_p0|c2_p1|c2_p5)*2") (define_insn_reservation "c2_push_reg" 1 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "store") (eq_attr "type" "push"))) "c2_decodern,c2_p4+c2_p3") (define_insn_reservation "c2_push_mem" 1 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "both") (eq_attr "type" "push"))) "c2_decoder0,c2_p2,c2_p4+c2_p3") ;; lea executes on port 0 with latency one and throughput 1. (define_insn_reservation "c2_lea" 1 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "none") (eq_attr "type" "lea"))) "c2_decodern,c2_p0") @@ -167,61 +167,61 @@ ;; The load and store units need to be reserved when memory operands ;; are involved. (define_insn_reservation "c2_shift_rotate" 1 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "none") (eq_attr "type" "ishift,ishift1,rotate,rotate1"))) "c2_decodern,(c2_p0|c2_p5)") (define_insn_reservation "c2_shift_rotate_mem" 4 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "!none") (eq_attr "type" "ishift,ishift1,rotate,rotate1"))) "c2_decoder0,c2_p2,(c2_p0|c2_p5),c2_p4+c2_p3") ;; See comments in ppro.md for the corresponding reservation. (define_insn_reservation "c2_branch" 1 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "none") (eq_attr "type" "ibr"))) "c2_decodern,c2_p5") ;; ??? Indirect branches probably have worse latency than this. (define_insn_reservation "c2_indirect_branch" 6 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "!none") (eq_attr "type" "ibr"))) "c2_decoder0,c2_p2+c2_p5") (define_insn_reservation "c2_leave" 4 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (eq_attr "type" "leave")) "c2_decoder0,c2_p2+(c2_p0|c2_p1),(c2_p0|c2_p1)") ;; mul and imul with two/three operands only execute on port 1 for HImode ;; and SImode, port 0 for DImode. 
(define_insn_reservation "c2_imul_hisi" 3 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "none") (and (eq_attr "mode" "HI,SI") (eq_attr "type" "imul")))) "c2_decodern,c2_p1") (define_insn_reservation "c2_imul_hisi_mem" 3 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "!none") (and (eq_attr "mode" "HI,SI") (eq_attr "type" "imul")))) "c2_decoder0,c2_p2+c2_p1") (define_insn_reservation "c2_imul_di" 5 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "none") (and (eq_attr "mode" "DI") (eq_attr "type" "imul")))) "c2_decodern,c2_p0") (define_insn_reservation "c2_imul_di_mem" 5 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "!none") (and (eq_attr "mode" "DI") (eq_attr "type" "imul")))) @@ -231,42 +231,42 @@ ;; QI, HI, and SI have issue latency 12, 21, and 37, respectively. ;; These issue latencies are modelled via the c2_div automaton. (define_insn_reservation "c2_idiv_QI" 19 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "none") (and (eq_attr "mode" "QI") (eq_attr "type" "idiv")))) "c2_decoder0,(c2_p0+c2_idiv)*2,(c2_p0|c2_p1)+c2_idiv,c2_idiv*9") (define_insn_reservation "c2_idiv_QI_load" 19 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "load") (and (eq_attr "mode" "QI") (eq_attr "type" "idiv")))) "c2_decoder0,c2_p2+c2_p0+c2_idiv,c2_p0+c2_idiv,(c2_p0|c2_p1)+c2_idiv,c2_idiv*9") (define_insn_reservation "c2_idiv_HI" 23 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "none") (and (eq_attr "mode" "HI") (eq_attr "type" "idiv")))) "c2_decoder0,(c2_p0+c2_idiv)*3,(c2_p0|c2_p1)+c2_idiv,c2_idiv*17") (define_insn_reservation "c2_idiv_HI_load" 23 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "load") (and (eq_attr "mode" "HI") (eq_attr "type" "idiv")))) "c2_decoder0,c2_p2+c2_p0+c2_idiv,c2_p0+c2_idiv,(c2_p0|c2_p1)+c2_idiv,c2_idiv*18") (define_insn_reservation "c2_idiv_SI" 39 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "none") (and (eq_attr "mode" "SI") (eq_attr "type" "idiv")))) "c2_decoder0,(c2_p0+c2_idiv)*3,(c2_p0|c2_p1)+c2_idiv,c2_idiv*33") (define_insn_reservation "c2_idiv_SI_load" 39 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "load") (and (eq_attr "mode" "SI") (eq_attr "type" "idiv")))) @@ -275,90 +275,90 @@ ;; x87 floating point operations. 
(define_insn_reservation "c2_fxch" 0 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (eq_attr "type" "fxch")) "c2_decodern") (define_insn_reservation "c2_fop" 3 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "none,unknown") (eq_attr "type" "fop"))) "c2_decodern,c2_p1") (define_insn_reservation "c2_fop_load" 5 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "load") (eq_attr "type" "fop"))) "c2_decoder0,c2_p2+c2_p1,c2_p1") (define_insn_reservation "c2_fop_store" 3 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "store") (eq_attr "type" "fop"))) "c2_decoder0,c2_p0,c2_p0,c2_p0+c2_p4+c2_p3") (define_insn_reservation "c2_fop_both" 5 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "both") (eq_attr "type" "fop"))) "c2_decoder0,c2_p2+c2_p0,c2_p0+c2_p4+c2_p3") (define_insn_reservation "c2_fsgn" 1 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (eq_attr "type" "fsgn")) "c2_decodern,c2_p0") (define_insn_reservation "c2_fistp" 5 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (eq_attr "type" "fistp")) "c2_decoder0,c2_p0*2,c2_p4+c2_p3") (define_insn_reservation "c2_fcmov" 2 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (eq_attr "type" "fcmov")) "c2_decoder0,c2_p0*2") (define_insn_reservation "c2_fcmp" 1 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "none") (eq_attr "type" "fcmp"))) "c2_decodern,c2_p1") (define_insn_reservation "c2_fcmp_load" 4 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "load") (eq_attr "type" "fcmp"))) "c2_decoder0,c2_p2+c2_p1") (define_insn_reservation "c2_fmov" 1 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "none") (eq_attr "type" "fmov"))) "c2_decodern,c2_p0") (define_insn_reservation "c2_fmov_load" 1 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "load") (and (eq_attr "mode" "!XF") (eq_attr "type" "fmov")))) "c2_decodern,c2_p2") (define_insn_reservation "c2_fmov_XF_load" 3 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "load") (and (eq_attr "mode" "XF") (eq_attr "type" "fmov")))) "c2_decoder0,(c2_p2+c2_p0)*2") (define_insn_reservation "c2_fmov_store" 1 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "store") (and (eq_attr "mode" "!XF") (eq_attr "type" "fmov")))) "c2_decodern,c2_p3+c2_p4") (define_insn_reservation "c2_fmov_XF_store" 3 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "store") (and (eq_attr "mode" "XF") (eq_attr "type" "fmov")))) @@ -367,13 +367,13 @@ ;; fmul executes on port 0 with latency 5. It has issue latency 2, ;; but we don't model this. (define_insn_reservation "c2_fmul" 5 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "none") (eq_attr "type" "fmul"))) "c2_decoder0,c2_p0*2") (define_insn_reservation "c2_fmul_load" 6 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "load") (eq_attr "type" "fmul"))) "c2_decoder0,c2_p2+c2_p0,c2_p0") @@ -384,42 +384,42 @@ ;; that. 
Throughput is equal to latency - 1, which we model using the ;; c2_div automaton. (define_insn_reservation "c2_fdiv_SF" 18 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "none") (and (eq_attr "mode" "SF") (eq_attr "type" "fdiv,fpspc")))) "c2_decodern,c2_p0+c2_fdiv,c2_fdiv*16") (define_insn_reservation "c2_fdiv_SF_load" 19 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "load") (and (eq_attr "mode" "SF") (eq_attr "type" "fdiv,fpspc")))) "c2_decoder0,c2_p2+c2_p0+c2_fdiv,c2_fdiv*16") (define_insn_reservation "c2_fdiv_DF" 32 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "none") (and (eq_attr "mode" "DF") (eq_attr "type" "fdiv,fpspc")))) "c2_decodern,c2_p0+c2_fdiv,c2_fdiv*30") (define_insn_reservation "c2_fdiv_DF_load" 33 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "load") (and (eq_attr "mode" "DF") (eq_attr "type" "fdiv,fpspc")))) "c2_decoder0,c2_p2+c2_p0+c2_fdiv,c2_fdiv*30") (define_insn_reservation "c2_fdiv_XF" 38 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "none") (and (eq_attr "mode" "XF") (eq_attr "type" "fdiv,fpspc")))) "c2_decodern,c2_p0+c2_fdiv,c2_fdiv*36") (define_insn_reservation "c2_fdiv_XF_load" 39 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "load") (and (eq_attr "mode" "XF") (eq_attr "type" "fdiv,fpspc")))) @@ -428,71 +428,71 @@ ;; MMX instructions. (define_insn_reservation "c2_mmx_add" 1 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "none") (eq_attr "type" "mmxadd,sseiadd"))) "c2_decodern,c2_p0|c2_p5") (define_insn_reservation "c2_mmx_add_load" 2 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "load") (eq_attr "type" "mmxadd,sseiadd"))) "c2_decodern,c2_p2+c2_p0|c2_p5") (define_insn_reservation "c2_mmx_shft" 1 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "none") (eq_attr "type" "mmxshft"))) "c2_decodern,c2_p0|c2_p5") (define_insn_reservation "c2_mmx_shft_load" 2 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "load") (eq_attr "type" "mmxshft"))) "c2_decoder0,c2_p2+c2_p1") (define_insn_reservation "c2_mmx_sse_shft" 1 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "none") (and (eq_attr "type" "sseishft") (eq_attr "length_immediate" "!0")))) "c2_decodern,c2_p1") (define_insn_reservation "c2_mmx_sse_shft_load" 2 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "load") (and (eq_attr "type" "sseishft") (eq_attr "length_immediate" "!0")))) "c2_decodern,c2_p1") (define_insn_reservation "c2_mmx_sse_shft1" 2 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "none") (and (eq_attr "type" "sseishft") (eq_attr "length_immediate" "0")))) "c2_decodern,c2_p1") (define_insn_reservation "c2_mmx_sse_shft1_load" 3 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "load") (and (eq_attr "type" "sseishft") (eq_attr "length_immediate" "0")))) "c2_decodern,c2_p1") (define_insn_reservation "c2_mmx_mul" 3 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" 
"core2,nehalem") (and (eq_attr "memory" "none") (eq_attr "type" "mmxmul,sseimul"))) "c2_decodern,c2_p1") (define_insn_reservation "c2_mmx_mul_load" 3 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "none") (eq_attr "type" "mmxmul,sseimul"))) "c2_decoder0,c2_p2+c2_p1") (define_insn_reservation "c2_sse_mmxcvt" 4 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "mode" "DI") (eq_attr "type" "mmxcvt"))) "c2_decodern,c2_p1") @@ -500,94 +500,94 @@ ;; FIXME: These are Pentium III only, but we cannot tell here if ;; we're generating code for PentiumPro/Pentium II or Pentium III ;; (define_insn_reservation "c2_sse_mmxshft" 2 -;; (and (eq_attr "cpu" "core2,corei7") +;; (and (eq_attr "cpu" "core2,nehalem") ;; (and (eq_attr "mode" "TI") ;; (eq_attr "type" "mmxshft"))) ;; "c2_decodern,c2_p0") ;; The sfence instruction. (define_insn_reservation "c2_sse_sfence" 3 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "unknown") (eq_attr "type" "sse"))) "c2_decoder0,c2_p4+c2_p3") ;; FIXME: This reservation is all wrong when we're scheduling sqrtss. (define_insn_reservation "c2_sse_SFDF" 3 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "mode" "SF,DF") (eq_attr "type" "sse"))) "c2_decodern,c2_p0") (define_insn_reservation "c2_sse_V4SF" 4 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "mode" "V4SF") (eq_attr "type" "sse"))) "c2_decoder0,c2_p1*2") (define_insn_reservation "c2_sse_addcmp" 3 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "none") (eq_attr "type" "sseadd,sseadd1,ssecmp,ssecomi"))) "c2_decodern,c2_p1") (define_insn_reservation "c2_sse_addcmp_load" 3 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "load") (eq_attr "type" "sseadd,sseadd1,ssecmp,ssecomi"))) "c2_decodern,c2_p2+c2_p1") (define_insn_reservation "c2_sse_mul_SF" 4 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "none") (and (eq_attr "mode" "SF,V4SF") (eq_attr "type" "ssemul")))) "c2_decodern,c2_p0") (define_insn_reservation "c2_sse_mul_SF_load" 4 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "load") (and (eq_attr "mode" "SF,V4SF") (eq_attr "type" "ssemul")))) "c2_decodern,c2_p2+c2_p0") (define_insn_reservation "c2_sse_mul_DF" 5 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "none") (and (eq_attr "mode" "DF,V2DF") (eq_attr "type" "ssemul")))) "c2_decodern,c2_p0") (define_insn_reservation "c2_sse_mul_DF_load" 5 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "load") (and (eq_attr "mode" "DF,V2DF") (eq_attr "type" "ssemul")))) "c2_decodern,c2_p2+c2_p0") (define_insn_reservation "c2_sse_div_SF" 18 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "none") (and (eq_attr "mode" "SF,V4SF") (eq_attr "type" "ssediv")))) "c2_decodern,c2_p0,c2_ssediv*17") (define_insn_reservation "c2_sse_div_SF_load" 18 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "none") (and (eq_attr "mode" "SF,V4SF") (eq_attr "type" "ssediv")))) "c2_decodern,(c2_p2+c2_p0),c2_ssediv*17") (define_insn_reservation "c2_sse_div_DF" 32 - (and (eq_attr 
"cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "none") (and (eq_attr "mode" "DF,V2DF") (eq_attr "type" "ssediv")))) "c2_decodern,c2_p0,c2_ssediv*31") (define_insn_reservation "c2_sse_div_DF_load" 32 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "none") (and (eq_attr "mode" "DF,V2DF") (eq_attr "type" "ssediv")))) @@ -595,61 +595,61 @@ ;; FIXME: these have limited throughput (define_insn_reservation "c2_sse_icvt_SF" 4 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "none") (and (eq_attr "mode" "SF") (eq_attr "type" "sseicvt")))) "c2_decodern,c2_p1") (define_insn_reservation "c2_sse_icvt_SF_load" 4 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "!none") (and (eq_attr "mode" "SF") (eq_attr "type" "sseicvt")))) "c2_decodern,c2_p2+c2_p1") (define_insn_reservation "c2_sse_icvt_DF" 4 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "none") (and (eq_attr "mode" "DF") (eq_attr "type" "sseicvt")))) "c2_decoder0,c2_p0+c2_p1") (define_insn_reservation "c2_sse_icvt_DF_load" 4 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "!none") (and (eq_attr "mode" "DF") (eq_attr "type" "sseicvt")))) "c2_decoder0,(c2_p2+c2_p1)") (define_insn_reservation "c2_sse_icvt_SI" 3 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "none") (and (eq_attr "mode" "SI") (eq_attr "type" "sseicvt")))) "c2_decodern,c2_p1") (define_insn_reservation "c2_sse_icvt_SI_load" 3 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "!none") (and (eq_attr "mode" "SI") (eq_attr "type" "sseicvt")))) "c2_decodern,(c2_p2+c2_p1)") (define_insn_reservation "c2_sse_mov" 1 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "none") (eq_attr "type" "ssemov"))) "c2_decodern,(c2_p0|c2_p1|c2_p5)") (define_insn_reservation "c2_sse_mov_load" 2 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "load") (eq_attr "type" "ssemov"))) "c2_decodern,c2_p2") (define_insn_reservation "c2_sse_mov_store" 1 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "store") (eq_attr "type" "ssemov"))) "c2_decodern,c2_p4+c2_p3") @@ -663,13 +663,13 @@ ;; the three decoders. Loads benefit from micro-op fusion and can be ;; treated in the same way. (define_insn_reservation "c2_insn" 1 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "none,unknown") (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,sseishft1,mmx,mmxcmp"))) "c2_decodern,(c2_p0|c2_p1|c2_p5)") (define_insn_reservation "c2_insn_load" 4 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "load") (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,pop,sseishft1,mmx,mmxcmp"))) "c2_decodern,c2_p2,(c2_p0|c2_p1|c2_p5)") @@ -677,7 +677,7 @@ ;; register-memory instructions have three uops, so they have to be ;; decoded on c2_decoder0. 
(define_insn_reservation "c2_insn_store" 1 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "store") (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,sseishft1,mmx,mmxcmp"))) "c2_decoder0,(c2_p0|c2_p1|c2_p5),c2_p4+c2_p3") @@ -685,7 +685,7 @@ ;; read-modify-store instructions produce 4 uops so they have to be ;; decoded on c2_decoder0 as well. (define_insn_reservation "c2_insn_both" 4 - (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "cpu" "core2,nehalem") (and (eq_attr "memory" "both") (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,pop,sseishft1,mmx,mmxcmp"))) "c2_decoder0,c2_p2,(c2_p0|c2_p1|c2_p5),c2_p4+c2_p3") diff --git a/gcc/config/i386/cpuid.h b/gcc/config/i386/cpuid.h index aa91e1ab8d8..c7a53dd7c8b 100644 --- a/gcc/config/i386/cpuid.h +++ b/gcc/config/i386/cpuid.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2007-2013 Free Software Foundation, Inc. + * Copyright (C) 2007-2014 Free Software Foundation, Inc. * * This file is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the @@ -77,6 +77,7 @@ #define bit_AVX512PF (1 << 26) #define bit_AVX512ER (1 << 27) #define bit_AVX512CD (1 << 28) +#define bit_SHA (1 << 29) /* Extended State Enumeration Sub-leaf (%eax == 13, %ecx == 1) */ #define bit_XSAVEOPT (1 << 0) diff --git a/gcc/config/i386/cross-stdarg.h b/gcc/config/i386/cross-stdarg.h index f934cf0933b..d16cef82086 100644 --- a/gcc/config/i386/cross-stdarg.h +++ b/gcc/config/i386/cross-stdarg.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2002-2013 Free Software Foundation, Inc. +/* Copyright (C) 2002-2014 Free Software Foundation, Inc. This file is part of GCC. diff --git a/gcc/config/i386/crtdll.h b/gcc/config/i386/crtdll.h index 99518ab56fa..b18168ee74b 100644 --- a/gcc/config/i386/crtdll.h +++ b/gcc/config/i386/crtdll.h @@ -1,7 +1,7 @@ /* Operating system specific defines to be used when targeting GCC for hosting on Windows32, using GNU tools and the Windows32 API Library. This variant uses CRTDLL.DLL instead of MSVCRTDLL.DLL. - Copyright (C) 1998-2013 Free Software Foundation, Inc. + Copyright (C) 1998-2014 Free Software Foundation, Inc. This file is part of GCC. diff --git a/gcc/config/i386/cygming.h b/gcc/config/i386/cygming.h index 9cb66d646be..039edccb979 100644 --- a/gcc/config/i386/cygming.h +++ b/gcc/config/i386/cygming.h @@ -1,6 +1,6 @@ /* Operating system specific defines to be used when targeting GCC for hosting on Windows32, using a Unix style C library and tools. - Copyright (C) 1995-2013 Free Software Foundation, Inc. + Copyright (C) 1995-2014 Free Software Foundation, Inc. This file is part of GCC. diff --git a/gcc/config/i386/cygming.opt b/gcc/config/i386/cygming.opt index 01fa0b6e46d..3437123f40e 100644 --- a/gcc/config/i386/cygming.opt +++ b/gcc/config/i386/cygming.opt @@ -1,6 +1,6 @@ ; Cygwin- and MinGW-specific options. -; Copyright (C) 2005-2013 Free Software Foundation, Inc. +; Copyright (C) 2005-2014 Free Software Foundation, Inc. ; ; This file is part of GCC. ; diff --git a/gcc/config/i386/cygwin-stdint.h b/gcc/config/i386/cygwin-stdint.h index 2dc207b5e45..3c82cc6c35a 100644 --- a/gcc/config/i386/cygwin-stdint.h +++ b/gcc/config/i386/cygwin-stdint.h @@ -1,5 +1,5 @@ /* Definitions for <stdint.h> types on systems using Cygwin. - Copyright (C) 2009-2013 Free Software Foundation, Inc. + Copyright (C) 2009-2014 Free Software Foundation, Inc. This file is part of GCC. 
diff --git a/gcc/config/i386/cygwin-w64.h b/gcc/config/i386/cygwin-w64.h index e39ace98ea0..06a6cd98c1b 100644 --- a/gcc/config/i386/cygwin-w64.h +++ b/gcc/config/i386/cygwin-w64.h @@ -1,7 +1,7 @@ /* Operating system specific defines to be used when targeting GCC for hosting on Windows 32/64 via Cygwin runtime, using GNU tools and the Windows API Library. - Copyright (C) 2013 Free Software Foundation, Inc. + Copyright (C) 2013-2014 Free Software Foundation, Inc. This file is part of GCC. diff --git a/gcc/config/i386/cygwin.h b/gcc/config/i386/cygwin.h index 3fe0a011b8f..f7b9a284c12 100644 --- a/gcc/config/i386/cygwin.h +++ b/gcc/config/i386/cygwin.h @@ -1,6 +1,6 @@ /* Operating system specific defines to be used when targeting GCC for hosting on Windows32, using a Unix style C library and tools. - Copyright (C) 1995-2013 Free Software Foundation, Inc. + Copyright (C) 1995-2014 Free Software Foundation, Inc. This file is part of GCC. diff --git a/gcc/config/i386/darwin.h b/gcc/config/i386/darwin.h index 594ec999484..5c9b073b680 100644 --- a/gcc/config/i386/darwin.h +++ b/gcc/config/i386/darwin.h @@ -1,5 +1,5 @@ /* Target definitions for x86 running Darwin. - Copyright (C) 2001-2013 Free Software Foundation, Inc. + Copyright (C) 2001-2014 Free Software Foundation, Inc. Contributed by Apple Computer Inc. This file is part of GCC. diff --git a/gcc/config/i386/darwin64.h b/gcc/config/i386/darwin64.h index 2a61f5b2475..143971136f8 100644 --- a/gcc/config/i386/darwin64.h +++ b/gcc/config/i386/darwin64.h @@ -1,5 +1,5 @@ /* Target definitions for x86_64 running Darwin. - Copyright (C) 2006-2013 Free Software Foundation, Inc. + Copyright (C) 2006-2014 Free Software Foundation, Inc. Contributed by Apple Computer Inc. This file is part of GCC. diff --git a/gcc/config/i386/djgpp-stdint.h b/gcc/config/i386/djgpp-stdint.h index 3bc465f0c16..5af1771b248 100644 --- a/gcc/config/i386/djgpp-stdint.h +++ b/gcc/config/i386/djgpp-stdint.h @@ -1,5 +1,5 @@ /* Definitions for <stdint.h> types on systems using DJGPP. - Copyright (C) 2009-2013 Free Software Foundation, Inc. + Copyright (C) 2009-2014 Free Software Foundation, Inc. This file is part of GCC. diff --git a/gcc/config/i386/djgpp.h b/gcc/config/i386/djgpp.h index cc420d0a6d6..6ddd833bab7 100644 --- a/gcc/config/i386/djgpp.h +++ b/gcc/config/i386/djgpp.h @@ -1,5 +1,5 @@ /* Configuration for an i386 running MS-DOS with DJGPP. - Copyright (C) 1997-2013 Free Software Foundation, Inc. + Copyright (C) 1997-2014 Free Software Foundation, Inc. This file is part of GCC. diff --git a/gcc/config/i386/djgpp.opt b/gcc/config/i386/djgpp.opt index 8213fc3be78..21fc54afb75 100644 --- a/gcc/config/i386/djgpp.opt +++ b/gcc/config/i386/djgpp.opt @@ -1,6 +1,6 @@ ; DJGPP-specific options. -; Copyright (C) 2005-2013 Free Software Foundation, Inc. +; Copyright (C) 2005-2014 Free Software Foundation, Inc. ; ; This file is part of GCC. ; diff --git a/gcc/config/i386/driver-i386.c b/gcc/config/i386/driver-i386.c index 0b8af3f4ffd..b6eb7e71f66 100644 --- a/gcc/config/i386/driver-i386.c +++ b/gcc/config/i386/driver-i386.c @@ -1,5 +1,5 @@ /* Subroutines for the gcc driver. - Copyright (C) 2006-2013 Free Software Foundation, Inc. + Copyright (C) 2006-2014 Free Software Foundation, Inc. This file is part of GCC. 
@@ -126,6 +126,18 @@ decode_caches_intel (unsigned reg, bool xeon_mp, case 0x0c: level1->sizekb = 16; level1->assoc = 4; level1->line = 32; break; + case 0x0d: + level1->sizekb = 16; level1->assoc = 4; level1->line = 64; + break; + case 0x0e: + level1->sizekb = 24; level1->assoc = 6; level1->line = 64; + break; + case 0x21: + level2->sizekb = 256; level2->assoc = 8; level2->line = 64; + break; + case 0x24: + level2->sizekb = 1024; level2->assoc = 16; level2->line = 64; + break; case 0x2c: level1->sizekb = 32; level1->assoc = 8; level1->line = 64; break; @@ -162,6 +174,9 @@ decode_caches_intel (unsigned reg, bool xeon_mp, case 0x45: level2->sizekb = 2048; level2->assoc = 4; level2->line = 32; break; + case 0x48: + level2->sizekb = 3072; level2->assoc = 12; level2->line = 64; + break; case 0x49: if (xeon_mp) break; @@ -203,6 +218,9 @@ decode_caches_intel (unsigned reg, bool xeon_mp, case 0x7f: level2->sizekb = 512; level2->assoc = 2; level2->line = 64; break; + case 0x80: + level2->sizekb = 512; level2->assoc = 8; level2->line = 64; + break; case 0x82: level2->sizekb = 256; level2->assoc = 8; level2->line = 32; break; @@ -391,7 +409,7 @@ const char *host_detect_local_cpu (int argc, const char **argv) unsigned int has_rdseed = 0, has_prfchw = 0, has_adx = 0; unsigned int has_osxsave = 0, has_fxsr = 0, has_xsave = 0, has_xsaveopt = 0; unsigned int has_avx512er = 0, has_avx512pf = 0, has_avx512cd = 0; - unsigned int has_avx512f = 0; + unsigned int has_avx512f = 0, has_sha = 0; bool arch; @@ -467,6 +485,7 @@ const char *host_detect_local_cpu (int argc, const char **argv) has_avx512er = ebx & bit_AVX512ER; has_avx512pf = ebx & bit_AVX512PF; has_avx512cd = ebx & bit_AVX512CD; + has_sha = ebx & bit_SHA; } if (max_level >= 13) @@ -643,13 +662,13 @@ const char *host_detect_local_cpu (int argc, const char **argv) { case 0x1c: case 0x26: - /* Atom. */ - cpu = "atom"; + /* Bonnell. */ + cpu = "bonnell"; break; case 0x37: case 0x4d: /* Silvermont. */ - cpu = "slm"; + cpu = "silvermont"; break; case 0x0f: /* Merom. */ @@ -663,52 +682,56 @@ const char *host_detect_local_cpu (int argc, const char **argv) case 0x1f: case 0x2e: /* Nehalem. */ + cpu = "nehalem"; + break; case 0x25: case 0x2c: case 0x2f: /* Westmere. */ - cpu = "corei7"; + cpu = "westmere"; break; case 0x2a: case 0x2d: /* Sandy Bridge. */ - cpu = "corei7-avx"; + cpu = "sandybridge"; break; case 0x3a: case 0x3e: /* Ivy Bridge. */ - cpu = "core-avx-i"; + cpu = "ivybridge"; break; case 0x3c: case 0x45: case 0x46: /* Haswell. */ - cpu = "core-avx2"; + cpu = "haswell"; break; default: if (arch) { /* This is unknown family 0x6 CPU. */ - if (has_avx2) + if (has_adx) + cpu = "broadwell"; + else if (has_avx2) /* Assume Haswell. */ - cpu = "core-avx2"; + cpu = "haswell"; else if (has_avx) /* Assume Sandy Bridge. */ - cpu = "corei7-avx"; + cpu = "sandybridge"; else if (has_sse4_2) { if (has_movbe) - /* Assume SLM. */ - cpu = "slm"; + /* Assume Silvermont. */ + cpu = "silvermont"; else - /* Assume Core i7. */ - cpu = "corei7"; + /* Assume Nehalem. */ + cpu = "nehalem"; } else if (has_ssse3) { if (has_movbe) - /* Assume Atom. */ - cpu = "atom"; + /* Assume Bonnell. */ + cpu = "bonnell"; else /* Assume Core 2. */ cpu = "core2"; @@ -828,6 +851,7 @@ const char *host_detect_local_cpu (int argc, const char **argv) const char *sahf = has_lahf_lm ? " -msahf" : " -mno-sahf"; const char *movbe = has_movbe ? " -mmovbe" : " -mno-movbe"; const char *aes = has_aes ? " -maes" : " -mno-aes"; + const char *sha = has_sha ? 
" -msha" : " -mno-sha"; const char *pclmul = has_pclmul ? " -mpclmul" : " -mno-pclmul"; const char *popcnt = has_popcnt ? " -mpopcnt" : " -mno-popcnt"; const char *abm = has_abm ? " -mabm" : " -mno-abm"; @@ -860,7 +884,7 @@ const char *host_detect_local_cpu (int argc, const char **argv) const char *avx512pf = has_avx512pf ? " -mavx512pf" : " -mno-avx512pf"; options = concat (options, mmx, mmx3dnow, sse, sse2, sse3, ssse3, - sse4a, cx16, sahf, movbe, aes, pclmul, + sse4a, cx16, sahf, movbe, aes, sha, pclmul, popcnt, abm, lwp, fma, fma4, xop, bmi, bmi2, tbm, avx, avx2, sse4_2, sse4_1, lzcnt, rtm, hle, rdrnd, f16c, fsgsbase, rdseed, prfchw, adx, diff --git a/gcc/config/i386/emmintrin.h b/gcc/config/i386/emmintrin.h index c30f05657d6..08928fbfabd 100644 --- a/gcc/config/i386/emmintrin.h +++ b/gcc/config/i386/emmintrin.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2003-2013 Free Software Foundation, Inc. +/* Copyright (C) 2003-2014 Free Software Foundation, Inc. This file is part of GCC. diff --git a/gcc/config/i386/f16cintrin.h b/gcc/config/i386/f16cintrin.h index 76f35fa1eac..229f4e3bd0a 100644 --- a/gcc/config/i386/f16cintrin.h +++ b/gcc/config/i386/f16cintrin.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2013 Free Software Foundation, Inc. +/* Copyright (C) 2011-2014 Free Software Foundation, Inc. This file is part of GCC. diff --git a/gcc/config/i386/fma4intrin.h b/gcc/config/i386/fma4intrin.h index e615f3e7ba0..e1bdef7b571 100644 --- a/gcc/config/i386/fma4intrin.h +++ b/gcc/config/i386/fma4intrin.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2013 Free Software Foundation, Inc. +/* Copyright (C) 2007-2014 Free Software Foundation, Inc. This file is part of GCC. diff --git a/gcc/config/i386/fmaintrin.h b/gcc/config/i386/fmaintrin.h index 97de93fd146..bfbb75d5988 100644 --- a/gcc/config/i386/fmaintrin.h +++ b/gcc/config/i386/fmaintrin.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2013 Free Software Foundation, Inc. +/* Copyright (C) 2011-2014 Free Software Foundation, Inc. This file is part of GCC. diff --git a/gcc/config/i386/freebsd.h b/gcc/config/i386/freebsd.h index b5ed55bbd63..bdca1b80b27 100644 --- a/gcc/config/i386/freebsd.h +++ b/gcc/config/i386/freebsd.h @@ -1,5 +1,5 @@ /* Definitions for Intel 386 running FreeBSD with ELF format - Copyright (C) 1996-2013 Free Software Foundation, Inc. + Copyright (C) 1996-2014 Free Software Foundation, Inc. Contributed by Eric Youngdale. Modified for stabs-in-ELF by H.J. Lu. Adapted from GNU/Linux version by John Polstra. diff --git a/gcc/config/i386/freebsd64.h b/gcc/config/i386/freebsd64.h index b032b385419..89430c43210 100644 --- a/gcc/config/i386/freebsd64.h +++ b/gcc/config/i386/freebsd64.h @@ -1,5 +1,5 @@ /* Definitions for AMD x86-64 running FreeBSD with ELF format - Copyright (C) 2002-2013 Free Software Foundation, Inc. + Copyright (C) 2002-2014 Free Software Foundation, Inc. Contributed by David O'Brien <obrien@FreeBSD.org> This file is part of GCC. diff --git a/gcc/config/i386/fxsrintrin.h b/gcc/config/i386/fxsrintrin.h index 41d4085b010..98e73ee2742 100644 --- a/gcc/config/i386/fxsrintrin.h +++ b/gcc/config/i386/fxsrintrin.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2012-2013 Free Software Foundation, Inc. +/* Copyright (C) 2012-2014 Free Software Foundation, Inc. This file is part of GCC. diff --git a/gcc/config/i386/gas.h b/gcc/config/i386/gas.h index 66519321dcc..edefc9aad04 100644 --- a/gcc/config/i386/gas.h +++ b/gcc/config/i386/gas.h @@ -1,5 +1,5 @@ /* Definitions for Intel 386 using GAS. - Copyright (C) 1988-2013 Free Software Foundation, Inc. 
+ Copyright (C) 1988-2014 Free Software Foundation, Inc. This file is part of GCC. diff --git a/gcc/config/i386/geode.md b/gcc/config/i386/geode.md index 9daee62bb9b..7ca39ae35f4 100644 --- a/gcc/config/i386/geode.md +++ b/gcc/config/i386/geode.md @@ -1,5 +1,5 @@ ;; Geode Scheduling -;; Copyright (C) 2006-2013 Free Software Foundation, Inc. +;; Copyright (C) 2006-2014 Free Software Foundation, Inc. ;; ;; This file is part of GCC. ;; diff --git a/gcc/config/i386/gmm_malloc.h b/gcc/config/i386/gmm_malloc.h index 77855fe3abd..516b13b9c5e 100644 --- a/gcc/config/i386/gmm_malloc.h +++ b/gcc/config/i386/gmm_malloc.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2004-2013 Free Software Foundation, Inc. +/* Copyright (C) 2004-2014 Free Software Foundation, Inc. This file is part of GCC. diff --git a/gcc/config/i386/gnu-user-common.h b/gcc/config/i386/gnu-user-common.h index e28483dd1dd..b34528217f1 100644 --- a/gcc/config/i386/gnu-user-common.h +++ b/gcc/config/i386/gnu-user-common.h @@ -1,5 +1,5 @@ /* Common definitions for Intel 386 and AMD x86-64 systems using - GNU userspace. Copyright (C) 2012-2013 Free Software Foundation, Inc. + GNU userspace. Copyright (C) 2012-2014 Free Software Foundation, Inc. Contributed by Ilya Enkovich. This file is part of GCC. diff --git a/gcc/config/i386/gnu-user.h b/gcc/config/i386/gnu-user.h index c93d9757d50..e1163c9dade 100644 --- a/gcc/config/i386/gnu-user.h +++ b/gcc/config/i386/gnu-user.h @@ -1,5 +1,5 @@ /* Definitions for Intel 386 systems using GNU userspace. - Copyright (C) 1994-2013 Free Software Foundation, Inc. + Copyright (C) 1994-2014 Free Software Foundation, Inc. Contributed by Eric Youngdale. Modified for stabs-in-ELF by H.J. Lu. diff --git a/gcc/config/i386/gnu-user64.h b/gcc/config/i386/gnu-user64.h index 952bb5b475c..8d3348368a7 100644 --- a/gcc/config/i386/gnu-user64.h +++ b/gcc/config/i386/gnu-user64.h @@ -1,5 +1,5 @@ /* Definitions for AMD x86-64 using GNU userspace. - Copyright (C) 2001-2013 Free Software Foundation, Inc. + Copyright (C) 2001-2014 Free Software Foundation, Inc. Contributed by Jan Hubicka <jh@suse.cz>, based on linux.h. This file is part of GCC. diff --git a/gcc/config/i386/gnu.h b/gcc/config/i386/gnu.h index 4a91c843685..29896e9af57 100644 --- a/gcc/config/i386/gnu.h +++ b/gcc/config/i386/gnu.h @@ -1,7 +1,7 @@ /* Configuration for an i386 running GNU with ELF as the target machine. */ /* -Copyright (C) 1994-2013 Free Software Foundation, Inc. +Copyright (C) 1994-2014 Free Software Foundation, Inc. This file is part of GCC. diff --git a/gcc/config/i386/host-cygwin.c b/gcc/config/i386/host-cygwin.c index 7a8152a71e4..0bc04bb5e62 100644 --- a/gcc/config/i386/host-cygwin.c +++ b/gcc/config/i386/host-cygwin.c @@ -1,5 +1,5 @@ /* Cygwin host-specific hook definitions. - Copyright (C) 2004-2013 Free Software Foundation, Inc. + Copyright (C) 2004-2014 Free Software Foundation, Inc. This file is part of GCC. diff --git a/gcc/config/i386/host-i386-darwin.c b/gcc/config/i386/host-i386-darwin.c index 44f7db1f6df..9325e8dd34a 100644 --- a/gcc/config/i386/host-i386-darwin.c +++ b/gcc/config/i386/host-i386-darwin.c @@ -1,5 +1,5 @@ /* i386-darwin host-specific hook definitions. - Copyright (C) 2003-2013 Free Software Foundation, Inc. + Copyright (C) 2003-2014 Free Software Foundation, Inc. This file is part of GCC. 
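The driver-i386.c hunks above also rework the fallback used when an unknown family-6 model is encountered: the guess is now driven purely by feature bits and lands on the new architecture names. A condensed, illustrative restatement of that decision chain follows; guess_intel_family6 is not real GCC code, and the older fallbacks of the actual function are omitted.

#include <stddef.h>

/* Sketch of the unknown family-6 fallback in host_detect_local_cpu:
   newer feature bits imply newer -march names.  */
static const char *
guess_intel_family6 (int has_adx, int has_avx2, int has_avx,
		     int has_sse4_2, int has_ssse3, int has_movbe)
{
  if (has_adx)
    return "broadwell";			/* ADX first appears in Broadwell.  */
  if (has_avx2)
    return "haswell";
  if (has_avx)
    return "sandybridge";
  if (has_sse4_2)
    return has_movbe ? "silvermont" : "nehalem";
  if (has_ssse3)
    return has_movbe ? "bonnell" : "core2";
  return NULL;				/* Older fallbacks not shown here.  */
}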
diff --git a/gcc/config/i386/host-mingw32.c b/gcc/config/i386/host-mingw32.c index 6773fad40ab..fc01ceb243e 100644 --- a/gcc/config/i386/host-mingw32.c +++ b/gcc/config/i386/host-mingw32.c @@ -1,5 +1,5 @@ /* mingw32 host-specific hook definitions. - Copyright (C) 2004-2013 Free Software Foundation, Inc. + Copyright (C) 2004-2014 Free Software Foundation, Inc. This file is part of GCC. diff --git a/gcc/config/i386/i386-builtin-types.awk b/gcc/config/i386/i386-builtin-types.awk index 5e3b315f59e..3fc1455ece4 100644 --- a/gcc/config/i386/i386-builtin-types.awk +++ b/gcc/config/i386/i386-builtin-types.awk @@ -1,4 +1,4 @@ -# Copyright (C) 2009-2013 Free Software Foundation, Inc. +# Copyright (C) 2009-2014 Free Software Foundation, Inc. # # This program is free software; you can redistribute it and/or modify it # under the terms of the GNU General Public License as published by the diff --git a/gcc/config/i386/i386-builtin-types.def b/gcc/config/i386/i386-builtin-types.def index c866170bde8..d19ca84ae2f 100644 --- a/gcc/config/i386/i386-builtin-types.def +++ b/gcc/config/i386/i386-builtin-types.def @@ -99,6 +99,15 @@ DEF_VECTOR_TYPE (V16HI, HI) DEF_VECTOR_TYPE (V32QI, QI) DEF_VECTOR_TYPE (V4UDI, UDI, V4DI) DEF_VECTOR_TYPE (V8USI, USI, V8SI) +DEF_VECTOR_TYPE (V16UHI, UHI, V16HI) + +# AVX512F vectors +DEF_VECTOR_TYPE (V32SF, FLOAT) +DEF_VECTOR_TYPE (V16SF, FLOAT) +DEF_VECTOR_TYPE (V8DF, DOUBLE) +DEF_VECTOR_TYPE (V8DI, DI) +DEF_VECTOR_TYPE (V16SI, SI) +DEF_VECTOR_TYPE (V64QI, QI) DEF_POINTER_TYPE (PCCHAR, CHAR, CONST) DEF_POINTER_TYPE (PCDOUBLE, DOUBLE, CONST) @@ -123,21 +132,29 @@ DEF_POINTER_TYPE (PV2SF, V2SF) DEF_POINTER_TYPE (PV4DF, V4DF) DEF_POINTER_TYPE (PV4DI, V4DI) DEF_POINTER_TYPE (PV4SF, V4SF) +DEF_POINTER_TYPE (PV8DF, V8DF) DEF_POINTER_TYPE (PV8SF, V8SF) DEF_POINTER_TYPE (PV4SI, V4SI) DEF_POINTER_TYPE (PV8SI, V8SI) +DEF_POINTER_TYPE (PV8DI, V8DI) +DEF_POINTER_TYPE (PV16SI, V16SI) +DEF_POINTER_TYPE (PV16SF, V16SF) DEF_POINTER_TYPE (PCV2SI, V2SI, CONST) DEF_POINTER_TYPE (PCV2DF, V2DF, CONST) DEF_POINTER_TYPE (PCV2SF, V2SF, CONST) DEF_POINTER_TYPE (PCV4DF, V4DF, CONST) DEF_POINTER_TYPE (PCV4SF, V4SF, CONST) +DEF_POINTER_TYPE (PCV8DF, V8DF, CONST) DEF_POINTER_TYPE (PCV8SF, V8SF, CONST) +DEF_POINTER_TYPE (PCV16SF, V16SF, CONST) DEF_POINTER_TYPE (PCV2DI, V2DI, CONST) DEF_POINTER_TYPE (PCV4SI, V4SI, CONST) DEF_POINTER_TYPE (PCV4DI, V4DI, CONST) DEF_POINTER_TYPE (PCV8SI, V8SI, CONST) +DEF_POINTER_TYPE (PCV8DI, V8DI, CONST) +DEF_POINTER_TYPE (PCV16SI, V16SI, CONST) DEF_FUNCTION_TYPE (FLOAT128) DEF_FUNCTION_TYPE (UINT64) @@ -165,6 +182,7 @@ DEF_FUNCTION_TYPE (UINT16, UINT16) DEF_FUNCTION_TYPE (UINT64, PUNSIGNED) DEF_FUNCTION_TYPE (V16QI, PCCHAR) DEF_FUNCTION_TYPE (V16QI, V16QI) +DEF_FUNCTION_TYPE (V16QI, V16SI) DEF_FUNCTION_TYPE (V2DF, PCDOUBLE) DEF_FUNCTION_TYPE (V2DF, V2DF) DEF_FUNCTION_TYPE (V2DF, V2SI) @@ -190,6 +208,8 @@ DEF_FUNCTION_TYPE (V4DF, V2DF) DEF_FUNCTION_TYPE (V4DF, V4DF) DEF_FUNCTION_TYPE (V4DF, V4SF) DEF_FUNCTION_TYPE (V4DF, V4SI) +DEF_FUNCTION_TYPE (V8DF, V8SI) +DEF_FUNCTION_TYPE (V8DF, V8DF) DEF_FUNCTION_TYPE (V4HI, V4HI) DEF_FUNCTION_TYPE (V4SF, PCFLOAT) DEF_FUNCTION_TYPE (V4SF, V2DF) @@ -207,6 +227,7 @@ DEF_FUNCTION_TYPE (V4SI, V4SI) DEF_FUNCTION_TYPE (V4SI, V8HI) DEF_FUNCTION_TYPE (V4SI, V8SI) DEF_FUNCTION_TYPE (V8HI, V16QI) +DEF_FUNCTION_TYPE (V8HI, V8DI) DEF_FUNCTION_TYPE (V8HI, V8HI) DEF_FUNCTION_TYPE (V8QI, V8QI) DEF_FUNCTION_TYPE (V8SF, PCFLOAT) @@ -216,10 +237,15 @@ DEF_FUNCTION_TYPE (V8SF, V4SF) DEF_FUNCTION_TYPE (V8SF, V8SF) DEF_FUNCTION_TYPE (V8SF, V8SI) 
DEF_FUNCTION_TYPE (V8SF, V8HI) +DEF_FUNCTION_TYPE (V16SF, V16SF) +DEF_FUNCTION_TYPE (V8SI, V8DI) DEF_FUNCTION_TYPE (V8SI, V4SI) +DEF_FUNCTION_TYPE (V8SF, V8DF) +DEF_FUNCTION_TYPE (V8SF, V8DF, V8SF, QI) DEF_FUNCTION_TYPE (V8SI, V8SF) DEF_FUNCTION_TYPE (V32QI, V32QI) DEF_FUNCTION_TYPE (V32QI, V16QI) +DEF_FUNCTION_TYPE (V16HI, V16SI) DEF_FUNCTION_TYPE (V16HI, V16HI) DEF_FUNCTION_TYPE (V16HI, V8HI) DEF_FUNCTION_TYPE (V8SI, V8SI) @@ -239,6 +265,28 @@ DEF_FUNCTION_TYPE (V4DI, V8HI) DEF_FUNCTION_TYPE (V4DI, V4SI) DEF_FUNCTION_TYPE (V4DI, PV4DI) DEF_FUNCTION_TYPE (V4DI, V2DI) +DEF_FUNCTION_TYPE (V16SF, FLOAT) +DEF_FUNCTION_TYPE (V16SI, INT) +DEF_FUNCTION_TYPE (V8DF, DOUBLE) +DEF_FUNCTION_TYPE (V8DI, INT64) +DEF_FUNCTION_TYPE (V16SF, V4SF) +DEF_FUNCTION_TYPE (V8DF, V4DF) +DEF_FUNCTION_TYPE (V8DI, V4DI) +DEF_FUNCTION_TYPE (V16QI, V8DI) +DEF_FUNCTION_TYPE (UINT, V4SF) +DEF_FUNCTION_TYPE (UINT64, V4SF) +DEF_FUNCTION_TYPE (UINT, V2DF) +DEF_FUNCTION_TYPE (UINT64, V2DF) +DEF_FUNCTION_TYPE (V16SI, V16SI) +DEF_FUNCTION_TYPE (V16SI, V16SI, V16SI, HI) +DEF_FUNCTION_TYPE (V8DI, V8DI) +DEF_FUNCTION_TYPE (V8DI, V8DI, V8DI, QI) +DEF_FUNCTION_TYPE (V16SI, PV4SI) +DEF_FUNCTION_TYPE (V16SF, PV4SF) +DEF_FUNCTION_TYPE (V8DI, PV4DI) +DEF_FUNCTION_TYPE (V8DF, PV4DF) +DEF_FUNCTION_TYPE (V8UHI, V8UHI) +DEF_FUNCTION_TYPE (V8USI, V8USI) DEF_FUNCTION_TYPE (DI, V2DI, INT) DEF_FUNCTION_TYPE (DOUBLE, V2DF, INT) @@ -270,6 +318,8 @@ DEF_FUNCTION_TYPE (V1DI, V1DI, V1DI) DEF_FUNCTION_TYPE (V1DI, V2SI, V2SI) DEF_FUNCTION_TYPE (V1DI, V8QI, V8QI) DEF_FUNCTION_TYPE (V2DF, PCV2DF, V2DI) +DEF_FUNCTION_TYPE (V2DF, V2DF, UINT) +DEF_FUNCTION_TYPE (V2DF, V2DF, UINT64) DEF_FUNCTION_TYPE (V2DF, V2DF, DI) DEF_FUNCTION_TYPE (V2DF, V2DF, INT) DEF_FUNCTION_TYPE (V2DF, V2DF, PCDOUBLE) @@ -295,16 +345,23 @@ DEF_FUNCTION_TYPE (V2SI, V2SI, V2SI) DEF_FUNCTION_TYPE (V2SI, V4HI, V4HI) DEF_FUNCTION_TYPE (V4DF, PCV4DF, V4DI) DEF_FUNCTION_TYPE (V4DF, V4DF, INT) +DEF_FUNCTION_TYPE (V4DF, V8DF, INT) +DEF_FUNCTION_TYPE (V4DF, V8DF, INT, V4DF, QI) DEF_FUNCTION_TYPE (V4DF, V4DF, V4DF) DEF_FUNCTION_TYPE (V4DF, V4DF, V4DI) +DEF_FUNCTION_TYPE (V8DF, V8DF, V8DI) DEF_FUNCTION_TYPE (V4HI, V2SI, V2SI) DEF_FUNCTION_TYPE (V4HI, V4HI, INT) DEF_FUNCTION_TYPE (V4HI, V4HI, SI) DEF_FUNCTION_TYPE (V4HI, V4HI, V4HI) DEF_FUNCTION_TYPE (V4HI, V8QI, V8QI) DEF_FUNCTION_TYPE (V4SF, PCV4SF, V4SI) +DEF_FUNCTION_TYPE (V4SF, V4SF, UINT) +DEF_FUNCTION_TYPE (V4SF, V4SF, UINT64) DEF_FUNCTION_TYPE (V4SF, V4SF, DI) DEF_FUNCTION_TYPE (V4SF, V4SF, INT) +DEF_FUNCTION_TYPE (INT, V4SF, V4SF, INT, INT) +DEF_FUNCTION_TYPE (INT, V2DF, V2DF, INT, INT) DEF_FUNCTION_TYPE (V4SF, V4SF, PCV2SF) DEF_FUNCTION_TYPE (V4SF, V4SF, SI) DEF_FUNCTION_TYPE (V4SF, V4SF, V2DF) @@ -331,30 +388,75 @@ DEF_FUNCTION_TYPE (V8QI, V4HI, V4HI) DEF_FUNCTION_TYPE (V8QI, V8QI, V8QI) DEF_FUNCTION_TYPE (V8SF, PCV8SF, V8SI) DEF_FUNCTION_TYPE (V8SF, V8SF, INT) +DEF_FUNCTION_TYPE (V16SF, V16SF, INT) +DEF_FUNCTION_TYPE (V4SF, V16SF, INT) +DEF_FUNCTION_TYPE (V4SF, V16SF, INT, V4SF, QI) DEF_FUNCTION_TYPE (V8SF, V8SF, V8SF) +DEF_FUNCTION_TYPE (V16SF, V16SF, V16SF) DEF_FUNCTION_TYPE (V8SF, V8SF, V8SI) +DEF_FUNCTION_TYPE (V16SF, V16SF, V16SI) DEF_FUNCTION_TYPE (V32QI, V16HI, V16HI) DEF_FUNCTION_TYPE (V16HI, V8SI, V8SI) +DEF_FUNCTION_TYPE (V8DF, V8DF, V4DF, INT, V8DF, QI) +DEF_FUNCTION_TYPE (V8DF, V8DF, V8DF, INT, V8DF, QI) +DEF_FUNCTION_TYPE (V8DF, V8DF, INT, V8DF, QI) +DEF_FUNCTION_TYPE (V8DF, V8DF, V8DF, V8DI, INT, QI, INT) +DEF_FUNCTION_TYPE (V8DF, V8DF, V8DF) +DEF_FUNCTION_TYPE (V16SF, V16SF, V16SF, INT) +DEF_FUNCTION_TYPE (V16SF, 
V16SF, V16SF, INT, V16SF, HI) +DEF_FUNCTION_TYPE (V16SF, V16SF, INT, V16SF, HI) +DEF_FUNCTION_TYPE (V16SI, V16SI, V4SI, INT, V16SI, HI) +DEF_FUNCTION_TYPE (V16SF, V16SF, V16SF, V16SI, INT) +DEF_FUNCTION_TYPE (V16SF, V16SF, V16SF, V16SI, INT, HI) +DEF_FUNCTION_TYPE (V16SF, V16SF, V16SF, V16SI, INT, HI, INT) +DEF_FUNCTION_TYPE (V4SF, V4SF, V4SF, V4SI, INT, QI) +DEF_FUNCTION_TYPE (V4SF, V4SF, V4SF, V4SI, INT, QI, INT) +DEF_FUNCTION_TYPE (V2DF, V2DF, V2DF, V2DI, INT, QI) +DEF_FUNCTION_TYPE (V2DF, V2DF, V2DF, V2DI, INT, QI, INT) +DEF_FUNCTION_TYPE (V16SF, V16SF, V4SF, INT) +DEF_FUNCTION_TYPE (V16SF, V16SF, V4SF, INT, V16SF, HI) DEF_FUNCTION_TYPE (V32QI, V32QI, V32QI) DEF_FUNCTION_TYPE (V16HI, V32QI, V32QI) DEF_FUNCTION_TYPE (V16HI, V16HI, V8HI) DEF_FUNCTION_TYPE (V16HI, V16HI, V16HI) DEF_FUNCTION_TYPE (V16HI, V16HI, INT) +DEF_FUNCTION_TYPE (V16HI, V16SF, INT) +DEF_FUNCTION_TYPE (V16HI, V16SF, INT, V16HI, HI) +DEF_FUNCTION_TYPE (V16HI, V16HI, V16HI, INT, V16HI, HI) DEF_FUNCTION_TYPE (V16HI, V16HI, SI) DEF_FUNCTION_TYPE (V16HI, V16HI, V16HI, INT) DEF_FUNCTION_TYPE (V32QI, V32QI, V32QI, INT) DEF_FUNCTION_TYPE (V8SI, V4DF, V4DF) DEF_FUNCTION_TYPE (V8SI, V8SI, V4SI) +DEF_FUNCTION_TYPE (V16SI, V16SI, V4SI) +DEF_FUNCTION_TYPE (V16SI, V16SI, V4SI, INT) +DEF_FUNCTION_TYPE (V4SI, V16SI, INT) +DEF_FUNCTION_TYPE (V4SI, V16SI, INT, V4SI, QI) DEF_FUNCTION_TYPE (V8SI, V8SI, V8SI) +DEF_FUNCTION_TYPE (V16SI, V16SI, V16SI) +DEF_FUNCTION_TYPE (V16SI, V16SI, V16SI, INT, V16SI, HI) DEF_FUNCTION_TYPE (V8SI, V16HI, V16HI) DEF_FUNCTION_TYPE (V8SI, V8SI, INT) DEF_FUNCTION_TYPE (V8SI, V8SI, SI) +DEF_FUNCTION_TYPE (V16SI, V16SI, SI) +DEF_FUNCTION_TYPE (V16SI, V16SI, INT) +DEF_FUNCTION_TYPE (V16SI, V16SI, V4SI, V16SI, HI) +DEF_FUNCTION_TYPE (V16SI, V16SI, INT, V16SI, HI) DEF_FUNCTION_TYPE (V8SI, PCV8SI, V8SI) DEF_FUNCTION_TYPE (V4DI, V4DI, V4DI) +DEF_FUNCTION_TYPE (V8DI, V8DI, V8DI) +DEF_FUNCTION_TYPE (V16SI, V8DF, V8DF) +DEF_FUNCTION_TYPE (V8DI, V8DI, V8DI, INT, V8DI, QI) +DEF_FUNCTION_TYPE (V8DI, V8DI, V4DI, INT, V8DI, QI) DEF_FUNCTION_TYPE (V4DI, V8SI, V8SI) DEF_FUNCTION_TYPE (V4UDI, V8USI, V8USI) DEF_FUNCTION_TYPE (V4DI, V4DI, V2DI) +DEF_FUNCTION_TYPE (V8DI, V8DI, V2DI) DEF_FUNCTION_TYPE (V4DI, PCV4DI, V4DI) +DEF_FUNCTION_TYPE (V4DI, V8DI, INT) +DEF_FUNCTION_TYPE (V4DI, V8DI, INT, V4DI, QI) +DEF_FUNCTION_TYPE (V8DI, V8DI, V2DI, V8DI, QI) +DEF_FUNCTION_TYPE (V8DI, V8DI, INT, V8DI, QI) DEF_FUNCTION_TYPE (V4DI, V4DI, INT) DEF_FUNCTION_TYPE (V2DI, V4DI, INT) DEF_FUNCTION_TYPE (VOID, PVOID, INT64) @@ -362,8 +464,10 @@ DEF_FUNCTION_TYPE (VOID, PCHAR, V16QI) DEF_FUNCTION_TYPE (VOID, PCHAR, V32QI) DEF_FUNCTION_TYPE (VOID, PDOUBLE, V2DF) DEF_FUNCTION_TYPE (VOID, PDOUBLE, V4DF) +DEF_FUNCTION_TYPE (VOID, PDOUBLE, V8DF) DEF_FUNCTION_TYPE (VOID, PFLOAT, V4SF) DEF_FUNCTION_TYPE (VOID, PFLOAT, V8SF) +DEF_FUNCTION_TYPE (VOID, PFLOAT, V16SF) DEF_FUNCTION_TYPE (VOID, PINT, INT) DEF_FUNCTION_TYPE (VOID, PLONGLONG, LONGLONG) DEF_FUNCTION_TYPE (VOID, PULONGLONG, ULONGLONG) @@ -374,6 +478,34 @@ DEF_FUNCTION_TYPE (VOID, PV4DI, V4DI) DEF_FUNCTION_TYPE (VOID, PV4SF, V4SF) DEF_FUNCTION_TYPE (VOID, PV8SF, V8SF) DEF_FUNCTION_TYPE (VOID, UNSIGNED, UNSIGNED) +DEF_FUNCTION_TYPE (VOID, PV8DI, V8DI) + +# Instructions returning mask +DEF_FUNCTION_TYPE (HI, HI) +DEF_FUNCTION_TYPE (HI, HI, HI) +DEF_FUNCTION_TYPE (HI, HI, INT) +DEF_FUNCTION_TYPE (QI, V8DI, V8DI) +DEF_FUNCTION_TYPE (QI, V8DI, V8DI, QI) +DEF_FUNCTION_TYPE (HI, V16SI, V16SI) +DEF_FUNCTION_TYPE (HI, V16SI, V16SI, HI) +DEF_FUNCTION_TYPE (QI, V8DI, V8DI, INT) +DEF_FUNCTION_TYPE (QI, V8DI, 
V8DI, INT, QI) +DEF_FUNCTION_TYPE (HI, V16SI, V16SI, INT) +DEF_FUNCTION_TYPE (HI, V16SI, V16SI, INT ,HI) +DEF_FUNCTION_TYPE (QI, V8DF, V8DF, INT) +DEF_FUNCTION_TYPE (QI, V8DF, V8DF, INT, QI) +DEF_FUNCTION_TYPE (QI, V8DF, V8DF, INT, QI, INT) +DEF_FUNCTION_TYPE (HI, V16SF, V16SF, INT) +DEF_FUNCTION_TYPE (HI, V16SF, V16SF, INT, HI) +DEF_FUNCTION_TYPE (HI, V16SF, V16SF, INT, HI, INT) +DEF_FUNCTION_TYPE (QI, V2DF, V2DF, INT) +DEF_FUNCTION_TYPE (QI, V2DF, V2DF, INT, QI) +DEF_FUNCTION_TYPE (QI, V2DF, V2DF, INT, QI, INT) +DEF_FUNCTION_TYPE (QI, V4SF, V4SF, INT) +DEF_FUNCTION_TYPE (QI, V4SF, V4SF, INT, QI) +DEF_FUNCTION_TYPE (QI, V4SF, V4SF, INT, QI, INT) +DEF_FUNCTION_TYPE (V16SI, HI) +DEF_FUNCTION_TYPE (V8DI, QI) DEF_FUNCTION_TYPE (INT, V16QI, V16QI, INT) DEF_FUNCTION_TYPE (UCHAR, UINT, UINT, UINT) @@ -384,6 +516,7 @@ DEF_FUNCTION_TYPE (V16QI, V16QI, V16QI, INT) DEF_FUNCTION_TYPE (V16QI, V16QI, V16QI, V16QI) DEF_FUNCTION_TYPE (V1DI, V1DI, V1DI, INT) DEF_FUNCTION_TYPE (V2DF, V2DF, V2DF, INT) +DEF_FUNCTION_TYPE (V2DF, V2DF, V2DF, INT, INT) DEF_FUNCTION_TYPE (V2DF, V2DF, V2DF, V2DF) DEF_FUNCTION_TYPE (V2DF, V2DF, V2DF, V2DI, INT) DEF_FUNCTION_TYPE (V2DI, V2DI, DI, INT) @@ -399,6 +532,9 @@ DEF_FUNCTION_TYPE (V4DI, V4DI, V4DI, V4DI) DEF_FUNCTION_TYPE (V4HI, V4HI, HI, INT) DEF_FUNCTION_TYPE (V4SF, V4SF, FLOAT, INT) DEF_FUNCTION_TYPE (V4SF, V4SF, V4SF, INT) +DEF_FUNCTION_TYPE (V4SF, V4SF, V4SF, INT, INT) +DEF_FUNCTION_TYPE (V4SF, V4SF, V2DF, INT) +DEF_FUNCTION_TYPE (V2DF, V2DF, V4SF, INT) DEF_FUNCTION_TYPE (V4SF, V4SF, V4SF, V4SF) DEF_FUNCTION_TYPE (V4SF, V4SF, V4SF, V4SI, INT) DEF_FUNCTION_TYPE (V4SI, V4SI, SI, INT) @@ -413,11 +549,69 @@ DEF_FUNCTION_TYPE (V8SF, V8SF, V4SF, INT) DEF_FUNCTION_TYPE (V8SF, V8SF, V8SF, INT) DEF_FUNCTION_TYPE (V8SF, V8SF, V8SF, V8SF) DEF_FUNCTION_TYPE (V8SF, V8SF, V8SF, V8SI, INT) +DEF_FUNCTION_TYPE (V8DF, V8DF, V8DF, V8DF) +DEF_FUNCTION_TYPE (V16SF, V16SF, V16SF, V16SF) DEF_FUNCTION_TYPE (V8SI, V8SI, V4SI, INT) DEF_FUNCTION_TYPE (V8SI, V8SI, V8SI, INT) DEF_FUNCTION_TYPE (V8SI, V8SI, V8SI, V8SI) DEF_FUNCTION_TYPE (V4DI, V4DI, V4DI, INT) DEF_FUNCTION_TYPE (V4DI, V4DI, V2DI, INT) + +# Instructions with masking +DEF_FUNCTION_TYPE (V8DF, V8DF, V8DF, QI) +DEF_FUNCTION_TYPE (V8DF, V8SF, V8DF, QI) +DEF_FUNCTION_TYPE (V8DF, V8SI, V8DF, QI) +DEF_FUNCTION_TYPE (V8DI, V8SI, V8DI, QI) +DEF_FUNCTION_TYPE (V8DI, V8HI, V8DI, QI) +DEF_FUNCTION_TYPE (V8DI, V16QI, V8DI, QI) +DEF_FUNCTION_TYPE (V8DI, V8DI, V8DI, V8DI, QI) +DEF_FUNCTION_TYPE (V8DF, V8DI, V8DF, V8DF) +DEF_FUNCTION_TYPE (V8DF, V8DI, V8DF, V8DF, QI) +DEF_FUNCTION_TYPE (V8DF, V8DF, V8DI, V8DF, QI) +DEF_FUNCTION_TYPE (V8DF, V8DF, V8DF, V8DF, QI) +DEF_FUNCTION_TYPE (V16SI, V16SI, V16SI, V16SI, HI) +DEF_FUNCTION_TYPE (V2DF, V2DF, V2DF, V2DF, QI) +DEF_FUNCTION_TYPE (V2DF, V2DF, V4SF, V2DF, QI) +DEF_FUNCTION_TYPE (V16SF, V16SF, V16SF, HI) +DEF_FUNCTION_TYPE (V16SF, V16SI, V16SF, HI) +DEF_FUNCTION_TYPE (V16SF, V16SF, V16SF, V16SF, HI) +DEF_FUNCTION_TYPE (V16SF, V16SI, V16SF, V16SF) +DEF_FUNCTION_TYPE (V16SF, V16SI, V16SF, V16SF, HI) +DEF_FUNCTION_TYPE (V16SF, V16SF, V16SI, V16SF, HI) +DEF_FUNCTION_TYPE (V4SF, V4SF, V2DF, V4SF, QI) +DEF_FUNCTION_TYPE (V4SF, V4SF, V4SF, V4SF, QI) +DEF_FUNCTION_TYPE (V16SF, V4SF, V16SF, HI) +DEF_FUNCTION_TYPE (V8DF, V4DF, V8DF, QI) +DEF_FUNCTION_TYPE (V8DF, V2DF, V8DF, QI) +DEF_FUNCTION_TYPE (V16SI, V4SI, V16SI, HI) +DEF_FUNCTION_TYPE (V16SI, SI, V16SI, HI) +DEF_FUNCTION_TYPE (V16SI, V16HI, V16SI, HI) +DEF_FUNCTION_TYPE (V16SI, V16QI, V16SI, HI) +DEF_FUNCTION_TYPE (V8SI, V8DF, V8SI, QI) +DEF_FUNCTION_TYPE 
(V8DI, V4DI, V8DI, QI) +DEF_FUNCTION_TYPE (V8DI, V2DI, V8DI, QI) +DEF_FUNCTION_TYPE (V8DI, DI, V8DI, QI) +DEF_FUNCTION_TYPE (V16SF, PCV16SF, V16SF, HI) +DEF_FUNCTION_TYPE (V8DF, PCV8DF, V8DF, QI) +DEF_FUNCTION_TYPE (V16SI, PCV16SI, V16SI, HI) +DEF_FUNCTION_TYPE (V8DI, PCV8DI, V8DI, QI) +DEF_FUNCTION_TYPE (V2DF, PCDOUBLE, V2DF, QI) +DEF_FUNCTION_TYPE (V4SF, PCFLOAT, V4SF, QI) +DEF_FUNCTION_TYPE (V16QI, V16SI, V16QI, HI) +DEF_FUNCTION_TYPE (V16HI, V16SI, V16HI, HI) +DEF_FUNCTION_TYPE (V8SI, V8DI, V8SI, QI) +DEF_FUNCTION_TYPE (V8HI, V8DI, V8HI, QI) +DEF_FUNCTION_TYPE (V16QI, V8DI, V16QI, QI) +DEF_FUNCTION_TYPE (VOID, PV8DF, V8DF, QI) +DEF_FUNCTION_TYPE (VOID, PV16SF, V16SF, HI) +DEF_FUNCTION_TYPE (VOID, PV8DI, V8DI, QI) +DEF_FUNCTION_TYPE (VOID, PV16SI, V16SI, HI) +DEF_FUNCTION_TYPE (VOID, PDOUBLE, V2DF, QI) +DEF_FUNCTION_TYPE (VOID, PFLOAT, V4SF, QI) +DEF_FUNCTION_TYPE (V16SI, V16SF, V16SI, HI) +DEF_FUNCTION_TYPE (V8DI, V8DI, V8DI, V8DI, INT, QI) +DEF_FUNCTION_TYPE (V16SI, V16SI, V16SI, V16SI, INT, HI) + DEF_FUNCTION_TYPE (VOID, PCVOID, UNSIGNED, UNSIGNED) DEF_FUNCTION_TYPE (VOID, PV2DF, V2DI, V2DF) DEF_FUNCTION_TYPE (VOID, PV4DF, V4DI, V4DF) @@ -439,6 +633,13 @@ DEF_FUNCTION_TYPE (V8UHI, V8UHI, V8UHI, V8UHI) DEF_FUNCTION_TYPE (V16UQI, V16UQI, V16UQI, V16UQI) DEF_FUNCTION_TYPE (V4DF, V4DF, V4DF, V4DI) DEF_FUNCTION_TYPE (V8SF, V8SF, V8SF, V8SI) +DEF_FUNCTION_TYPE (V8DI, V8DI, V8DI, V8DI) +DEF_FUNCTION_TYPE (V16SI, V16SI, V16SI, V16SI) +DEF_FUNCTION_TYPE (V8DF, V8DF, V8DI, V8DF) +DEF_FUNCTION_TYPE (V16SF, V16SF, V16SI, V16SF) +DEF_FUNCTION_TYPE (V4SF, V4SF, V4SF, INT, V4SF, QI) +DEF_FUNCTION_TYPE (V2DF, V2DF, V2DF, INT, V2DF, QI) +DEF_FUNCTION_TYPE (V8DI, V16SI, V16SI, V8DI, QI) DEF_FUNCTION_TYPE (V2DI, V2DI, V2DI, UINT, UINT) DEF_FUNCTION_TYPE (V4HI, HI, HI, HI, HI) @@ -451,6 +652,44 @@ DEF_FUNCTION_TYPE (V8QI, QI, QI, QI, QI, QI, QI, QI, QI) DEF_FUNCTION_TYPE (UCHAR, UCHAR, UINT, UINT, PUNSIGNED) DEF_FUNCTION_TYPE (UCHAR, UCHAR, ULONGLONG, ULONGLONG, PULONGLONG) +# Instructions with rounding +DEF_FUNCTION_TYPE (UINT64, V2DF, INT) +DEF_FUNCTION_TYPE (UINT64, V4SF, INT) +DEF_FUNCTION_TYPE (UINT, V2DF, INT) +DEF_FUNCTION_TYPE (UINT, V4SF, INT) +DEF_FUNCTION_TYPE (INT64, V2DF, INT) +DEF_FUNCTION_TYPE (INT64, V4SF, INT) +DEF_FUNCTION_TYPE (INT, V2DF, INT) +DEF_FUNCTION_TYPE (INT, V4SF, INT) +DEF_FUNCTION_TYPE (V2DF, V2DF, UINT64, INT) +DEF_FUNCTION_TYPE (V4SF, V4SF, UINT64, INT) +DEF_FUNCTION_TYPE (V4SF, V4SF, UINT, INT) +DEF_FUNCTION_TYPE (V2DF, V2DF, INT64, INT) +DEF_FUNCTION_TYPE (V4SF, V4SF, INT64, INT) +DEF_FUNCTION_TYPE (V4SF, V4SF, INT, INT) +DEF_FUNCTION_TYPE (V16SI, V16SF, V16SI, HI, INT) +DEF_FUNCTION_TYPE (V16SF, V16SI, V16SF, HI, INT) +DEF_FUNCTION_TYPE (V16SF, V16SF, V16SF, HI, INT) +DEF_FUNCTION_TYPE (V16SF, V16HI, V16SF, HI, INT) +DEF_FUNCTION_TYPE (V8SI, V8DF, V8SI, QI, INT) +DEF_FUNCTION_TYPE (V8SF, V8DF, V8SF, QI, INT) +DEF_FUNCTION_TYPE (V8DF, V8DF, V8DF, QI, INT) +DEF_FUNCTION_TYPE (V8DF, V8SF, V8DF, QI, INT) +DEF_FUNCTION_TYPE (V16SF, V16SF, V16SF, V16SF, HI, INT) +DEF_FUNCTION_TYPE (V8DF, V8DF, V8DF, V8DF, QI, INT) +DEF_FUNCTION_TYPE (V4SF, V4SF, V4SF, V4SF, QI, INT) +DEF_FUNCTION_TYPE (V4SF, V4SF, V2DF, V4SF, QI, INT) +DEF_FUNCTION_TYPE (V2DF, V2DF, V2DF, V2DF, QI, INT) +DEF_FUNCTION_TYPE (V2DF, V2DF, V4SF, V2DF, QI, INT) +DEF_FUNCTION_TYPE (V2DF, V2DF, V2DF, V2DF, INT) +DEF_FUNCTION_TYPE (V4SF, V4SF, V4SF, V4SF, INT) + +DEF_FUNCTION_TYPE (V16SF, V16SF, INT, V16SF, HI, INT) +DEF_FUNCTION_TYPE (V8DF, V8DF, INT, V8DF, QI, INT) +DEF_FUNCTION_TYPE (V4SF, V4SF, V4SF, INT, V4SF, 
QI, INT) +DEF_FUNCTION_TYPE (V2DF, V2DF, V2DF, INT, V2DF, QI, INT) +DEF_FUNCTION_TYPE (V8DI, V8DI, SI, V8DI, V8DI) + DEF_FUNCTION_TYPE (V2DF, V2DF, PCDOUBLE, V4SI, V2DF, INT) DEF_FUNCTION_TYPE (V4DF, V4DF, PCDOUBLE, V4SI, V4DF, INT) DEF_FUNCTION_TYPE (V4DF, V4DF, PCDOUBLE, V8SI, V4DF, INT) @@ -472,6 +711,30 @@ DEF_FUNCTION_TYPE (V4SI, V4SI, PCINT, V2DI, V4SI, INT) DEF_FUNCTION_TYPE (V4SI, V4SI, PCINT, V4DI, V4SI, INT) DEF_FUNCTION_TYPE (V8SI, V8SI, PCINT, V4DI, V8SI, INT) +DEF_FUNCTION_TYPE (V16SF, V16SF, PCFLOAT, V16SI, HI, INT) +DEF_FUNCTION_TYPE (V16SF, V16SF, PCFLOAT, V8DI, HI, INT) +DEF_FUNCTION_TYPE (V8DF, V8DF, PCDOUBLE, V8SI, QI, INT) +DEF_FUNCTION_TYPE (V8DF, V8DF, PCDOUBLE, V16SI, QI, INT) +DEF_FUNCTION_TYPE (V8SF, V8SF, PCFLOAT, V8DI, QI, INT) +DEF_FUNCTION_TYPE (V8DF, V8DF, PCDOUBLE, V8DI, QI, INT) +DEF_FUNCTION_TYPE (V16SI, V16SI, PCINT, V16SI, HI, INT) +DEF_FUNCTION_TYPE (V16SI, V16SI, PCINT, V8DI, HI, INT) +DEF_FUNCTION_TYPE (V8DI, V8DI, PCINT64, V8SI, QI, INT) +DEF_FUNCTION_TYPE (V8DI, V8DI, PCINT64, V16SI, QI, INT) +DEF_FUNCTION_TYPE (V8SI, V8SI, PCINT, V8DI, QI, INT) +DEF_FUNCTION_TYPE (V8DI, V8DI, PCINT64, V8DI, QI, INT) +DEF_FUNCTION_TYPE (VOID, PFLOAT, HI, V16SI, V16SF, INT) +DEF_FUNCTION_TYPE (VOID, PDOUBLE, QI, V8SI, V8DF, INT) +DEF_FUNCTION_TYPE (VOID, PFLOAT, QI, V8DI, V8SF, INT) +DEF_FUNCTION_TYPE (VOID, PDOUBLE, QI, V8DI, V8DF, INT) +DEF_FUNCTION_TYPE (VOID, PINT, HI, V16SI, V16SI, INT) +DEF_FUNCTION_TYPE (VOID, PLONGLONG, QI, V8SI, V8DI, INT) +DEF_FUNCTION_TYPE (VOID, PINT, QI, V8DI, V8SI, INT) +DEF_FUNCTION_TYPE (VOID, PLONGLONG, QI, V8DI, V8DI, INT) + +DEF_FUNCTION_TYPE (VOID, HI, V16SI, PCINT, INT, INT) +DEF_FUNCTION_TYPE (VOID, QI, V8DI, PCINT, INT, INT) + DEF_FUNCTION_TYPE_ALIAS (V2DF_FTYPE_V2DF, ROUND) DEF_FUNCTION_TYPE_ALIAS (V4DF_FTYPE_V4DF, ROUND) DEF_FUNCTION_TYPE_ALIAS (V4SF_FTYPE_V4SF, ROUND) @@ -479,6 +742,7 @@ DEF_FUNCTION_TYPE_ALIAS (V8SF_FTYPE_V8SF, ROUND) DEF_FUNCTION_TYPE_ALIAS (V4SI_FTYPE_V2DF_V2DF, ROUND) DEF_FUNCTION_TYPE_ALIAS (V8SI_FTYPE_V4DF_V4DF, ROUND) +DEF_FUNCTION_TYPE_ALIAS (V16SI_FTYPE_V8DF_V8DF, ROUND) DEF_FUNCTION_TYPE_ALIAS (V4SI_FTYPE_V4SF, ROUND) DEF_FUNCTION_TYPE_ALIAS (V8SI_FTYPE_V8SF, ROUND) diff --git a/gcc/config/i386/i386-c.c b/gcc/config/i386/i386-c.c index ff1a17a9d11..9686382af48 100644 --- a/gcc/config/i386/i386-c.c +++ b/gcc/config/i386/i386-c.c @@ -1,5 +1,5 @@ /* Subroutines used for macro/preprocessor support on the ia-32. - Copyright (C) 2008-2013 Free Software Foundation, Inc. + Copyright (C) 2008-2014 Free Software Foundation, Inc. This file is part of GCC. 
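For the large block of DEF_FUNCTION_TYPE additions to i386-builtin-types.def above: the first operand names the return type and the remaining operands name the parameter types, and i386-builtin-types.awk turns each record into a *_FTYPE_* code used when the builtins are registered. As a hedged illustration (the typedef names below are invented for the example; HI denotes a 16-bit mask at the C level), DEF_FUNCTION_TYPE (V16SF, V16SF, V16SF, V16SF, HI) describes builtins shaped roughly like this:

/* Example shape only; these typedefs are local to the illustration.  */
typedef float __v16sf_ex __attribute__ ((__vector_size__ (64)));
typedef unsigned short __mmask16_ex;

/* V16SF_FTYPE_V16SF_V16SF_V16SF_HI: a 512-bit float vector computed
   from three 512-bit float vectors and a 16-bit write mask.  */
typedef __v16sf_ex (*v16sf_ftype_v16sf_v16sf_v16sf_hi)
  (__v16sf_ex, __v16sf_ex, __v16sf_ex, __mmask16_ex);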
@@ -141,25 +141,35 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag, def_or_undef (parse_in, "__core2"); def_or_undef (parse_in, "__core2__"); break; - case PROCESSOR_COREI7: + case PROCESSOR_NEHALEM: def_or_undef (parse_in, "__corei7"); def_or_undef (parse_in, "__corei7__"); + def_or_undef (parse_in, "__nehalem"); + def_or_undef (parse_in, "__nehalem__"); break; - case PROCESSOR_COREI7_AVX: + case PROCESSOR_SANDYBRIDGE: def_or_undef (parse_in, "__corei7_avx"); def_or_undef (parse_in, "__corei7_avx__"); + def_or_undef (parse_in, "__sandybridge"); + def_or_undef (parse_in, "__sandybridge__"); break; case PROCESSOR_HASWELL: def_or_undef (parse_in, "__core_avx2"); def_or_undef (parse_in, "__core_avx2__"); + def_or_undef (parse_in, "__haswell"); + def_or_undef (parse_in, "__haswell__"); break; - case PROCESSOR_ATOM: + case PROCESSOR_BONNELL: def_or_undef (parse_in, "__atom"); def_or_undef (parse_in, "__atom__"); + def_or_undef (parse_in, "__bonnell"); + def_or_undef (parse_in, "__bonnell__"); break; - case PROCESSOR_SLM: + case PROCESSOR_SILVERMONT: def_or_undef (parse_in, "__slm"); def_or_undef (parse_in, "__slm__"); + def_or_undef (parse_in, "__silvermont"); + def_or_undef (parse_in, "__silvermont__"); break; /* use PROCESSOR_max to not set/unset the arch macro. */ case PROCESSOR_max: @@ -246,20 +256,25 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag, case PROCESSOR_CORE2: def_or_undef (parse_in, "__tune_core2__"); break; - case PROCESSOR_COREI7: + case PROCESSOR_NEHALEM: def_or_undef (parse_in, "__tune_corei7__"); + def_or_undef (parse_in, "__tune_nehalem__"); break; - case PROCESSOR_COREI7_AVX: + case PROCESSOR_SANDYBRIDGE: def_or_undef (parse_in, "__tune_corei7_avx__"); + def_or_undef (parse_in, "__tune_sandybridge__"); break; case PROCESSOR_HASWELL: def_or_undef (parse_in, "__tune_core_avx2__"); + def_or_undef (parse_in, "__tune_haswell__"); break; - case PROCESSOR_ATOM: + case PROCESSOR_BONNELL: def_or_undef (parse_in, "__tune_atom__"); + def_or_undef (parse_in, "__tune_bonnell__"); break; - case PROCESSOR_SLM: + case PROCESSOR_SILVERMONT: def_or_undef (parse_in, "__tune_slm__"); + def_or_undef (parse_in, "__tune_silvermont__"); break; case PROCESSOR_GENERIC: break; @@ -312,6 +327,8 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag, def_or_undef (parse_in, "__SSE4_2__"); if (isa_flag & OPTION_MASK_ISA_AES) def_or_undef (parse_in, "__AES__"); + if (isa_flag & OPTION_MASK_ISA_SHA) + def_or_undef (parse_in, "__SHA__"); if (isa_flag & OPTION_MASK_ISA_PCLMUL) def_or_undef (parse_in, "__PCLMUL__"); if (isa_flag & OPTION_MASK_ISA_AVX) diff --git a/gcc/config/i386/i386-interix.h b/gcc/config/i386/i386-interix.h index b99f4d9b908..9f8f7c64362 100644 --- a/gcc/config/i386/i386-interix.h +++ b/gcc/config/i386/i386-interix.h @@ -1,5 +1,5 @@ /* Target definitions for GCC for Intel 80386 running Interix - Parts Copyright (C) 1991-2013 Free Software Foundation, Inc. + Parts Copyright (C) 1991-2014 Free Software Foundation, Inc. Parts: by Douglas B. Rupp (drupp@cs.washington.edu). diff --git a/gcc/config/i386/i386-modes.def b/gcc/config/i386/i386-modes.def index 57d08fb1b68..07e572058cc 100644 --- a/gcc/config/i386/i386-modes.def +++ b/gcc/config/i386/i386-modes.def @@ -1,5 +1,5 @@ /* Definitions of target machine for GCC for IA-32. - Copyright (C) 2002-2013 Free Software Foundation, Inc. + Copyright (C) 2002-2014 Free Software Foundation, Inc. This file is part of GCC. 
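The ix86_target_macros_internal changes above keep the legacy spellings (__corei7__, __atom__, __tune_slm__, ...) alongside the new ones, and -msha now defines __SHA__. A small illustrative program showing how user code can test either spelling; none of it is part of the patch.

#include <stdio.h>

int
main (void)
{
#ifdef __SHA__
  puts ("compiled with -msha: __SHA__ is defined");
#endif
#if defined (__nehalem__) || defined (__corei7__)
  puts ("-march target: Nehalem (new and legacy macros are both defined)");
#endif
#if defined (__tune_silvermont__) || defined (__tune_slm__)
  puts ("tuning for Silvermont");
#endif
  return 0;
}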
diff --git a/gcc/config/i386/i386-opts.h b/gcc/config/i386/i386-opts.h index 5fcbd6b5776..47a34dbf781 100644 --- a/gcc/config/i386/i386-opts.h +++ b/gcc/config/i386/i386-opts.h @@ -1,5 +1,5 @@ /* Definitions for option handling for IA-32. - Copyright (C) 1988-2013 Free Software Foundation, Inc. + Copyright (C) 1988-2014 Free Software Foundation, Inc. This file is part of GCC. diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h index 73feef25144..803a130e320 100644 --- a/gcc/config/i386/i386-protos.h +++ b/gcc/config/i386/i386-protos.h @@ -1,5 +1,5 @@ /* Definitions of target machine for GCC for IA-32. - Copyright (C) 1988-2013 Free Software Foundation, Inc. + Copyright (C) 1988-2014 Free Software Foundation, Inc. This file is part of GCC. diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 2aaca7b9846..1d04a3c64ee 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -1,5 +1,5 @@ /* Subroutines used for code generation on IA-32. - Copyright (C) 1988-2013 Free Software Foundation, Inc. + Copyright (C) 1988-2014 Free Software Foundation, Inc. This file is part of GCC. @@ -64,6 +64,7 @@ along with GCC; see the file COPYING3. If not see #include "is-a.h" #include "gimple.h" #include "gimplify.h" +#include "cfgloop.h" #include "dwarf2.h" #include "df.h" #include "tm-constrs.h" @@ -1935,12 +1936,12 @@ const struct processor_costs *ix86_cost = &pentium_cost; #define m_NOCONA (1<<PROCESSOR_NOCONA) #define m_P4_NOCONA (m_PENT4 | m_NOCONA) #define m_CORE2 (1<<PROCESSOR_CORE2) -#define m_COREI7 (1<<PROCESSOR_COREI7) -#define m_COREI7_AVX (1<<PROCESSOR_COREI7_AVX) +#define m_NEHALEM (1<<PROCESSOR_NEHALEM) +#define m_SANDYBRIDGE (1<<PROCESSOR_SANDYBRIDGE) #define m_HASWELL (1<<PROCESSOR_HASWELL) -#define m_CORE_ALL (m_CORE2 | m_COREI7 | m_COREI7_AVX | m_HASWELL) -#define m_ATOM (1<<PROCESSOR_ATOM) -#define m_SLM (1<<PROCESSOR_SLM) +#define m_CORE_ALL (m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_HASWELL) +#define m_BONNELL (1<<PROCESSOR_BONNELL) +#define m_SILVERMONT (1<<PROCESSOR_SILVERMONT) #define m_GEODE (1<<PROCESSOR_GEODE) #define m_K6 (1<<PROCESSOR_K6) @@ -2308,7 +2309,7 @@ enum x86_64_reg_class X86_64_MEMORY_CLASS }; -#define MAX_CLASSES 4 +#define MAX_CLASSES 8 /* Table of constants used by fldpi, fldln2, etc.... */ static REAL_VALUE_TYPE ext_80387_constants_table [5]; @@ -2375,6 +2376,7 @@ static tree ix86_veclibabi_acml (enum built_in_function, tree, tree); /* Processor target table, indexed by processor number */ struct ptt { + const char *const name; /* processor name */ const struct processor_costs *cost; /* Processor costs */ const int align_loop; /* Default alignments. */ const int align_loop_max_skip; @@ -2383,73 +2385,33 @@ struct ptt const int align_func; }; +/* This table must be in sync with enum processor_type in i386.h. 
*/ static const struct ptt processor_target_table[PROCESSOR_max] = { - {&i386_cost, 4, 3, 4, 3, 4}, - {&i486_cost, 16, 15, 16, 15, 16}, - {&pentium_cost, 16, 7, 16, 7, 16}, - {&pentiumpro_cost, 16, 15, 16, 10, 16}, - {&geode_cost, 0, 0, 0, 0, 0}, - {&k6_cost, 32, 7, 32, 7, 32}, - {&athlon_cost, 16, 7, 16, 7, 16}, - {&pentium4_cost, 0, 0, 0, 0, 0}, - {&k8_cost, 16, 7, 16, 7, 16}, - {&nocona_cost, 0, 0, 0, 0, 0}, - /* Core 2 */ - {&core_cost, 16, 10, 16, 10, 16}, - /* Core i7 */ - {&core_cost, 16, 10, 16, 10, 16}, - /* Core i7 avx */ - {&core_cost, 16, 10, 16, 10, 16}, - /* Core avx2 */ - {&core_cost, 16, 10, 16, 10, 16}, - {&generic_cost, 16, 10, 16, 10, 16}, - {&amdfam10_cost, 32, 24, 32, 7, 32}, - {&bdver1_cost, 16, 10, 16, 7, 11}, - {&bdver2_cost, 16, 10, 16, 7, 11}, - {&bdver3_cost, 16, 10, 16, 7, 11}, - {&bdver4_cost, 16, 10, 16, 7, 11}, - {&btver1_cost, 16, 10, 16, 7, 11}, - {&btver2_cost, 16, 10, 16, 7, 11}, - {&atom_cost, 16, 15, 16, 7, 16}, - {&slm_cost, 16, 15, 16, 7, 16} -}; - -static const char *const cpu_names[TARGET_CPU_DEFAULT_max] = -{ - "generic", - "i386", - "i486", - "pentium", - "pentium-mmx", - "pentiumpro", - "pentium2", - "pentium3", - "pentium4", - "pentium-m", - "prescott", - "nocona", - "core2", - "corei7", - "corei7-avx", - "core-avx2", - "atom", - "slm", - "intel", - "geode", - "k6", - "k6-2", - "k6-3", - "athlon", - "athlon-4", - "k8", - "amdfam10", - "bdver1", - "bdver2", - "bdver3", - "bdver4", - "btver1", - "btver2" + {"generic", &generic_cost, 16, 10, 16, 10, 16}, + {"i386", &i386_cost, 4, 3, 4, 3, 4}, + {"i486", &i486_cost, 16, 15, 16, 15, 16}, + {"pentium", &pentium_cost, 16, 7, 16, 7, 16}, + {"pentiumpro", &pentiumpro_cost, 16, 15, 16, 10, 16}, + {"pentium4", &pentium4_cost, 0, 0, 0, 0, 0}, + {"nocona", &nocona_cost, 0, 0, 0, 0, 0}, + {"core2", &core_cost, 16, 10, 16, 10, 16}, + {"nehalem", &core_cost, 16, 10, 16, 10, 16}, + {"sandybridge", &core_cost, 16, 10, 16, 10, 16}, + {"haswell", &core_cost, 16, 10, 16, 10, 16}, + {"bonnell", &atom_cost, 16, 15, 16, 7, 16}, + {"silvermont", &slm_cost, 16, 15, 16, 7, 16}, + {"geode", &geode_cost, 0, 0, 0, 0, 0}, + {"k6", &k6_cost, 32, 7, 32, 7, 32}, + {"athlon", &athlon_cost, 16, 7, 16, 7, 16}, + {"k8", &k8_cost, 16, 7, 16, 7, 16}, + {"amdfam10", &amdfam10_cost, 32, 24, 32, 7, 32}, + {"bdver1", &bdver1_cost, 16, 10, 16, 7, 11}, + {"bdver2", &bdver2_cost, 16, 10, 16, 7, 11}, + {"bdver3", &bdver3_cost, 16, 10, 16, 7, 11}, + {"bdver4", &bdver4_cost, 16, 10, 16, 7, 11}, + {"btver1", &btver1_cost, 16, 10, 16, 7, 11}, + {"btver2", &btver2_cost, 16, 10, 16, 7, 11} }; static bool @@ -2573,6 +2535,7 @@ ix86_target_string (HOST_WIDE_INT isa, int flags, const char *arch, { "-mmovbe", OPTION_MASK_ISA_MOVBE }, { "-mcrc32", OPTION_MASK_ISA_CRC32 }, { "-maes", OPTION_MASK_ISA_AES }, + { "-msha", OPTION_MASK_ISA_SHA }, { "-mpclmul", OPTION_MASK_ISA_PCLMUL }, { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE }, { "-mrdrnd", OPTION_MASK_ISA_RDRND }, @@ -2857,7 +2820,6 @@ ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset) do { int maxs; - stringop_alg alg; char alg_name[128]; char align[16]; next_range_str = strchr (curr_range_str, ','); @@ -2880,13 +2842,8 @@ ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset) } for (i = 0; i < last_alg; i++) - { - if (!strcmp (alg_name, stringop_alg_names[i])) - { - alg = (stringop_alg) i; - break; - } - } + if (!strcmp (alg_name, stringop_alg_names[i])) + break; if (i == last_alg) { @@ -2897,7 +2854,7 @@ ix86_parse_stringop_strategy_string (char *strategy_str, bool 
is_memset) } input_ranges[n].max = maxs; - input_ranges[n].alg = alg; + input_ranges[n].alg = (stringop_alg) i; if (!strcmp (align, "align")) input_ranges[n].noalign = false; else if (!strcmp (align, "noalign")) @@ -3074,6 +3031,28 @@ ix86_option_override_internal (bool main_args_p, #define PTA_AVX512ER (HOST_WIDE_INT_1 << 41) #define PTA_AVX512PF (HOST_WIDE_INT_1 << 42) #define PTA_AVX512CD (HOST_WIDE_INT_1 << 43) +#define PTA_SHA (HOST_WIDE_INT_1 << 45) + +#define PTA_CORE2 \ + (PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_SSSE3 \ + | PTA_CX16 | PTA_FXSR) +#define PTA_NEHALEM \ + (PTA_CORE2 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_POPCNT) +#define PTA_WESTMERE \ + (PTA_NEHALEM | PTA_AES | PTA_PCLMUL) +#define PTA_SANDYBRIDGE \ + (PTA_WESTMERE | PTA_AVX | PTA_XSAVE | PTA_XSAVEOPT) +#define PTA_IVYBRIDGE \ + (PTA_SANDYBRIDGE | PTA_FSGSBASE | PTA_RDRND | PTA_F16C) +#define PTA_HASWELL \ + (PTA_IVYBRIDGE | PTA_AVX2 | PTA_BMI | PTA_BMI2 | PTA_LZCNT \ + | PTA_FMA | PTA_MOVBE | PTA_RTM | PTA_HLE) +#define PTA_BROADWELL \ + (PTA_HASWELL | PTA_ADX | PTA_PRFCHW | PTA_RDSEED) +#define PTA_BONNELL \ + (PTA_CORE2 | PTA_MOVBE) +#define PTA_SILVERMONT \ + (PTA_WESTMERE | PTA_MOVBE) /* if this reaches 64, need to widen struct pta flags below */ @@ -3114,39 +3093,26 @@ ix86_option_override_internal (bool main_args_p, {"nocona", PROCESSOR_NOCONA, CPU_NONE, PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_CX16 | PTA_NO_SAHF | PTA_FXSR}, - {"core2", PROCESSOR_CORE2, CPU_CORE2, - PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 - | PTA_SSSE3 | PTA_CX16 | PTA_FXSR}, - {"corei7", PROCESSOR_COREI7, CPU_COREI7, - PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_SSSE3 - | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16 | PTA_POPCNT | PTA_FXSR}, - {"corei7-avx", PROCESSOR_COREI7_AVX, CPU_COREI7, - PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 - | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX - | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL - | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT}, - {"core-avx-i", PROCESSOR_COREI7_AVX, CPU_COREI7, - PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 - | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX - | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE - | PTA_RDRND | PTA_F16C | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT}, - {"core-avx2", PROCESSOR_HASWELL, CPU_COREI7, - PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 - | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX | PTA_AVX2 - | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE - | PTA_RDRND | PTA_F16C | PTA_BMI | PTA_BMI2 | PTA_LZCNT - | PTA_FMA | PTA_MOVBE | PTA_RTM | PTA_HLE | PTA_FXSR | PTA_XSAVE - | PTA_XSAVEOPT}, - {"atom", PROCESSOR_ATOM, CPU_ATOM, - PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 - | PTA_SSSE3 | PTA_CX16 | PTA_MOVBE | PTA_FXSR}, - {"slm", PROCESSOR_SLM, CPU_SLM, - PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_SSSE3 - | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16 | PTA_POPCNT | PTA_AES - | PTA_PCLMUL | PTA_RDRND | PTA_MOVBE | PTA_FXSR}, - {"intel", PROCESSOR_SLM, CPU_SLM, - PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_SSSE3 - | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16 | PTA_POPCNT | PTA_FXSR}, + {"core2", PROCESSOR_CORE2, CPU_CORE2, PTA_CORE2}, + {"nehalem", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_NEHALEM}, + {"corei7", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_NEHALEM}, + {"westmere", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_WESTMERE}, + {"sandybridge", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM, + PTA_SANDYBRIDGE}, + {"corei7-avx", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM, + 
PTA_SANDYBRIDGE}, + {"ivybridge", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM, + PTA_IVYBRIDGE}, + {"core-avx-i", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM, + PTA_IVYBRIDGE}, + {"haswell", PROCESSOR_HASWELL, CPU_NEHALEM, PTA_HASWELL}, + {"core-avx2", PROCESSOR_HASWELL, CPU_NEHALEM, PTA_HASWELL}, + {"broadwell", PROCESSOR_HASWELL, CPU_NEHALEM, PTA_BROADWELL}, + {"bonnell", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL}, + {"atom", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL}, + {"silvermont", PROCESSOR_SILVERMONT, CPU_SLM, PTA_SILVERMONT}, + {"slm", PROCESSOR_SILVERMONT, CPU_SLM, PTA_SILVERMONT}, + {"intel", PROCESSOR_SILVERMONT, CPU_SLM, PTA_NEHALEM}, {"geode", PROCESSOR_GEODE, CPU_GEODE, PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE | PTA_PRFCHW}, {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX}, @@ -3320,23 +3286,13 @@ ix86_option_override_internal (bool main_args_p, /* Need to check -mtune=generic first. */ if (opts->x_ix86_tune_string) { - if (!strcmp (opts->x_ix86_tune_string, "generic") - || !strcmp (opts->x_ix86_tune_string, "i686") - /* As special support for cross compilers we read -mtune=native + /* As special support for cross compilers we read -mtune=native as -mtune=generic. With native compilers we won't see the -mtune=native, as it was changed by the driver. */ - || !strcmp (opts->x_ix86_tune_string, "native")) + if (!strcmp (opts->x_ix86_tune_string, "native")) { opts->x_ix86_tune_string = "generic"; } - /* If this call is for setting the option attribute, allow the - generic that was previously set. */ - else if (!main_args_p - && !strcmp (opts->x_ix86_tune_string, "generic")) - ; - else if (!strncmp (opts->x_ix86_tune_string, "generic", 7)) - error ("bad value (%s) for %stune=%s %s", - opts->x_ix86_tune_string, prefix, suffix, sw); else if (!strcmp (opts->x_ix86_tune_string, "x86-64")) warning (OPT_Wdeprecated, "%stune=x86-64%s is deprecated; use " "%stune=k8%s or %stune=generic%s instead as appropriate", @@ -3348,15 +3304,14 @@ ix86_option_override_internal (bool main_args_p, opts->x_ix86_tune_string = opts->x_ix86_arch_string; if (!opts->x_ix86_tune_string) { - opts->x_ix86_tune_string = cpu_names[TARGET_CPU_DEFAULT]; + opts->x_ix86_tune_string + = processor_target_table[TARGET_CPU_DEFAULT].name; ix86_tune_defaulted = 1; } /* opts->x_ix86_tune_string is set to opts->x_ix86_arch_string or defaulted. We need to use a sensible tune option. 
*/ - if (!strcmp (opts->x_ix86_tune_string, "generic") - || !strcmp (opts->x_ix86_tune_string, "x86-64") - || !strcmp (opts->x_ix86_tune_string, "i686")) + if (!strcmp (opts->x_ix86_tune_string, "x86-64")) { opts->x_ix86_tune_string = "generic"; } @@ -3574,8 +3529,11 @@ ix86_option_override_internal (bool main_args_p, && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE)) opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MOVBE; if (processor_alias_table[i].flags & PTA_AES - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AES)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AES; + && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AES)) + ix86_isa_flags |= OPTION_MASK_ISA_AES; + if (processor_alias_table[i].flags & PTA_SHA + && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SHA)) + ix86_isa_flags |= OPTION_MASK_ISA_SHA; if (processor_alias_table[i].flags & PTA_PCLMUL && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL)) opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL; @@ -3633,10 +3591,10 @@ ix86_option_override_internal (bool main_args_p, if (!strcmp (opts->x_ix86_arch_string, "generic")) error ("generic CPU can be used only for %stune=%s %s", prefix, suffix, sw); - else if (!strcmp (ix86_arch_string, "intel")) + else if (!strcmp (opts->x_ix86_arch_string, "intel")) error ("intel CPU can be used only for %stune=%s %s", prefix, suffix, sw); - else if (!strncmp (opts->x_ix86_arch_string, "generic", 7) || i == pta_size) + else if (i == pta_size) error ("bad value (%s) for %sarch=%s %s", opts->x_ix86_arch_string, prefix, suffix, sw); @@ -4399,19 +4357,15 @@ ix86_function_specific_print (FILE *file, int indent, = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_target_flags, NULL, NULL, ptr->x_ix86_fpmath, false); + gcc_assert (ptr->arch < PROCESSOR_max); fprintf (file, "%*sarch = %d (%s)\n", indent, "", - ptr->arch, - ((ptr->arch < TARGET_CPU_DEFAULT_max) - ? cpu_names[ptr->arch] - : "<unknown>")); + ptr->arch, processor_target_table[ptr->arch].name); + gcc_assert (ptr->tune < PROCESSOR_max); fprintf (file, "%*stune = %d (%s)\n", indent, "", - ptr->tune, - ((ptr->tune < TARGET_CPU_DEFAULT_max) - ? cpu_names[ptr->tune] - : "<unknown>")); + ptr->tune, processor_target_table[ptr->tune].name); fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost); @@ -4468,6 +4422,7 @@ ix86_valid_target_attribute_inner_p (tree args, char *p_strings[], IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt), IX86_ATTR_ISA ("tbm", OPT_mtbm), IX86_ATTR_ISA ("aes", OPT_maes), + IX86_ATTR_ISA ("sha", OPT_msha), IX86_ATTR_ISA ("avx", OPT_mavx), IX86_ATTR_ISA ("avx2", OPT_mavx2), IX86_ATTR_ISA ("avx512f", OPT_mavx512f), @@ -6294,7 +6249,7 @@ merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2) sized containers, classes[0] will be NO_CLASS and 1 is returned. BIT_OFFSET is used internally for handling records and specifies offset - of the offset in bits modulo 256 to avoid overflow cases. + of the offset in bits modulo 512 to avoid overflow cases. See the x86-64 PS ABI for details. 
*/ @@ -6394,7 +6349,7 @@ classify_argument (enum machine_mode mode, const_tree type, num = classify_argument (TYPE_MODE (type), type, subclasses, (int_bit_position (field) - + bit_offset) % 256); + + bit_offset) % 512); if (!num) return 0; pos = (int_bit_position (field) @@ -6644,6 +6599,21 @@ classify_argument (enum machine_mode mode, const_tree type, classes[2] = X86_64_SSEUP_CLASS; classes[3] = X86_64_SSEUP_CLASS; return 4; + case V8DFmode: + case V16SFmode: + case V8DImode: + case V16SImode: + case V32HImode: + case V64QImode: + classes[0] = X86_64_SSE_CLASS; + classes[1] = X86_64_SSEUP_CLASS; + classes[2] = X86_64_SSEUP_CLASS; + classes[3] = X86_64_SSEUP_CLASS; + classes[4] = X86_64_SSEUP_CLASS; + classes[5] = X86_64_SSEUP_CLASS; + classes[6] = X86_64_SSEUP_CLASS; + classes[7] = X86_64_SSEUP_CLASS; + return 8; case V4SFmode: case V4SImode: case V16QImode: @@ -6829,6 +6799,18 @@ construct_container (enum machine_mode mode, enum machine_mode orig_mode, && mode != BLKmode) return gen_reg_or_parallel (mode, orig_mode, SSE_REGNO (sse_regno)); + if (n == 8 + && regclass[0] == X86_64_SSE_CLASS + && regclass[1] == X86_64_SSEUP_CLASS + && regclass[2] == X86_64_SSEUP_CLASS + && regclass[3] == X86_64_SSEUP_CLASS + && regclass[4] == X86_64_SSEUP_CLASS + && regclass[5] == X86_64_SSEUP_CLASS + && regclass[6] == X86_64_SSEUP_CLASS + && regclass[7] == X86_64_SSEUP_CLASS + && mode != BLKmode) + return gen_reg_or_parallel (mode, orig_mode, + SSE_REGNO (sse_regno)); if (n == 2 && regclass[0] == X86_64_X87_CLASS && regclass[1] == X86_64_X87UP_CLASS) @@ -6910,6 +6892,18 @@ construct_container (enum machine_mode mode, enum machine_mode orig_mode, tmpmode = OImode; i += 3; break; + case 8: + gcc_assert (i == 0 + && regclass[1] == X86_64_SSEUP_CLASS + && regclass[2] == X86_64_SSEUP_CLASS + && regclass[3] == X86_64_SSEUP_CLASS + && regclass[4] == X86_64_SSEUP_CLASS + && regclass[5] == X86_64_SSEUP_CLASS + && regclass[6] == X86_64_SSEUP_CLASS + && regclass[7] == X86_64_SSEUP_CLASS); + tmpmode = XImode; + i += 7; + break; default: gcc_unreachable (); } @@ -6983,6 +6977,12 @@ function_arg_advance_32 (CUMULATIVE_ARGS *cum, enum machine_mode mode, case V8SFmode: case V8SImode: + case V64QImode: + case V32HImode: + case V16SImode: + case V8DImode: + case V16SFmode: + case V8DFmode: case V32QImode: case V16HImode: case V4DFmode: @@ -7034,8 +7034,9 @@ function_arg_advance_64 (CUMULATIVE_ARGS *cum, enum machine_mode mode, { int int_nregs, sse_nregs; - /* Unnamed 256bit vector mode parameters are passed on stack. */ - if (!named && VALID_AVX256_REG_MODE (mode)) + /* Unnamed 512 and 256bit vector mode parameters are passed on stack. */ + if (!named && (VALID_AVX512F_REG_MODE (mode) + || VALID_AVX256_REG_MODE (mode))) return; if (examine_argument (mode, type, 0, &int_nregs, &sse_nregs) @@ -7186,9 +7187,16 @@ function_arg_32 (const CUMULATIVE_ARGS *cum, enum machine_mode mode, break; case OImode: - /* OImode shouldn't be used directly. */ + case XImode: + /* OImode and XImode shouldn't be used directly. */ gcc_unreachable (); + case V64QImode: + case V32HImode: + case V16SImode: + case V8DImode: + case V16SFmode: + case V8DFmode: case V8SFmode: case V8SImode: case V32QImode: @@ -7251,7 +7259,13 @@ function_arg_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode, case V16HImode: case V4DFmode: case V4DImode: - /* Unnamed 256bit vector mode parameters are passed on stack. 
*/ + case V16SFmode: + case V16SImode: + case V64QImode: + case V32HImode: + case V8DFmode: + case V8DImode: + /* Unnamed 256 and 512bit vector mode parameters are passed on stack. */ if (!named) return NULL; break; @@ -7654,6 +7668,10 @@ function_value_32 (enum machine_mode orig_mode, enum machine_mode mode, else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32) regno = FIRST_SSE_REG; + /* 64-byte vector modes in %zmm0. */ + else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64) + regno = FIRST_SSE_REG; + /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */ else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387) regno = FIRST_FLOAT_REG; @@ -7861,6 +7879,10 @@ return_in_memory_32 (const_tree type, enum machine_mode mode) /* AVX values are returned in YMM0, except when it doesn't exist. */ if (size == 32) return !TARGET_AVX; + + /* AVX512F values are returned in ZMM0, except when it doesn't exist. */ + if (size == 64) + return !TARGET_AVX512F; } if (mode == XFmode) @@ -8397,7 +8419,13 @@ ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p, case V16HImode: case V4DFmode: case V4DImode: - /* Unnamed 256bit vector mode parameters are passed on stack. */ + case V16SFmode: + case V16SImode: + case V64QImode: + case V32HImode: + case V8DFmode: + case V8DImode: + /* Unnamed 256 and 512bit vector mode parameters are passed on stack. */ if (!TARGET_64BIT_MS_ABI) { container = NULL; @@ -8812,6 +8840,12 @@ standard_sse_constant_p (rtx x) case V4DImode: if (TARGET_AVX2) return 2; + case V64QImode: + case V32HImode: + case V16SImode: + case V8DImode: + if (TARGET_AVX512F) + return 2; default: break; } @@ -8830,6 +8864,11 @@ standard_sse_constant_opcode (rtx insn, rtx x) case 1: switch (get_attr_mode (insn)) { + case MODE_XI: + case MODE_V16SF: + return "vpxord\t%g0, %g0, %g0"; + case MODE_V8DF: + return "vpxorq\t%g0, %g0, %g0"; case MODE_TI: return "%vpxor\t%0, %d0"; case MODE_V2DF: @@ -9241,7 +9280,9 @@ ix86_save_reg (unsigned int regno, bool maybe_eh_return) } } - if (crtl->drap_reg && regno == REGNO (crtl->drap_reg)) + if (crtl->drap_reg + && regno == REGNO (crtl->drap_reg) + && crtl->stack_realign_needed) return true; return (df_regs_ever_live_p (regno) @@ -10479,12 +10520,23 @@ ix86_finalize_stack_realign_flags (void) return; } + /* If drap has been set, but it actually isn't live at the start + of the function and !stack_realign, there is no reason to set it up. */ + if (crtl->drap_reg && !stack_realign) + { + basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb; + if (! REGNO_REG_SET_P (DF_LR_IN (bb), REGNO (crtl->drap_reg))) + { + crtl->drap_reg = NULL_RTX; + crtl->need_drap = false; + } + } + /* If the only reason for frame_pointer_needed is that we conservatively assumed stack realignment might be needed, but in the end nothing that needed the stack alignment had been spilled, clear frame_pointer_needed and say we don't need stack realignment. */ if (stack_realign - && !crtl->need_drap && frame_pointer_needed && crtl->is_leaf && flag_omit_frame_pointer @@ -10522,6 +10574,18 @@ ix86_finalize_stack_realign_flags (void) } } + /* If drap has been set, but it actually isn't live at the start + of the function, there is no reason to set it up. */ + if (crtl->drap_reg) + { + basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb; + if (! 
REGNO_REG_SET_P (DF_LR_IN (bb), REGNO (crtl->drap_reg))) + { + crtl->drap_reg = NULL_RTX; + crtl->need_drap = false; + } + } + frame_pointer_needed = false; stack_realign = false; crtl->max_used_stack_slot_alignment = incoming_stack_boundary; @@ -15042,6 +15106,38 @@ ix86_print_operand (FILE *file, rtx x, int code) fputs ("{z}", file); return; + case 'R': + gcc_assert (CONST_INT_P (x)); + + if (ASSEMBLER_DIALECT == ASM_INTEL) + fputs (", ", file); + + switch (INTVAL (x)) + { + case ROUND_NEAREST_INT: + fputs ("{rn-sae}", file); + break; + case ROUND_NEG_INF: + fputs ("{rd-sae}", file); + break; + case ROUND_POS_INF: + fputs ("{ru-sae}", file); + break; + case ROUND_ZERO: + fputs ("{rz-sae}", file); + break; + case ROUND_SAE: + fputs ("{sae}", file); + break; + default: + gcc_unreachable (); + } + + if (ASSEMBLER_DIALECT == ASM_ATT) + fputs (", ", file); + + return; + case '*': if (ASSEMBLER_DIALECT == ASM_ATT) putc ('*', file); @@ -17878,7 +17974,7 @@ ix86_lea_outperforms (rtx insn, unsigned int regno0, unsigned int regno1, /* For Silvermont if using a 2-source or 3-source LEA for non-destructive destination purposes, or due to wanting ability to use SCALE, the use of LEA is justified. */ - if (ix86_tune == PROCESSOR_SLM) + if (ix86_tune == PROCESSOR_SILVERMONT) { if (has_scale) return true; @@ -18246,7 +18342,7 @@ ix86_split_lea_for_addr (rtx insn, rtx operands[], enum machine_mode mode) /* Return true if it is ok to optimize an ADD operation to LEA operation to avoid flag register consumation. For most processors, - ADD is faster than LEA. For the processors like ATOM, if the + ADD is faster than LEA. For the processors like BONNELL, if the destination register of LEA holds an actual address which will be used soon, LEA is better and otherwise ADD is better. */ @@ -18688,17 +18784,23 @@ ix86_build_const_vector (enum machine_mode mode, bool vect, rtx value) switch (mode) { + case V64QImode: case V32QImode: case V16QImode: + case V32HImode: case V16HImode: case V8HImode: + case V16SImode: case V8SImode: case V4SImode: + case V8DImode: case V4DImode: case V2DImode: gcc_assert (vect); + case V16SFmode: case V8SFmode: case V4SFmode: + case V8DFmode: case V4DFmode: case V2DFmode: n_elt = GET_MODE_NUNITS (mode); @@ -18735,6 +18837,8 @@ ix86_build_signbit_mask (enum machine_mode mode, bool vect, bool invert) /* Find the sign bit, sign extended to 2*HWI. */ switch (mode) { + case V16SImode: + case V16SFmode: case V8SImode: case V4SImode: case V8SFmode: @@ -18745,8 +18849,10 @@ ix86_build_signbit_mask (enum machine_mode mode, bool vect, bool invert) lo = 0x80000000, hi = lo < 0; break; + case V8DImode: case V4DImode: case V2DImode: + case V8DFmode: case V4DFmode: case V2DFmode: vec_mode = mode; @@ -20603,22 +20709,63 @@ ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1, rtx op_true, rtx op_false) { enum machine_mode mode = GET_MODE (dest); - enum machine_mode cmp_mode = GET_MODE (cmp_op0); + enum machine_mode cmp_ops_mode = GET_MODE (cmp_op0); + + /* In general case result of comparison can differ from operands' type. */ + enum machine_mode cmp_mode; + + /* In AVX512F the result of comparison is an integer mask. 
*/ + bool maskcmp = false; rtx x; - cmp_op0 = force_reg (cmp_mode, cmp_op0); - if (!nonimmediate_operand (cmp_op1, cmp_mode)) - cmp_op1 = force_reg (cmp_mode, cmp_op1); + if (GET_MODE_SIZE (cmp_ops_mode) == 64) + { + cmp_mode = mode_for_size (GET_MODE_NUNITS (cmp_ops_mode), MODE_INT, 0); + gcc_assert (cmp_mode != BLKmode); + + maskcmp = true; + } + else + cmp_mode = cmp_ops_mode; + + + cmp_op0 = force_reg (cmp_ops_mode, cmp_op0); + if (!nonimmediate_operand (cmp_op1, cmp_ops_mode)) + cmp_op1 = force_reg (cmp_ops_mode, cmp_op1); if (optimize || reg_overlap_mentioned_p (dest, op_true) || reg_overlap_mentioned_p (dest, op_false)) - dest = gen_reg_rtx (mode); + dest = gen_reg_rtx (maskcmp ? cmp_mode : mode); + /* Compare patterns for int modes are unspec in AVX512F only. */ + if (maskcmp && (code == GT || code == EQ)) + { + rtx (*gen)(rtx, rtx, rtx); + + switch (cmp_ops_mode) + { + case V16SImode: + gen = code == GT ? gen_avx512f_gtv16si3 : gen_avx512f_eqv16si3_1; + break; + case V8DImode: + gen = code == GT ? gen_avx512f_gtv8di3 : gen_avx512f_eqv8di3_1; + break; + default: + gen = NULL; + } + + if (gen) + { + emit_insn (gen (dest, cmp_op0, cmp_op1)); + return dest; + } + } x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1); - if (cmp_mode != mode) + + if (cmp_mode != mode && !maskcmp) { - x = force_reg (cmp_mode, x); + x = force_reg (cmp_ops_mode, x); convert_move (dest, x, false); } else @@ -20634,33 +20781,43 @@ static void ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false) { enum machine_mode mode = GET_MODE (dest); + enum machine_mode cmpmode = GET_MODE (cmp); + + /* In AVX512F the result of comparison is an integer mask. */ + bool maskcmp = (mode != cmpmode && TARGET_AVX512F); + rtx t2, t3, x; if (vector_all_ones_operand (op_true, mode) - && rtx_equal_p (op_false, CONST0_RTX (mode))) + && rtx_equal_p (op_false, CONST0_RTX (mode)) + && !maskcmp) { emit_insn (gen_rtx_SET (VOIDmode, dest, cmp)); } - else if (op_false == CONST0_RTX (mode)) + else if (op_false == CONST0_RTX (mode) + && !maskcmp) { op_true = force_reg (mode, op_true); x = gen_rtx_AND (mode, cmp, op_true); emit_insn (gen_rtx_SET (VOIDmode, dest, x)); } - else if (op_true == CONST0_RTX (mode)) + else if (op_true == CONST0_RTX (mode) + && !maskcmp) { op_false = force_reg (mode, op_false); x = gen_rtx_NOT (mode, cmp); x = gen_rtx_AND (mode, x, op_false); emit_insn (gen_rtx_SET (VOIDmode, dest, x)); } - else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode)) + else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode) + && !maskcmp) { op_false = force_reg (mode, op_false); x = gen_rtx_IOR (mode, cmp, op_false); emit_insn (gen_rtx_SET (VOIDmode, dest, x)); } - else if (TARGET_XOP) + else if (TARGET_XOP + && !maskcmp) { op_true = force_reg (mode, op_true); @@ -20728,6 +20885,20 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false) cmp = gen_lowpart (V32QImode, cmp); } break; + + case V16SImode: + gen = gen_avx512f_blendmv16si; + break; + case V8DImode: + gen = gen_avx512f_blendmv8di; + break; + case V8DFmode: + gen = gen_avx512f_blendmv8df; + break; + case V16SFmode: + gen = gen_avx512f_blendmv16sf; + break; + default: break; } @@ -20995,6 +21166,8 @@ ix86_expand_int_vcond (rtx operands[]) switch (mode) { + case V16SImode: + case V8DImode: case V8SImode: case V4DImode: case V4SImode: @@ -21005,6 +21178,8 @@ ix86_expand_int_vcond (rtx operands[]) switch (mode) { + case V16SImode: gen_sub3 = gen_subv16si3; break; + case V8DImode: gen_sub3 = gen_subv8di3; break; case V8SImode: 
gen_sub3 = gen_subv8si3; break; case V4DImode: gen_sub3 = gen_subv4di3; break; case V4SImode: gen_sub3 = gen_subv4si3; break; @@ -21060,7 +21235,8 @@ ix86_expand_int_vcond (rtx operands[]) gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode)); x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1, operands[1+negate], operands[2-negate]); - x = gen_lowpart (data_mode, x); + if (GET_MODE (x) == mode) + x = gen_lowpart (data_mode, x); } ix86_expand_sse_movcc (operands[0], x, operands[1+negate], @@ -21068,6 +21244,35 @@ ix86_expand_int_vcond (rtx operands[]) return true; } +static bool +ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1) +{ + enum machine_mode mode = GET_MODE (op0); + switch (mode) + { + case V16SImode: + emit_insn (gen_avx512f_vpermi2varv16si3 (target, op0, + force_reg (V16SImode, mask), + op1)); + return true; + case V16SFmode: + emit_insn (gen_avx512f_vpermi2varv16sf3 (target, op0, + force_reg (V16SImode, mask), + op1)); + return true; + case V8DImode: + emit_insn (gen_avx512f_vpermi2varv8di3 (target, op0, + force_reg (V8DImode, mask), op1)); + return true; + case V8DFmode: + emit_insn (gen_avx512f_vpermi2varv8df3 (target, op0, + force_reg (V8DImode, mask), op1)); + return true; + default: + return false; + } +} + /* Expand a variable vector permutation. */ void @@ -21086,7 +21291,10 @@ ix86_expand_vec_perm (rtx operands[]) /* Number of elements in the vector. */ w = GET_MODE_NUNITS (mode); e = GET_MODE_UNIT_SIZE (mode); - gcc_assert (w <= 32); + gcc_assert (w <= 64); + + if (ix86_expand_vec_perm_vpermi2 (target, op0, mask, op1)) + return; if (TARGET_AVX2) { @@ -21466,6 +21674,15 @@ ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p) extract = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi; break; + case V32HImode: + if (unsigned_p) + unpack = gen_avx512f_zero_extendv16hiv16si2; + else + unpack = gen_avx512f_sign_extendv16hiv16si2; + halfmode = V16HImode; + extract + = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi; + break; case V16HImode: if (unsigned_p) unpack = gen_avx2_zero_extendv8hiv8si2; @@ -21475,6 +21692,15 @@ ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p) extract = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi; break; + case V16SImode: + if (unsigned_p) + unpack = gen_avx512f_zero_extendv8siv8di2; + else + unpack = gen_avx512f_sign_extendv8siv8di2; + halfmode = V8SImode; + extract + = high_p ? 
gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si; + break; case V8SImode: if (unsigned_p) unpack = gen_avx2_zero_extendv4siv4di2; @@ -21506,7 +21732,7 @@ ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p) gcc_unreachable (); } - if (GET_MODE_SIZE (imode) == 32) + if (GET_MODE_SIZE (imode) >= 32) { tmp = gen_reg_rtx (halfmode); emit_insn (extract (tmp, src)); @@ -24067,7 +24293,8 @@ ix86_expand_set_or_movmem (rtx dst, rtx src, rtx count_exp, rtx val_exp, else { rtx hot_label = gen_label_rtx (); - jump_around_label = gen_label_rtx (); + if (jump_around_label == NULL_RTX) + jump_around_label = gen_label_rtx (); emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1), LEU, 0, GET_MODE (count_exp), 1, hot_label); predict_jump (REG_BR_PROB_BASE * 90 / 100); @@ -25004,8 +25231,8 @@ ix86_issue_rate (void) switch (ix86_tune) { case PROCESSOR_PENTIUM: - case PROCESSOR_ATOM: - case PROCESSOR_SLM: + case PROCESSOR_BONNELL: + case PROCESSOR_SILVERMONT: case PROCESSOR_K6: case PROCESSOR_BTVER2: case PROCESSOR_PENTIUM4: @@ -25025,8 +25252,8 @@ ix86_issue_rate (void) case PROCESSOR_BDVER3: case PROCESSOR_BDVER4: case PROCESSOR_CORE2: - case PROCESSOR_COREI7: - case PROCESSOR_COREI7_AVX: + case PROCESSOR_NEHALEM: + case PROCESSOR_SANDYBRIDGE: case PROCESSOR_HASWELL: return 4; @@ -25323,8 +25550,8 @@ ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost) break; case PROCESSOR_CORE2: - case PROCESSOR_COREI7: - case PROCESSOR_COREI7_AVX: + case PROCESSOR_NEHALEM: + case PROCESSOR_SANDYBRIDGE: case PROCESSOR_HASWELL: memory = get_attr_memory (insn); @@ -25346,7 +25573,7 @@ ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost) } break; - case PROCESSOR_SLM: + case PROCESSOR_SILVERMONT: if (!reload_completed) return cost; @@ -25410,11 +25637,11 @@ ia32_multipass_dfa_lookahead (void) return 4; case PROCESSOR_CORE2: - case PROCESSOR_COREI7: - case PROCESSOR_COREI7_AVX: + case PROCESSOR_NEHALEM: + case PROCESSOR_SANDYBRIDGE: case PROCESSOR_HASWELL: - case PROCESSOR_ATOM: - case PROCESSOR_SLM: + case PROCESSOR_BONNELL: + case PROCESSOR_SILVERMONT: /* Generally, we want haifa-sched:max_issue() to look ahead as far as many instructions can be executed on a cycle, i.e., issue_rate. I wonder why tuning for many CPUs does not do this. */ @@ -25556,7 +25783,7 @@ do_reorder_for_imul (rtx *ready, int n_ready) int index = -1; int i; - if (ix86_tune != PROCESSOR_ATOM) + if (ix86_tune != PROCESSOR_BONNELL) return index; /* Check that IMUL instruction is on the top of ready list. */ @@ -25636,7 +25863,7 @@ swap_top_of_ready_list (rtx *ready, int n_ready) int clock2 = -1; #define INSN_TICK(INSN) (HID (INSN)->tick) - if (ix86_tune != PROCESSOR_SLM) + if (ix86_tune != PROCESSOR_SILVERMONT) return false; if (!NONDEBUG_INSN_P (top)) @@ -25708,8 +25935,9 @@ ix86_sched_reorder (FILE *dump, int sched_verbose, rtx *ready, int *pn_ready, /* Set up issue rate. */ issue_rate = ix86_issue_rate (); - /* Do reodering for Atom/SLM only. */ - if (ix86_tune != PROCESSOR_ATOM && ix86_tune != PROCESSOR_SLM) + /* Do reodering for BONNELL/SILVERMONT only. */ + if (ix86_tune != PROCESSOR_BONNELL + && ix86_tune != PROCESSOR_SILVERMONT) return issue_rate; /* Nothing to do if ready list contains only 1 instruction. 
*/ @@ -26164,8 +26392,8 @@ ix86_sched_init_global (FILE *dump ATTRIBUTE_UNUSED, switch (ix86_tune) { case PROCESSOR_CORE2: - case PROCESSOR_COREI7: - case PROCESSOR_COREI7_AVX: + case PROCESSOR_NEHALEM: + case PROCESSOR_SANDYBRIDGE: case PROCESSOR_HASWELL: /* Do not perform multipass scheduling for pre-reload schedule to save compile time. */ @@ -26238,7 +26466,8 @@ ix86_constant_alignment (tree exp, int align) int ix86_data_alignment (tree type, int align, bool opt) { - int max_align = optimize_size ? BITS_PER_WORD : MIN (256, MAX_OFILE_ALIGNMENT); + int max_align = optimize_size ? BITS_PER_WORD + : MIN (512, MAX_OFILE_ALIGNMENT); if (opt && AGGREGATE_TYPE_P (type) @@ -27699,12 +27928,412 @@ enum ix86_builtins IX86_BUILTIN_GATHERDIV4SI, IX86_BUILTIN_GATHERDIV8SI, - /* Alternate 4 element gather for the vectorizer where - all operands are 32-byte wide. */ + /* AVX512F */ + IX86_BUILTIN_ADDPD512, + IX86_BUILTIN_ADDPS512, + IX86_BUILTIN_ADDSD_ROUND, + IX86_BUILTIN_ADDSS_ROUND, + IX86_BUILTIN_ALIGND512, + IX86_BUILTIN_ALIGNQ512, + IX86_BUILTIN_BLENDMD512, + IX86_BUILTIN_BLENDMPD512, + IX86_BUILTIN_BLENDMPS512, + IX86_BUILTIN_BLENDMQ512, + IX86_BUILTIN_BROADCASTF32X4_512, + IX86_BUILTIN_BROADCASTF64X4_512, + IX86_BUILTIN_BROADCASTI32X4_512, + IX86_BUILTIN_BROADCASTI64X4_512, + IX86_BUILTIN_BROADCASTSD512, + IX86_BUILTIN_BROADCASTSS512, + IX86_BUILTIN_CMPD512, + IX86_BUILTIN_CMPPD512, + IX86_BUILTIN_CMPPS512, + IX86_BUILTIN_CMPQ512, + IX86_BUILTIN_CMPSD_MASK, + IX86_BUILTIN_CMPSS_MASK, + IX86_BUILTIN_COMIDF, + IX86_BUILTIN_COMISF, + IX86_BUILTIN_COMPRESSPD512, + IX86_BUILTIN_COMPRESSPDSTORE512, + IX86_BUILTIN_COMPRESSPS512, + IX86_BUILTIN_COMPRESSPSSTORE512, + IX86_BUILTIN_CVTDQ2PD512, + IX86_BUILTIN_CVTDQ2PS512, + IX86_BUILTIN_CVTPD2DQ512, + IX86_BUILTIN_CVTPD2PS512, + IX86_BUILTIN_CVTPD2UDQ512, + IX86_BUILTIN_CVTPH2PS512, + IX86_BUILTIN_CVTPS2DQ512, + IX86_BUILTIN_CVTPS2PD512, + IX86_BUILTIN_CVTPS2PH512, + IX86_BUILTIN_CVTPS2UDQ512, + IX86_BUILTIN_CVTSD2SS_ROUND, + IX86_BUILTIN_CVTSI2SD64, + IX86_BUILTIN_CVTSI2SS32, + IX86_BUILTIN_CVTSI2SS64, + IX86_BUILTIN_CVTSS2SD_ROUND, + IX86_BUILTIN_CVTTPD2DQ512, + IX86_BUILTIN_CVTTPD2UDQ512, + IX86_BUILTIN_CVTTPS2DQ512, + IX86_BUILTIN_CVTTPS2UDQ512, + IX86_BUILTIN_CVTUDQ2PD512, + IX86_BUILTIN_CVTUDQ2PS512, + IX86_BUILTIN_CVTUSI2SD32, + IX86_BUILTIN_CVTUSI2SD64, + IX86_BUILTIN_CVTUSI2SS32, + IX86_BUILTIN_CVTUSI2SS64, + IX86_BUILTIN_DIVPD512, + IX86_BUILTIN_DIVPS512, + IX86_BUILTIN_DIVSD_ROUND, + IX86_BUILTIN_DIVSS_ROUND, + IX86_BUILTIN_EXPANDPD512, + IX86_BUILTIN_EXPANDPD512Z, + IX86_BUILTIN_EXPANDPDLOAD512, + IX86_BUILTIN_EXPANDPDLOAD512Z, + IX86_BUILTIN_EXPANDPS512, + IX86_BUILTIN_EXPANDPS512Z, + IX86_BUILTIN_EXPANDPSLOAD512, + IX86_BUILTIN_EXPANDPSLOAD512Z, + IX86_BUILTIN_EXTRACTF32X4, + IX86_BUILTIN_EXTRACTF64X4, + IX86_BUILTIN_EXTRACTI32X4, + IX86_BUILTIN_EXTRACTI64X4, + IX86_BUILTIN_FIXUPIMMPD512_MASK, + IX86_BUILTIN_FIXUPIMMPD512_MASKZ, + IX86_BUILTIN_FIXUPIMMPS512_MASK, + IX86_BUILTIN_FIXUPIMMPS512_MASKZ, + IX86_BUILTIN_FIXUPIMMSD128_MASK, + IX86_BUILTIN_FIXUPIMMSD128_MASKZ, + IX86_BUILTIN_FIXUPIMMSS128_MASK, + IX86_BUILTIN_FIXUPIMMSS128_MASKZ, + IX86_BUILTIN_GETEXPPD512, + IX86_BUILTIN_GETEXPPS512, + IX86_BUILTIN_GETEXPSD128, + IX86_BUILTIN_GETEXPSS128, + IX86_BUILTIN_GETMANTPD512, + IX86_BUILTIN_GETMANTPS512, + IX86_BUILTIN_GETMANTSD128, + IX86_BUILTIN_GETMANTSS128, + IX86_BUILTIN_INSERTF32X4, + IX86_BUILTIN_INSERTF64X4, + IX86_BUILTIN_INSERTI32X4, + IX86_BUILTIN_INSERTI64X4, + IX86_BUILTIN_LOADAPD512, + IX86_BUILTIN_LOADAPS512, + 
IX86_BUILTIN_LOADDQUDI512, + IX86_BUILTIN_LOADDQUSI512, + IX86_BUILTIN_LOADUPD512, + IX86_BUILTIN_LOADUPS512, + IX86_BUILTIN_MAXPD512, + IX86_BUILTIN_MAXPS512, + IX86_BUILTIN_MAXSD_ROUND, + IX86_BUILTIN_MAXSS_ROUND, + IX86_BUILTIN_MINPD512, + IX86_BUILTIN_MINPS512, + IX86_BUILTIN_MINSD_ROUND, + IX86_BUILTIN_MINSS_ROUND, + IX86_BUILTIN_MOVAPD512, + IX86_BUILTIN_MOVAPS512, + IX86_BUILTIN_MOVDDUP512, + IX86_BUILTIN_MOVDQA32LOAD512, + IX86_BUILTIN_MOVDQA32STORE512, + IX86_BUILTIN_MOVDQA32_512, + IX86_BUILTIN_MOVDQA64LOAD512, + IX86_BUILTIN_MOVDQA64STORE512, + IX86_BUILTIN_MOVDQA64_512, + IX86_BUILTIN_MOVNTDQ512, + IX86_BUILTIN_MOVNTPD512, + IX86_BUILTIN_MOVNTPS512, + IX86_BUILTIN_MOVSHDUP512, + IX86_BUILTIN_MOVSLDUP512, + IX86_BUILTIN_MULPD512, + IX86_BUILTIN_MULPS512, + IX86_BUILTIN_MULSD_ROUND, + IX86_BUILTIN_MULSS_ROUND, + IX86_BUILTIN_PABSD512, + IX86_BUILTIN_PABSQ512, + IX86_BUILTIN_PADDD512, + IX86_BUILTIN_PADDQ512, + IX86_BUILTIN_PANDD512, + IX86_BUILTIN_PANDND512, + IX86_BUILTIN_PANDNQ512, + IX86_BUILTIN_PANDQ512, + IX86_BUILTIN_PBROADCASTD512, + IX86_BUILTIN_PBROADCASTD512_GPR, + IX86_BUILTIN_PBROADCASTMB512, + IX86_BUILTIN_PBROADCASTMW512, + IX86_BUILTIN_PBROADCASTQ512, + IX86_BUILTIN_PBROADCASTQ512_GPR, + IX86_BUILTIN_PBROADCASTQ512_MEM, + IX86_BUILTIN_PCMPEQD512_MASK, + IX86_BUILTIN_PCMPEQQ512_MASK, + IX86_BUILTIN_PCMPGTD512_MASK, + IX86_BUILTIN_PCMPGTQ512_MASK, + IX86_BUILTIN_PCOMPRESSD512, + IX86_BUILTIN_PCOMPRESSDSTORE512, + IX86_BUILTIN_PCOMPRESSQ512, + IX86_BUILTIN_PCOMPRESSQSTORE512, + IX86_BUILTIN_PEXPANDD512, + IX86_BUILTIN_PEXPANDD512Z, + IX86_BUILTIN_PEXPANDDLOAD512, + IX86_BUILTIN_PEXPANDDLOAD512Z, + IX86_BUILTIN_PEXPANDQ512, + IX86_BUILTIN_PEXPANDQ512Z, + IX86_BUILTIN_PEXPANDQLOAD512, + IX86_BUILTIN_PEXPANDQLOAD512Z, + IX86_BUILTIN_PMAXSD512, + IX86_BUILTIN_PMAXSQ512, + IX86_BUILTIN_PMAXUD512, + IX86_BUILTIN_PMAXUQ512, + IX86_BUILTIN_PMINSD512, + IX86_BUILTIN_PMINSQ512, + IX86_BUILTIN_PMINUD512, + IX86_BUILTIN_PMINUQ512, + IX86_BUILTIN_PMOVDB512, + IX86_BUILTIN_PMOVDW512, + IX86_BUILTIN_PMOVQB512, + IX86_BUILTIN_PMOVQD512, + IX86_BUILTIN_PMOVQW512, + IX86_BUILTIN_PMOVSDB512, + IX86_BUILTIN_PMOVSDW512, + IX86_BUILTIN_PMOVSQB512, + IX86_BUILTIN_PMOVSQD512, + IX86_BUILTIN_PMOVSQW512, + IX86_BUILTIN_PMOVSXBD512, + IX86_BUILTIN_PMOVSXBQ512, + IX86_BUILTIN_PMOVSXDQ512, + IX86_BUILTIN_PMOVSXWD512, + IX86_BUILTIN_PMOVSXWQ512, + IX86_BUILTIN_PMOVUSDB512, + IX86_BUILTIN_PMOVUSDW512, + IX86_BUILTIN_PMOVUSQB512, + IX86_BUILTIN_PMOVUSQD512, + IX86_BUILTIN_PMOVUSQW512, + IX86_BUILTIN_PMOVZXBD512, + IX86_BUILTIN_PMOVZXBQ512, + IX86_BUILTIN_PMOVZXDQ512, + IX86_BUILTIN_PMOVZXWD512, + IX86_BUILTIN_PMOVZXWQ512, + IX86_BUILTIN_PMULDQ512, + IX86_BUILTIN_PMULLD512, + IX86_BUILTIN_PMULUDQ512, + IX86_BUILTIN_PORD512, + IX86_BUILTIN_PORQ512, + IX86_BUILTIN_PROLD512, + IX86_BUILTIN_PROLQ512, + IX86_BUILTIN_PROLVD512, + IX86_BUILTIN_PROLVQ512, + IX86_BUILTIN_PRORD512, + IX86_BUILTIN_PRORQ512, + IX86_BUILTIN_PRORVD512, + IX86_BUILTIN_PRORVQ512, + IX86_BUILTIN_PSHUFD512, + IX86_BUILTIN_PSLLD512, + IX86_BUILTIN_PSLLDI512, + IX86_BUILTIN_PSLLQ512, + IX86_BUILTIN_PSLLQI512, + IX86_BUILTIN_PSLLVV16SI, + IX86_BUILTIN_PSLLVV8DI, + IX86_BUILTIN_PSRAD512, + IX86_BUILTIN_PSRADI512, + IX86_BUILTIN_PSRAQ512, + IX86_BUILTIN_PSRAQI512, + IX86_BUILTIN_PSRAVV16SI, + IX86_BUILTIN_PSRAVV8DI, + IX86_BUILTIN_PSRLD512, + IX86_BUILTIN_PSRLDI512, + IX86_BUILTIN_PSRLQ512, + IX86_BUILTIN_PSRLQI512, + IX86_BUILTIN_PSRLVV16SI, + IX86_BUILTIN_PSRLVV8DI, + IX86_BUILTIN_PSUBD512, + IX86_BUILTIN_PSUBQ512, + 
IX86_BUILTIN_PTESTMD512, + IX86_BUILTIN_PTESTMQ512, + IX86_BUILTIN_PTESTNMD512, + IX86_BUILTIN_PTESTNMQ512, + IX86_BUILTIN_PUNPCKHDQ512, + IX86_BUILTIN_PUNPCKHQDQ512, + IX86_BUILTIN_PUNPCKLDQ512, + IX86_BUILTIN_PUNPCKLQDQ512, + IX86_BUILTIN_PXORD512, + IX86_BUILTIN_PXORQ512, + IX86_BUILTIN_RCP14PD512, + IX86_BUILTIN_RCP14PS512, + IX86_BUILTIN_RCP14SD, + IX86_BUILTIN_RCP14SS, + IX86_BUILTIN_RNDSCALEPD, + IX86_BUILTIN_RNDSCALEPS, + IX86_BUILTIN_RNDSCALESD, + IX86_BUILTIN_RNDSCALESS, + IX86_BUILTIN_RSQRT14PD512, + IX86_BUILTIN_RSQRT14PS512, + IX86_BUILTIN_RSQRT14SD, + IX86_BUILTIN_RSQRT14SS, + IX86_BUILTIN_SCALEFPD512, + IX86_BUILTIN_SCALEFPS512, + IX86_BUILTIN_SCALEFSD, + IX86_BUILTIN_SCALEFSS, + IX86_BUILTIN_SHUFPD512, + IX86_BUILTIN_SHUFPS512, + IX86_BUILTIN_SHUF_F32x4, + IX86_BUILTIN_SHUF_F64x2, + IX86_BUILTIN_SHUF_I32x4, + IX86_BUILTIN_SHUF_I64x2, + IX86_BUILTIN_SQRTPD512, + IX86_BUILTIN_SQRTPD512_MASK, + IX86_BUILTIN_SQRTPS512_MASK, + IX86_BUILTIN_SQRTPS_NR512, + IX86_BUILTIN_SQRTSD_ROUND, + IX86_BUILTIN_SQRTSS_ROUND, + IX86_BUILTIN_STOREAPD512, + IX86_BUILTIN_STOREAPS512, + IX86_BUILTIN_STOREDQUDI512, + IX86_BUILTIN_STOREDQUSI512, + IX86_BUILTIN_STOREUPD512, + IX86_BUILTIN_STOREUPS512, + IX86_BUILTIN_SUBPD512, + IX86_BUILTIN_SUBPS512, + IX86_BUILTIN_SUBSD_ROUND, + IX86_BUILTIN_SUBSS_ROUND, + IX86_BUILTIN_UCMPD512, + IX86_BUILTIN_UCMPQ512, + IX86_BUILTIN_UNPCKHPD512, + IX86_BUILTIN_UNPCKHPS512, + IX86_BUILTIN_UNPCKLPD512, + IX86_BUILTIN_UNPCKLPS512, + IX86_BUILTIN_VCVTSD2SI32, + IX86_BUILTIN_VCVTSD2SI64, + IX86_BUILTIN_VCVTSD2USI32, + IX86_BUILTIN_VCVTSD2USI64, + IX86_BUILTIN_VCVTSS2SI32, + IX86_BUILTIN_VCVTSS2SI64, + IX86_BUILTIN_VCVTSS2USI32, + IX86_BUILTIN_VCVTSS2USI64, + IX86_BUILTIN_VCVTTSD2SI32, + IX86_BUILTIN_VCVTTSD2SI64, + IX86_BUILTIN_VCVTTSD2USI32, + IX86_BUILTIN_VCVTTSD2USI64, + IX86_BUILTIN_VCVTTSS2SI32, + IX86_BUILTIN_VCVTTSS2SI64, + IX86_BUILTIN_VCVTTSS2USI32, + IX86_BUILTIN_VCVTTSS2USI64, + IX86_BUILTIN_VFMADDPD512_MASK, + IX86_BUILTIN_VFMADDPD512_MASK3, + IX86_BUILTIN_VFMADDPD512_MASKZ, + IX86_BUILTIN_VFMADDPS512_MASK, + IX86_BUILTIN_VFMADDPS512_MASK3, + IX86_BUILTIN_VFMADDPS512_MASKZ, + IX86_BUILTIN_VFMADDSD3_ROUND, + IX86_BUILTIN_VFMADDSS3_ROUND, + IX86_BUILTIN_VFMADDSUBPD512_MASK, + IX86_BUILTIN_VFMADDSUBPD512_MASK3, + IX86_BUILTIN_VFMADDSUBPD512_MASKZ, + IX86_BUILTIN_VFMADDSUBPS512_MASK, + IX86_BUILTIN_VFMADDSUBPS512_MASK3, + IX86_BUILTIN_VFMADDSUBPS512_MASKZ, + IX86_BUILTIN_VFMSUBADDPD512_MASK3, + IX86_BUILTIN_VFMSUBADDPS512_MASK3, + IX86_BUILTIN_VFMSUBPD512_MASK3, + IX86_BUILTIN_VFMSUBPS512_MASK3, + IX86_BUILTIN_VFMSUBSD3_MASK3, + IX86_BUILTIN_VFMSUBSS3_MASK3, + IX86_BUILTIN_VFNMADDPD512_MASK, + IX86_BUILTIN_VFNMADDPS512_MASK, + IX86_BUILTIN_VFNMSUBPD512_MASK, + IX86_BUILTIN_VFNMSUBPD512_MASK3, + IX86_BUILTIN_VFNMSUBPS512_MASK, + IX86_BUILTIN_VFNMSUBPS512_MASK3, + IX86_BUILTIN_VPCLZCNTD512, + IX86_BUILTIN_VPCLZCNTQ512, + IX86_BUILTIN_VPCONFLICTD512, + IX86_BUILTIN_VPCONFLICTQ512, + IX86_BUILTIN_VPERMDF512, + IX86_BUILTIN_VPERMDI512, + IX86_BUILTIN_VPERMI2VARD512, + IX86_BUILTIN_VPERMI2VARPD512, + IX86_BUILTIN_VPERMI2VARPS512, + IX86_BUILTIN_VPERMI2VARQ512, + IX86_BUILTIN_VPERMILPD512, + IX86_BUILTIN_VPERMILPS512, + IX86_BUILTIN_VPERMILVARPD512, + IX86_BUILTIN_VPERMILVARPS512, + IX86_BUILTIN_VPERMT2VARD512, + IX86_BUILTIN_VPERMT2VARD512_MASKZ, + IX86_BUILTIN_VPERMT2VARPD512, + IX86_BUILTIN_VPERMT2VARPD512_MASKZ, + IX86_BUILTIN_VPERMT2VARPS512, + IX86_BUILTIN_VPERMT2VARPS512_MASKZ, + IX86_BUILTIN_VPERMT2VARQ512, + IX86_BUILTIN_VPERMT2VARQ512_MASKZ, + 
IX86_BUILTIN_VPERMVARDF512, + IX86_BUILTIN_VPERMVARDI512, + IX86_BUILTIN_VPERMVARSF512, + IX86_BUILTIN_VPERMVARSI512, + IX86_BUILTIN_VTERNLOGD512_MASK, + IX86_BUILTIN_VTERNLOGD512_MASKZ, + IX86_BUILTIN_VTERNLOGQ512_MASK, + IX86_BUILTIN_VTERNLOGQ512_MASKZ, + + /* Mask arithmetic operations */ + IX86_BUILTIN_KAND16, + IX86_BUILTIN_KANDN16, + IX86_BUILTIN_KNOT16, + IX86_BUILTIN_KOR16, + IX86_BUILTIN_KORTESTC16, + IX86_BUILTIN_KORTESTZ16, + IX86_BUILTIN_KUNPCKBW, + IX86_BUILTIN_KXNOR16, + IX86_BUILTIN_KXOR16, + + /* Alternate 4 and 8 element gather/scatter for the vectorizer + where all operands are 32-byte or 64-byte wide respectively. */ IX86_BUILTIN_GATHERALTSIV4DF, IX86_BUILTIN_GATHERALTDIV8SF, IX86_BUILTIN_GATHERALTSIV4DI, IX86_BUILTIN_GATHERALTDIV8SI, + IX86_BUILTIN_GATHER3ALTDIV16SF, + IX86_BUILTIN_GATHER3ALTDIV16SI, + IX86_BUILTIN_GATHER3ALTSIV8DF, + IX86_BUILTIN_GATHER3ALTSIV8DI, + IX86_BUILTIN_GATHER3DIV16SF, + IX86_BUILTIN_GATHER3DIV16SI, + IX86_BUILTIN_GATHER3DIV8DF, + IX86_BUILTIN_GATHER3DIV8DI, + IX86_BUILTIN_GATHER3SIV16SF, + IX86_BUILTIN_GATHER3SIV16SI, + IX86_BUILTIN_GATHER3SIV8DF, + IX86_BUILTIN_GATHER3SIV8DI, + IX86_BUILTIN_SCATTERDIV16SF, + IX86_BUILTIN_SCATTERDIV16SI, + IX86_BUILTIN_SCATTERDIV8DF, + IX86_BUILTIN_SCATTERDIV8DI, + IX86_BUILTIN_SCATTERSIV16SF, + IX86_BUILTIN_SCATTERSIV16SI, + IX86_BUILTIN_SCATTERSIV8DF, + IX86_BUILTIN_SCATTERSIV8DI, + + /* AVX512PF */ + IX86_BUILTIN_GATHERPFDPS, + IX86_BUILTIN_GATHERPFQPS, + IX86_BUILTIN_SCATTERPFDPS, + IX86_BUILTIN_SCATTERPFQPS, + IX86_BUILTIN_EXP2PD_MASK, + IX86_BUILTIN_EXP2PS_MASK, + IX86_BUILTIN_EXP2PS, + IX86_BUILTIN_RCP28PD, + IX86_BUILTIN_RCP28PS, + IX86_BUILTIN_RSQRT28PD, + IX86_BUILTIN_RSQRT28PS, + + /* SHA builtins. */ + IX86_BUILTIN_SHA1MSG1, + IX86_BUILTIN_SHA1MSG2, + IX86_BUILTIN_SHA1NEXTE, + IX86_BUILTIN_SHA1RNDS4, + IX86_BUILTIN_SHA256MSG1, + IX86_BUILTIN_SHA256MSG2, + IX86_BUILTIN_SHA256RNDS2, /* TFmode support builtins. */ IX86_BUILTIN_INFQ, @@ -27713,10 +28342,16 @@ enum ix86_builtins IX86_BUILTIN_COPYSIGNQ, /* Vectorizer support builtins. */ + IX86_BUILTIN_CEILPD_VEC_PACK_SFIX512, IX86_BUILTIN_CPYSGNPS, IX86_BUILTIN_CPYSGNPD, IX86_BUILTIN_CPYSGNPS256, + IX86_BUILTIN_CPYSGNPS512, IX86_BUILTIN_CPYSGNPD256, + IX86_BUILTIN_CPYSGNPD512, + IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX512, + IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX512, + /* FMA4 instructions. 
*/ IX86_BUILTIN_VFMADDSS, @@ -28253,6 +28888,39 @@ static const struct builtin_description bdesc_special_args[] = { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored256, "__builtin_ia32_maskstored256", IX86_BUILTIN_MASKSTORED256, UNKNOWN, (int) VOID_FTYPE_PV8SI_V8SI_V8SI }, { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq256, "__builtin_ia32_maskstoreq256", IX86_BUILTIN_MASKSTOREQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI_V4DI }, + /* AVX512F */ + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_compressstorev16sf_mask, "__builtin_ia32_compressstoresf512_mask", IX86_BUILTIN_COMPRESSPSSTORE512, UNKNOWN, (int) VOID_FTYPE_PV16SF_V16SF_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_compressstorev16si_mask, "__builtin_ia32_compressstoresi512_mask", IX86_BUILTIN_PCOMPRESSDSTORE512, UNKNOWN, (int) VOID_FTYPE_PV16SI_V16SI_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_compressstorev8df_mask, "__builtin_ia32_compressstoredf512_mask", IX86_BUILTIN_COMPRESSPDSTORE512, UNKNOWN, (int) VOID_FTYPE_PV8DF_V8DF_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_compressstorev8di_mask, "__builtin_ia32_compressstoredi512_mask", IX86_BUILTIN_PCOMPRESSQSTORE512, UNKNOWN, (int) VOID_FTYPE_PV8DI_V8DI_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_expandv16sf_mask, "__builtin_ia32_expandloadsf512_mask", IX86_BUILTIN_EXPANDPSLOAD512, UNKNOWN, (int) V16SF_FTYPE_PCV16SF_V16SF_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_expandv16sf_maskz, "__builtin_ia32_expandloadsf512_maskz", IX86_BUILTIN_EXPANDPSLOAD512Z, UNKNOWN, (int) V16SF_FTYPE_PCV16SF_V16SF_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_expandv16si_mask, "__builtin_ia32_expandloadsi512_mask", IX86_BUILTIN_PEXPANDDLOAD512, UNKNOWN, (int) V16SI_FTYPE_PCV16SI_V16SI_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_expandv16si_maskz, "__builtin_ia32_expandloadsi512_maskz", IX86_BUILTIN_PEXPANDDLOAD512Z, UNKNOWN, (int) V16SI_FTYPE_PCV16SI_V16SI_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_expandv8df_mask, "__builtin_ia32_expandloaddf512_mask", IX86_BUILTIN_EXPANDPDLOAD512, UNKNOWN, (int) V8DF_FTYPE_PCV8DF_V8DF_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_expandv8df_maskz, "__builtin_ia32_expandloaddf512_maskz", IX86_BUILTIN_EXPANDPDLOAD512Z, UNKNOWN, (int) V8DF_FTYPE_PCV8DF_V8DF_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_expandv8di_mask, "__builtin_ia32_expandloaddi512_mask", IX86_BUILTIN_PEXPANDQLOAD512, UNKNOWN, (int) V8DI_FTYPE_PCV8DI_V8DI_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_expandv8di_maskz, "__builtin_ia32_expandloaddi512_maskz", IX86_BUILTIN_PEXPANDQLOAD512Z, UNKNOWN, (int) V8DI_FTYPE_PCV8DI_V8DI_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_loaddquv16si_mask, "__builtin_ia32_loaddqusi512_mask", IX86_BUILTIN_LOADDQUSI512, UNKNOWN, (int) V16SI_FTYPE_PCV16SI_V16SI_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_loaddquv8di_mask, "__builtin_ia32_loaddqudi512_mask", IX86_BUILTIN_LOADDQUDI512, UNKNOWN, (int) V8DI_FTYPE_PCV8DI_V8DI_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_loadupd512_mask, "__builtin_ia32_loadupd512_mask", IX86_BUILTIN_LOADUPD512, UNKNOWN, (int) V8DF_FTYPE_PCV8DF_V8DF_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_loadups512_mask, "__builtin_ia32_loadups512_mask", IX86_BUILTIN_LOADUPS512, UNKNOWN, (int) V16SF_FTYPE_PCV16SF_V16SF_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_loadv16sf_mask, "__builtin_ia32_loadaps512_mask", IX86_BUILTIN_LOADAPS512, UNKNOWN, (int) V16SF_FTYPE_PCV16SF_V16SF_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_loadv16si_mask, 
"__builtin_ia32_movdqa32load512_mask", IX86_BUILTIN_MOVDQA32LOAD512, UNKNOWN, (int) V16SI_FTYPE_PCV16SI_V16SI_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_loadv8df_mask, "__builtin_ia32_loadapd512_mask", IX86_BUILTIN_LOADAPD512, UNKNOWN, (int) V8DF_FTYPE_PCV8DF_V8DF_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_loadv8di_mask, "__builtin_ia32_movdqa64load512_mask", IX86_BUILTIN_MOVDQA64LOAD512, UNKNOWN, (int) V8DI_FTYPE_PCV8DI_V8DI_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_movntv16sf, "__builtin_ia32_movntps512", IX86_BUILTIN_MOVNTPS512, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V16SF }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_movntv8df, "__builtin_ia32_movntpd512", IX86_BUILTIN_MOVNTPD512, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V8DF }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_movntv8di, "__builtin_ia32_movntdq512", IX86_BUILTIN_MOVNTDQ512, UNKNOWN, (int) VOID_FTYPE_PV8DI_V8DI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_storedquv16si_mask, "__builtin_ia32_storedqusi512_mask", IX86_BUILTIN_STOREDQUSI512, UNKNOWN, (int) VOID_FTYPE_PV16SI_V16SI_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_storedquv8di_mask, "__builtin_ia32_storedqudi512_mask", IX86_BUILTIN_STOREDQUDI512, UNKNOWN, (int) VOID_FTYPE_PV8DI_V8DI_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_storeupd512_mask, "__builtin_ia32_storeupd512_mask", IX86_BUILTIN_STOREUPD512, UNKNOWN, (int) VOID_FTYPE_PV8DF_V8DF_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_storeups512_mask, "__builtin_ia32_storeups512_mask", IX86_BUILTIN_STOREUPS512, UNKNOWN, (int) VOID_FTYPE_PV16SF_V16SF_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_storev16sf_mask, "__builtin_ia32_storeaps512_mask", IX86_BUILTIN_STOREAPS512, UNKNOWN, (int) VOID_FTYPE_PV16SF_V16SF_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_storev16si_mask, "__builtin_ia32_movdqa32store512_mask", IX86_BUILTIN_MOVDQA32STORE512, UNKNOWN, (int) VOID_FTYPE_PV16SI_V16SI_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_storev8df_mask, "__builtin_ia32_storeapd512_mask", IX86_BUILTIN_STOREAPD512, UNKNOWN, (int) VOID_FTYPE_PV8DF_V8DF_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_storev8di_mask, "__builtin_ia32_movdqa64store512_mask", IX86_BUILTIN_MOVDQA64STORE512, UNKNOWN, (int) VOID_FTYPE_PV8DI_V8DI_QI }, + { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_llwpcb, "__builtin_ia32_llwpcb", IX86_BUILTIN_LLWPCB, UNKNOWN, (int) VOID_FTYPE_PVOID }, { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_slwpcb, "__builtin_ia32_slwpcb", IX86_BUILTIN_SLWPCB, UNKNOWN, (int) PVOID_FTYPE_VOID }, { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvalsi3, "__builtin_ia32_lwpval32", IX86_BUILTIN_LWPVAL32, UNKNOWN, (int) VOID_FTYPE_UINT_UINT_UINT }, @@ -29104,6 +29772,361 @@ static const struct builtin_description bdesc_args[] = { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_di3, "__builtin_ia32_pdep_di", IX86_BUILTIN_PDEP64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 }, { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_si3, "__builtin_ia32_pext_si", IX86_BUILTIN_PEXT32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT }, { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_di3, "__builtin_ia32_pext_di", IX86_BUILTIN_PEXT64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 }, + + /* AVX512F */ + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_alignv16si_mask, "__builtin_ia32_alignd512_mask", IX86_BUILTIN_ALIGND512, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_INT_V16SI_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_alignv8di_mask, "__builtin_ia32_alignq512_mask", IX86_BUILTIN_ALIGNQ512, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_INT_V8DI_QI }, + { 
OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_blendmv16si, "__builtin_ia32_blendmd_512_mask", IX86_BUILTIN_BLENDMD512, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_blendmv8df, "__builtin_ia32_blendmpd_512_mask", IX86_BUILTIN_BLENDMPD512, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_blendmv16sf, "__builtin_ia32_blendmps_512_mask", IX86_BUILTIN_BLENDMPS512, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_blendmv8di, "__builtin_ia32_blendmq_512_mask", IX86_BUILTIN_BLENDMQ512, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_broadcastv16sf_mask, "__builtin_ia32_broadcastf32x4_512", IX86_BUILTIN_BROADCASTF32X4_512, UNKNOWN, (int) V16SF_FTYPE_V4SF_V16SF_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_broadcastv8df_mask, "__builtin_ia32_broadcastf64x4_512", IX86_BUILTIN_BROADCASTF64X4_512, UNKNOWN, (int) V8DF_FTYPE_V4DF_V8DF_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_broadcastv16si_mask, "__builtin_ia32_broadcasti32x4_512", IX86_BUILTIN_BROADCASTI32X4_512, UNKNOWN, (int) V16SI_FTYPE_V4SI_V16SI_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_broadcastv8di_mask, "__builtin_ia32_broadcasti64x4_512", IX86_BUILTIN_BROADCASTI64X4_512, UNKNOWN, (int) V8DI_FTYPE_V4DI_V8DI_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vec_dupv8df_mask, "__builtin_ia32_broadcastsd512", IX86_BUILTIN_BROADCASTSD512, UNKNOWN, (int) V8DF_FTYPE_V2DF_V8DF_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vec_dupv16sf_mask, "__builtin_ia32_broadcastss512", IX86_BUILTIN_BROADCASTSS512, UNKNOWN, (int) V16SF_FTYPE_V4SF_V16SF_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_cmpv16si3_mask, "__builtin_ia32_cmpd512_mask", IX86_BUILTIN_CMPD512, UNKNOWN, (int) HI_FTYPE_V16SI_V16SI_INT_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_cmpv8di3_mask, "__builtin_ia32_cmpq512_mask", IX86_BUILTIN_CMPQ512, UNKNOWN, (int) QI_FTYPE_V8DI_V8DI_INT_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_compressv8df_mask, "__builtin_ia32_compressdf512_mask", IX86_BUILTIN_COMPRESSPD512, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_compressv16sf_mask, "__builtin_ia32_compresssf512_mask", IX86_BUILTIN_COMPRESSPS512, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_floatv8siv8df2_mask, "__builtin_ia32_cvtdq2pd512_mask", IX86_BUILTIN_CVTDQ2PD512, UNKNOWN, (int) V8DF_FTYPE_V8SI_V8DF_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vcvtps2ph512_mask, "__builtin_ia32_vcvtps2ph512_mask", IX86_BUILTIN_CVTPS2PH512, UNKNOWN, (int) V16HI_FTYPE_V16SF_INT_V16HI_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_ufloatv8siv8df_mask, "__builtin_ia32_cvtudq2pd512_mask", IX86_BUILTIN_CVTUDQ2PD512, UNKNOWN, (int) V8DF_FTYPE_V8SI_V8DF_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_cvtusi2sd32, "__builtin_ia32_cvtusi2sd32", IX86_BUILTIN_CVTUSI2SD32, UNKNOWN, (int) V2DF_FTYPE_V2DF_UINT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_expandv8df_mask, "__builtin_ia32_expanddf512_mask", IX86_BUILTIN_EXPANDPD512, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_expandv8df_maskz, "__builtin_ia32_expanddf512_maskz", IX86_BUILTIN_EXPANDPD512Z, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_expandv16sf_mask, "__builtin_ia32_expandsf512_mask", IX86_BUILTIN_EXPANDPS512, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_HI }, + { OPTION_MASK_ISA_AVX512F, 
CODE_FOR_avx512f_expandv16sf_maskz, "__builtin_ia32_expandsf512_maskz", IX86_BUILTIN_EXPANDPS512Z, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vextractf32x4_mask, "__builtin_ia32_extractf32x4_mask", IX86_BUILTIN_EXTRACTF32X4, UNKNOWN, (int) V4SF_FTYPE_V16SF_INT_V4SF_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vextractf64x4_mask, "__builtin_ia32_extractf64x4_mask", IX86_BUILTIN_EXTRACTF64X4, UNKNOWN, (int) V4DF_FTYPE_V8DF_INT_V4DF_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vextracti32x4_mask, "__builtin_ia32_extracti32x4_mask", IX86_BUILTIN_EXTRACTI32X4, UNKNOWN, (int) V4SI_FTYPE_V16SI_INT_V4SI_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vextracti64x4_mask, "__builtin_ia32_extracti64x4_mask", IX86_BUILTIN_EXTRACTI64X4, UNKNOWN, (int) V4DI_FTYPE_V8DI_INT_V4DI_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vinsertf32x4_mask, "__builtin_ia32_insertf32x4_mask", IX86_BUILTIN_INSERTF32X4, UNKNOWN, (int) V16SF_FTYPE_V16SF_V4SF_INT_V16SF_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vinsertf64x4_mask, "__builtin_ia32_insertf64x4_mask", IX86_BUILTIN_INSERTF64X4, UNKNOWN, (int) V8DF_FTYPE_V8DF_V4DF_INT_V8DF_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vinserti32x4_mask, "__builtin_ia32_inserti32x4_mask", IX86_BUILTIN_INSERTI32X4, UNKNOWN, (int) V16SI_FTYPE_V16SI_V4SI_INT_V16SI_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vinserti64x4_mask, "__builtin_ia32_inserti64x4_mask", IX86_BUILTIN_INSERTI64X4, UNKNOWN, (int) V8DI_FTYPE_V8DI_V4DI_INT_V8DI_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_loadv8df_mask, "__builtin_ia32_movapd512_mask", IX86_BUILTIN_MOVAPD512, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_loadv16sf_mask, "__builtin_ia32_movaps512_mask", IX86_BUILTIN_MOVAPS512, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_movddup512_mask, "__builtin_ia32_movddup512_mask", IX86_BUILTIN_MOVDDUP512, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_loadv16si_mask, "__builtin_ia32_movdqa32_512_mask", IX86_BUILTIN_MOVDQA32_512, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_loadv8di_mask, "__builtin_ia32_movdqa64_512_mask", IX86_BUILTIN_MOVDQA64_512, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_movshdup512_mask, "__builtin_ia32_movshdup512_mask", IX86_BUILTIN_MOVSHDUP512, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_movsldup512_mask, "__builtin_ia32_movsldup512_mask", IX86_BUILTIN_MOVSLDUP512, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_absv16si2_mask, "__builtin_ia32_pabsd512_mask", IX86_BUILTIN_PABSD512, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_absv8di2_mask, "__builtin_ia32_pabsq512_mask", IX86_BUILTIN_PABSQ512, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_addv16si3_mask, "__builtin_ia32_paddd512_mask", IX86_BUILTIN_PADDD512, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_addv8di3_mask, "__builtin_ia32_paddq512_mask", IX86_BUILTIN_PADDQ512, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_V8DI_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_andv16si3_mask, "__builtin_ia32_pandd512_mask", IX86_BUILTIN_PANDD512, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_HI }, + { OPTION_MASK_ISA_AVX512F, 
CODE_FOR_avx512f_andnotv16si3_mask, "__builtin_ia32_pandnd512_mask", IX86_BUILTIN_PANDND512, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_andnotv8di3_mask, "__builtin_ia32_pandnq512_mask", IX86_BUILTIN_PANDNQ512, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_V8DI_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_andv8di3_mask, "__builtin_ia32_pandq512_mask", IX86_BUILTIN_PANDQ512, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_V8DI_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vec_dupv16si_mask, "__builtin_ia32_pbroadcastd512", IX86_BUILTIN_PBROADCASTD512, UNKNOWN, (int) V16SI_FTYPE_V4SI_V16SI_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vec_dup_gprv16si_mask, "__builtin_ia32_pbroadcastd512_gpr_mask", IX86_BUILTIN_PBROADCASTD512_GPR, UNKNOWN, (int) V16SI_FTYPE_SI_V16SI_HI }, + { OPTION_MASK_ISA_AVX512CD, CODE_FOR_avx512cd_maskb_vec_dupv8di, "__builtin_ia32_broadcastmb512", IX86_BUILTIN_PBROADCASTMB512, UNKNOWN, (int) V8DI_FTYPE_QI }, + { OPTION_MASK_ISA_AVX512CD, CODE_FOR_avx512cd_maskw_vec_dupv16si, "__builtin_ia32_broadcastmw512", IX86_BUILTIN_PBROADCASTMW512, UNKNOWN, (int) V16SI_FTYPE_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vec_dupv8di_mask, "__builtin_ia32_pbroadcastq512", IX86_BUILTIN_PBROADCASTQ512, UNKNOWN, (int) V8DI_FTYPE_V2DI_V8DI_QI }, + { OPTION_MASK_ISA_AVX512F | OPTION_MASK_ISA_64BIT, CODE_FOR_avx512f_vec_dup_gprv8di_mask, "__builtin_ia32_pbroadcastq512_gpr_mask", IX86_BUILTIN_PBROADCASTQ512_GPR, UNKNOWN, (int) V8DI_FTYPE_DI_V8DI_QI }, + { OPTION_MASK_ISA_AVX512F & ~OPTION_MASK_ISA_64BIT, CODE_FOR_avx512f_vec_dup_memv8di_mask, "__builtin_ia32_pbroadcastq512_mem_mask", IX86_BUILTIN_PBROADCASTQ512_MEM, UNKNOWN, (int) V8DI_FTYPE_DI_V8DI_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_eqv16si3_mask, "__builtin_ia32_pcmpeqd512_mask", IX86_BUILTIN_PCMPEQD512_MASK, UNKNOWN, (int) HI_FTYPE_V16SI_V16SI_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_eqv8di3_mask, "__builtin_ia32_pcmpeqq512_mask", IX86_BUILTIN_PCMPEQQ512_MASK, UNKNOWN, (int) QI_FTYPE_V8DI_V8DI_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_gtv16si3_mask, "__builtin_ia32_pcmpgtd512_mask", IX86_BUILTIN_PCMPGTD512_MASK, UNKNOWN, (int) HI_FTYPE_V16SI_V16SI_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_gtv8di3_mask, "__builtin_ia32_pcmpgtq512_mask", IX86_BUILTIN_PCMPGTQ512_MASK, UNKNOWN, (int) QI_FTYPE_V8DI_V8DI_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_compressv16si_mask, "__builtin_ia32_compresssi512_mask", IX86_BUILTIN_PCOMPRESSD512, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_compressv8di_mask, "__builtin_ia32_compressdi512_mask", IX86_BUILTIN_PCOMPRESSQ512, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_expandv16si_mask, "__builtin_ia32_expandsi512_mask", IX86_BUILTIN_PEXPANDD512, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_expandv16si_maskz, "__builtin_ia32_expandsi512_maskz", IX86_BUILTIN_PEXPANDD512Z, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_expandv8di_mask, "__builtin_ia32_expanddi512_mask", IX86_BUILTIN_PEXPANDQ512, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_expandv8di_maskz, "__builtin_ia32_expanddi512_maskz", IX86_BUILTIN_PEXPANDQ512Z, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_smaxv16si3_mask, "__builtin_ia32_pmaxsd512_mask", IX86_BUILTIN_PMAXSD512, UNKNOWN, (int) 
V16SI_FTYPE_V16SI_V16SI_V16SI_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_smaxv8di3_mask, "__builtin_ia32_pmaxsq512_mask", IX86_BUILTIN_PMAXSQ512, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_V8DI_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_umaxv16si3_mask, "__builtin_ia32_pmaxud512_mask", IX86_BUILTIN_PMAXUD512, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_umaxv8di3_mask, "__builtin_ia32_pmaxuq512_mask", IX86_BUILTIN_PMAXUQ512, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_V8DI_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_sminv16si3_mask, "__builtin_ia32_pminsd512_mask", IX86_BUILTIN_PMINSD512, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_sminv8di3_mask, "__builtin_ia32_pminsq512_mask", IX86_BUILTIN_PMINSQ512, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_V8DI_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_uminv16si3_mask, "__builtin_ia32_pminud512_mask", IX86_BUILTIN_PMINUD512, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_uminv8di3_mask, "__builtin_ia32_pminuq512_mask", IX86_BUILTIN_PMINUQ512, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_V8DI_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_truncatev16siv16qi2_mask, "__builtin_ia32_pmovdb512_mask", IX86_BUILTIN_PMOVDB512, UNKNOWN, (int) V16QI_FTYPE_V16SI_V16QI_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_truncatev16siv16hi2_mask, "__builtin_ia32_pmovdw512_mask", IX86_BUILTIN_PMOVDW512, UNKNOWN, (int) V16HI_FTYPE_V16SI_V16HI_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_truncatev8div16qi2_mask, "__builtin_ia32_pmovqb512_mask", IX86_BUILTIN_PMOVQB512, UNKNOWN, (int) V16QI_FTYPE_V8DI_V16QI_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_truncatev8div8si2_mask, "__builtin_ia32_pmovqd512_mask", IX86_BUILTIN_PMOVQD512, UNKNOWN, (int) V8SI_FTYPE_V8DI_V8SI_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_truncatev8div8hi2_mask, "__builtin_ia32_pmovqw512_mask", IX86_BUILTIN_PMOVQW512, UNKNOWN, (int) V8HI_FTYPE_V8DI_V8HI_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_ss_truncatev16siv16qi2_mask, "__builtin_ia32_pmovsdb512_mask", IX86_BUILTIN_PMOVSDB512, UNKNOWN, (int) V16QI_FTYPE_V16SI_V16QI_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_ss_truncatev16siv16hi2_mask, "__builtin_ia32_pmovsdw512_mask", IX86_BUILTIN_PMOVSDW512, UNKNOWN, (int) V16HI_FTYPE_V16SI_V16HI_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_ss_truncatev8div16qi2_mask, "__builtin_ia32_pmovsqb512_mask", IX86_BUILTIN_PMOVSQB512, UNKNOWN, (int) V16QI_FTYPE_V8DI_V16QI_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_ss_truncatev8div8si2_mask, "__builtin_ia32_pmovsqd512_mask", IX86_BUILTIN_PMOVSQD512, UNKNOWN, (int) V8SI_FTYPE_V8DI_V8SI_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_ss_truncatev8div8hi2_mask, "__builtin_ia32_pmovsqw512_mask", IX86_BUILTIN_PMOVSQW512, UNKNOWN, (int) V8HI_FTYPE_V8DI_V8HI_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_sign_extendv16qiv16si2_mask, "__builtin_ia32_pmovsxbd512_mask", IX86_BUILTIN_PMOVSXBD512, UNKNOWN, (int) V16SI_FTYPE_V16QI_V16SI_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_sign_extendv8qiv8di2_mask, "__builtin_ia32_pmovsxbq512_mask", IX86_BUILTIN_PMOVSXBQ512, UNKNOWN, (int) V8DI_FTYPE_V16QI_V8DI_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_sign_extendv8siv8di2_mask, "__builtin_ia32_pmovsxdq512_mask", IX86_BUILTIN_PMOVSXDQ512, UNKNOWN, (int) V8DI_FTYPE_V8SI_V8DI_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_sign_extendv16hiv16si2_mask, "__builtin_ia32_pmovsxwd512_mask", 
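
The pmovdb/pmovsdb/pmovusdb style entries above are the narrowing converts; the plain, s- and us-prefixed builtins differ only in plain truncation versus signed or unsigned saturation. A hedged sketch using the corresponding <immintrin.h> wrappers (assumed, not defined in this hunk):

    /* Three ways to narrow 32-bit lanes to 8-bit lanes.  */
    #include <immintrin.h>

    void
    narrow_three_ways (__m512i v, __m128i out[3])
    {
      out[0] = _mm512_cvtepi32_epi8 (v);    /* truncate: keep the low 8 bits  */
      out[1] = _mm512_cvtsepi32_epi8 (v);   /* signed saturation to [-128, 127]  */
      out[2] = _mm512_cvtusepi32_epi8 (v);  /* unsigned saturation to [0, 255]  */
    }
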
IX86_BUILTIN_PMOVSXWD512, UNKNOWN, (int) V16SI_FTYPE_V16HI_V16SI_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_sign_extendv8hiv8di2_mask, "__builtin_ia32_pmovsxwq512_mask", IX86_BUILTIN_PMOVSXWQ512, UNKNOWN, (int) V8DI_FTYPE_V8HI_V8DI_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_us_truncatev16siv16qi2_mask, "__builtin_ia32_pmovusdb512_mask", IX86_BUILTIN_PMOVUSDB512, UNKNOWN, (int) V16QI_FTYPE_V16SI_V16QI_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_us_truncatev16siv16hi2_mask, "__builtin_ia32_pmovusdw512_mask", IX86_BUILTIN_PMOVUSDW512, UNKNOWN, (int) V16HI_FTYPE_V16SI_V16HI_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_us_truncatev8div16qi2_mask, "__builtin_ia32_pmovusqb512_mask", IX86_BUILTIN_PMOVUSQB512, UNKNOWN, (int) V16QI_FTYPE_V8DI_V16QI_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_us_truncatev8div8si2_mask, "__builtin_ia32_pmovusqd512_mask", IX86_BUILTIN_PMOVUSQD512, UNKNOWN, (int) V8SI_FTYPE_V8DI_V8SI_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_us_truncatev8div8hi2_mask, "__builtin_ia32_pmovusqw512_mask", IX86_BUILTIN_PMOVUSQW512, UNKNOWN, (int) V8HI_FTYPE_V8DI_V8HI_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_zero_extendv16qiv16si2_mask, "__builtin_ia32_pmovzxbd512_mask", IX86_BUILTIN_PMOVZXBD512, UNKNOWN, (int) V16SI_FTYPE_V16QI_V16SI_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_zero_extendv8qiv8di2_mask, "__builtin_ia32_pmovzxbq512_mask", IX86_BUILTIN_PMOVZXBQ512, UNKNOWN, (int) V8DI_FTYPE_V16QI_V8DI_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_zero_extendv8siv8di2_mask, "__builtin_ia32_pmovzxdq512_mask", IX86_BUILTIN_PMOVZXDQ512, UNKNOWN, (int) V8DI_FTYPE_V8SI_V8DI_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_zero_extendv16hiv16si2_mask, "__builtin_ia32_pmovzxwd512_mask", IX86_BUILTIN_PMOVZXWD512, UNKNOWN, (int) V16SI_FTYPE_V16HI_V16SI_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_zero_extendv8hiv8di2_mask, "__builtin_ia32_pmovzxwq512_mask", IX86_BUILTIN_PMOVZXWQ512, UNKNOWN, (int) V8DI_FTYPE_V8HI_V8DI_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_vec_widen_smult_even_v16si_mask, "__builtin_ia32_pmuldq512_mask", IX86_BUILTIN_PMULDQ512, UNKNOWN, (int) V8DI_FTYPE_V16SI_V16SI_V8DI_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_mulv16si3_mask, "__builtin_ia32_pmulld512_mask" , IX86_BUILTIN_PMULLD512, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_vec_widen_umult_even_v16si_mask, "__builtin_ia32_pmuludq512_mask", IX86_BUILTIN_PMULUDQ512, UNKNOWN, (int) V8DI_FTYPE_V16SI_V16SI_V8DI_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_iorv16si3_mask, "__builtin_ia32_pord512_mask", IX86_BUILTIN_PORD512, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_iorv8di3_mask, "__builtin_ia32_porq512_mask", IX86_BUILTIN_PORQ512, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_V8DI_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_rolv16si_mask, "__builtin_ia32_prold512_mask", IX86_BUILTIN_PROLD512, UNKNOWN, (int) V16SI_FTYPE_V16SI_INT_V16SI_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_rolv8di_mask, "__builtin_ia32_prolq512_mask", IX86_BUILTIN_PROLQ512, UNKNOWN, (int) V8DI_FTYPE_V8DI_INT_V8DI_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_rolvv16si_mask, "__builtin_ia32_prolvd512_mask", IX86_BUILTIN_PROLVD512, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_rolvv8di_mask, "__builtin_ia32_prolvq512_mask", IX86_BUILTIN_PROLVQ512, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_V8DI_QI }, + { OPTION_MASK_ISA_AVX512F, 
CODE_FOR_avx512f_rorv16si_mask, "__builtin_ia32_prord512_mask", IX86_BUILTIN_PRORD512, UNKNOWN, (int) V16SI_FTYPE_V16SI_INT_V16SI_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_rorv8di_mask, "__builtin_ia32_prorq512_mask", IX86_BUILTIN_PRORQ512, UNKNOWN, (int) V8DI_FTYPE_V8DI_INT_V8DI_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_rorvv16si_mask, "__builtin_ia32_prorvd512_mask", IX86_BUILTIN_PRORVD512, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_rorvv8di_mask, "__builtin_ia32_prorvq512_mask", IX86_BUILTIN_PRORVQ512, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_V8DI_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_pshufdv3_mask, "__builtin_ia32_pshufd512_mask", IX86_BUILTIN_PSHUFD512, UNKNOWN, (int) V16SI_FTYPE_V16SI_INT_V16SI_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_ashlv16si3_mask, "__builtin_ia32_pslld512_mask", IX86_BUILTIN_PSLLD512, UNKNOWN, (int) V16SI_FTYPE_V16SI_V4SI_V16SI_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_ashlv16si3_mask, "__builtin_ia32_pslldi512_mask", IX86_BUILTIN_PSLLDI512, UNKNOWN, (int) V16SI_FTYPE_V16SI_INT_V16SI_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_ashlv8di3_mask, "__builtin_ia32_psllq512_mask", IX86_BUILTIN_PSLLQ512, UNKNOWN, (int) V8DI_FTYPE_V8DI_V2DI_V8DI_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_ashlv8di3_mask, "__builtin_ia32_psllqi512_mask", IX86_BUILTIN_PSLLQI512, UNKNOWN, (int) V8DI_FTYPE_V8DI_INT_V8DI_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_ashlvv16si_mask, "__builtin_ia32_psllv16si_mask", IX86_BUILTIN_PSLLVV16SI, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_ashlvv8di_mask, "__builtin_ia32_psllv8di_mask", IX86_BUILTIN_PSLLVV8DI, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_V8DI_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_ashrv16si3_mask, "__builtin_ia32_psrad512_mask", IX86_BUILTIN_PSRAD512, UNKNOWN, (int) V16SI_FTYPE_V16SI_V4SI_V16SI_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_ashrv16si3_mask, "__builtin_ia32_psradi512_mask", IX86_BUILTIN_PSRADI512, UNKNOWN, (int) V16SI_FTYPE_V16SI_INT_V16SI_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_ashrv8di3_mask, "__builtin_ia32_psraq512_mask", IX86_BUILTIN_PSRAQ512, UNKNOWN, (int) V8DI_FTYPE_V8DI_V2DI_V8DI_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_ashrv8di3_mask, "__builtin_ia32_psraqi512_mask", IX86_BUILTIN_PSRAQI512, UNKNOWN, (int) V8DI_FTYPE_V8DI_INT_V8DI_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_ashrvv16si_mask, "__builtin_ia32_psrav16si_mask", IX86_BUILTIN_PSRAVV16SI, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_ashrvv8di_mask, "__builtin_ia32_psrav8di_mask", IX86_BUILTIN_PSRAVV8DI, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_V8DI_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_lshrv16si3_mask, "__builtin_ia32_psrld512_mask", IX86_BUILTIN_PSRLD512, UNKNOWN, (int) V16SI_FTYPE_V16SI_V4SI_V16SI_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_lshrv16si3_mask, "__builtin_ia32_psrldi512_mask", IX86_BUILTIN_PSRLDI512, UNKNOWN, (int) V16SI_FTYPE_V16SI_INT_V16SI_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_lshrv8di3_mask, "__builtin_ia32_psrlq512_mask", IX86_BUILTIN_PSRLQ512, UNKNOWN, (int) V8DI_FTYPE_V8DI_V2DI_V8DI_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_lshrv8di3_mask, "__builtin_ia32_psrlqi512_mask", IX86_BUILTIN_PSRLQI512, UNKNOWN, (int) V8DI_FTYPE_V8DI_INT_V8DI_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_lshrvv16si_mask, "__builtin_ia32_psrlv16si_mask", IX86_BUILTIN_PSRLVV16SI, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_HI }, + { 
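
The shift and rotate entries above come in three flavours: an immediate count (pslldi/prold), a count taken from an XMM operand (pslld), and a per-element variable count (psllv/prolv). Seen through the usual intrinsics, a sketch assuming -mavx512f:

    /* The three shift/rotate flavours at the intrinsic level.  */
    #include <immintrin.h>

    __m512i
    shift_flavours (__m512i v, __m512i per_lane_counts)
    {
      __m512i a = _mm512_slli_epi32 (v, 3);               /* immediate count  */
      __m512i b = _mm512_sllv_epi32 (v, per_lane_counts); /* per-element counts  */
      __m512i c = _mm512_rol_epi32 (v, 7);                /* rotate by immediate  */
      /* Combine the results so nothing is optimised away (integer OR).  */
      return _mm512_or_si512 (_mm512_or_si512 (a, b), c);
    }
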
OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_lshrvv8di_mask, "__builtin_ia32_psrlv8di_mask", IX86_BUILTIN_PSRLVV8DI, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_V8DI_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_subv16si3_mask, "__builtin_ia32_psubd512_mask", IX86_BUILTIN_PSUBD512, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_subv8di3_mask, "__builtin_ia32_psubq512_mask", IX86_BUILTIN_PSUBQ512, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_V8DI_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_testmv16si3_mask, "__builtin_ia32_ptestmd512", IX86_BUILTIN_PTESTMD512, UNKNOWN, (int) HI_FTYPE_V16SI_V16SI_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_testmv8di3_mask, "__builtin_ia32_ptestmq512", IX86_BUILTIN_PTESTMQ512, UNKNOWN, (int) QI_FTYPE_V8DI_V8DI_QI }, + { OPTION_MASK_ISA_AVX512CD, CODE_FOR_avx512f_testnmv16si3_mask, "__builtin_ia32_ptestnmd512", IX86_BUILTIN_PTESTNMD512, UNKNOWN, (int) HI_FTYPE_V16SI_V16SI_HI }, + { OPTION_MASK_ISA_AVX512CD, CODE_FOR_avx512f_testnmv8di3_mask, "__builtin_ia32_ptestnmq512", IX86_BUILTIN_PTESTNMQ512, UNKNOWN, (int) QI_FTYPE_V8DI_V8DI_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_interleave_highv16si_mask, "__builtin_ia32_punpckhdq512_mask", IX86_BUILTIN_PUNPCKHDQ512, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_interleave_highv8di_mask, "__builtin_ia32_punpckhqdq512_mask", IX86_BUILTIN_PUNPCKHQDQ512, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_V8DI_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_interleave_lowv16si_mask, "__builtin_ia32_punpckldq512_mask", IX86_BUILTIN_PUNPCKLDQ512, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_interleave_lowv8di_mask, "__builtin_ia32_punpcklqdq512_mask", IX86_BUILTIN_PUNPCKLQDQ512, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_V8DI_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_xorv16si3_mask, "__builtin_ia32_pxord512_mask", IX86_BUILTIN_PXORD512, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_xorv8di3_mask, "__builtin_ia32_pxorq512_mask", IX86_BUILTIN_PXORQ512, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_V8DI_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_rcp14v8df_mask, "__builtin_ia32_rcp14pd512_mask", IX86_BUILTIN_RCP14PD512, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_rcp14v16sf_mask, "__builtin_ia32_rcp14ps512_mask", IX86_BUILTIN_RCP14PS512, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_srcp14v2df, "__builtin_ia32_rcp14sd", IX86_BUILTIN_RCP14SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_srcp14v4sf, "__builtin_ia32_rcp14ss", IX86_BUILTIN_RCP14SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_rsqrt14v8df_mask, "__builtin_ia32_rsqrt14pd512_mask", IX86_BUILTIN_RSQRT14PD512, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_rsqrt14v16sf_mask, "__builtin_ia32_rsqrt14ps512_mask", IX86_BUILTIN_RSQRT14PS512, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_rsqrt14v2df, "__builtin_ia32_rsqrt14sd", IX86_BUILTIN_RSQRT14SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_rsqrt14v4sf, "__builtin_ia32_rsqrt14ss", IX86_BUILTIN_RSQRT14SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_shufpd512_mask, "__builtin_ia32_shufpd512_mask", IX86_BUILTIN_SHUFPD512, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI }, + { 
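
ptestm/ptestnm produce a mask register rather than a vector result, and rcp14/rsqrt14 are the approximations with at most 2^-14 relative error. A short sketch of how the two combine through the intrinsics (names assumed from <immintrin.h>):

    /* Mask-producing test feeding a zero-masked reciprocal approximation.  */
    #include <immintrin.h>

    __m512
    recip_of_odd_lanes (__m512i v, __m512 x)
    {
      /* ptestmd: mask bit i is set when (v[i] & 1) is non-zero.  */
      __mmask16 k = _mm512_test_epi32_mask (v, _mm512_set1_epi32 (1));
      /* rcp14ps with zero-masking: lanes whose mask bit is clear become 0.0f.  */
      return _mm512_maskz_rcp14_ps (k, x);
    }
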
OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_shufps512_mask, "__builtin_ia32_shufps512_mask", IX86_BUILTIN_SHUFPS512, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_shuf_f32x4_mask, "__builtin_ia32_shuf_f32x4_mask", IX86_BUILTIN_SHUF_F32x4, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_shuf_f64x2_mask, "__builtin_ia32_shuf_f64x2_mask", IX86_BUILTIN_SHUF_F64x2, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_shuf_i32x4_mask, "__builtin_ia32_shuf_i32x4_mask", IX86_BUILTIN_SHUF_I32x4, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_INT_V16SI_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_shuf_i64x2_mask, "__builtin_ia32_shuf_i64x2_mask", IX86_BUILTIN_SHUF_I64x2, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_INT_V8DI_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_ucmpv16si3_mask, "__builtin_ia32_ucmpd512_mask", IX86_BUILTIN_UCMPD512, UNKNOWN, (int) HI_FTYPE_V16SI_V16SI_INT_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_ucmpv8di3_mask, "__builtin_ia32_ucmpq512_mask", IX86_BUILTIN_UCMPQ512, UNKNOWN, (int) QI_FTYPE_V8DI_V8DI_INT_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_unpckhpd512_mask, "__builtin_ia32_unpckhpd512_mask", IX86_BUILTIN_UNPCKHPD512, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_V8DF_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_unpckhps512_mask, "__builtin_ia32_unpckhps512_mask", IX86_BUILTIN_UNPCKHPS512, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_V16SF_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_unpcklpd512_mask, "__builtin_ia32_unpcklpd512_mask", IX86_BUILTIN_UNPCKLPD512, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_V8DF_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_unpcklps512_mask, "__builtin_ia32_unpcklps512_mask", IX86_BUILTIN_UNPCKLPS512, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_V16SF_HI }, + { OPTION_MASK_ISA_AVX512CD, CODE_FOR_clzv16si2_mask, "__builtin_ia32_vplzcntd_512_mask", IX86_BUILTIN_VPCLZCNTD512, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_HI }, + { OPTION_MASK_ISA_AVX512CD, CODE_FOR_clzv8di2_mask, "__builtin_ia32_vplzcntq_512_mask", IX86_BUILTIN_VPCLZCNTQ512, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_QI }, + { OPTION_MASK_ISA_AVX512CD, CODE_FOR_conflictv16si_mask, "__builtin_ia32_vpconflictsi_512_mask", IX86_BUILTIN_VPCONFLICTD512, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_HI }, + { OPTION_MASK_ISA_AVX512CD, CODE_FOR_conflictv8di_mask, "__builtin_ia32_vpconflictdi_512_mask", IX86_BUILTIN_VPCONFLICTQ512, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_permv8df_mask, "__builtin_ia32_permdf512_mask", IX86_BUILTIN_VPERMDF512, UNKNOWN, (int) V8DF_FTYPE_V8DF_INT_V8DF_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_permv8di_mask, "__builtin_ia32_permdi512_mask", IX86_BUILTIN_VPERMDI512, UNKNOWN, (int) V8DI_FTYPE_V8DI_INT_V8DI_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vpermi2varv16si3_mask, "__builtin_ia32_vpermi2vard512_mask", IX86_BUILTIN_VPERMI2VARD512, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vpermi2varv8df3_mask, "__builtin_ia32_vpermi2varpd512_mask", IX86_BUILTIN_VPERMI2VARPD512, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DI_V8DF_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vpermi2varv16sf3_mask, "__builtin_ia32_vpermi2varps512_mask", IX86_BUILTIN_VPERMI2VARPS512, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SI_V16SF_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vpermi2varv8di3_mask, "__builtin_ia32_vpermi2varq512_mask", 
IX86_BUILTIN_VPERMI2VARQ512, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_V8DI_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vpermilv8df_mask, "__builtin_ia32_vpermilpd512_mask", IX86_BUILTIN_VPERMILPD512, UNKNOWN, (int) V8DF_FTYPE_V8DF_INT_V8DF_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vpermilv16sf_mask, "__builtin_ia32_vpermilps512_mask", IX86_BUILTIN_VPERMILPS512, UNKNOWN, (int) V16SF_FTYPE_V16SF_INT_V16SF_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vpermilvarv8df3_mask, "__builtin_ia32_vpermilvarpd512_mask", IX86_BUILTIN_VPERMILVARPD512, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DI_V8DF_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vpermilvarv16sf3_mask, "__builtin_ia32_vpermilvarps512_mask", IX86_BUILTIN_VPERMILVARPS512, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SI_V16SF_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vpermt2varv16si3_mask, "__builtin_ia32_vpermt2vard512_mask", IX86_BUILTIN_VPERMT2VARD512, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vpermt2varv16si3_maskz, "__builtin_ia32_vpermt2vard512_maskz", IX86_BUILTIN_VPERMT2VARD512_MASKZ, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vpermt2varv8df3_mask, "__builtin_ia32_vpermt2varpd512_mask", IX86_BUILTIN_VPERMT2VARPD512, UNKNOWN, (int) V8DF_FTYPE_V8DI_V8DF_V8DF_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vpermt2varv8df3_maskz, "__builtin_ia32_vpermt2varpd512_maskz", IX86_BUILTIN_VPERMT2VARPD512_MASKZ, UNKNOWN, (int) V8DF_FTYPE_V8DI_V8DF_V8DF_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vpermt2varv16sf3_mask, "__builtin_ia32_vpermt2varps512_mask", IX86_BUILTIN_VPERMT2VARPS512, UNKNOWN, (int) V16SF_FTYPE_V16SI_V16SF_V16SF_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vpermt2varv16sf3_maskz, "__builtin_ia32_vpermt2varps512_maskz", IX86_BUILTIN_VPERMT2VARPS512_MASKZ, UNKNOWN, (int) V16SF_FTYPE_V16SI_V16SF_V16SF_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vpermt2varv8di3_mask, "__builtin_ia32_vpermt2varq512_mask", IX86_BUILTIN_VPERMT2VARQ512, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_V8DI_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vpermt2varv8di3_maskz, "__builtin_ia32_vpermt2varq512_maskz", IX86_BUILTIN_VPERMT2VARQ512_MASKZ, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_V8DI_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_permvarv8df_mask, "__builtin_ia32_permvardf512_mask", IX86_BUILTIN_VPERMVARDF512, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DI_V8DF_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_permvarv8di_mask, "__builtin_ia32_permvardi512_mask", IX86_BUILTIN_VPERMVARDI512, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_V8DI_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_permvarv16sf_mask, "__builtin_ia32_permvarsf512_mask", IX86_BUILTIN_VPERMVARSF512, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SI_V16SF_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_permvarv16si_mask, "__builtin_ia32_permvarsi512_mask", IX86_BUILTIN_VPERMVARSI512, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vternlogv16si_mask, "__builtin_ia32_pternlogd512_mask", IX86_BUILTIN_VTERNLOGD512_MASK, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_INT_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vternlogv16si_maskz, "__builtin_ia32_pternlogd512_maskz", IX86_BUILTIN_VTERNLOGD512_MASKZ, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_INT_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vternlogv8di_mask, "__builtin_ia32_pternlogq512_mask", IX86_BUILTIN_VTERNLOGQ512_MASK, UNKNOWN, (int) 
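
vpermi2var/vpermt2var are the two-table permutes (the i/t spelling records whether the index operand or the first table operand is overwritten), and vternlogd/q evaluates an arbitrary three-input boolean whose truth table sits in the immediate. A sketch, with intrinsic names assumed from <immintrin.h>:

    /* Two-source permute followed by a bitwise select (vpternlog imm8 0xCA).  */
    #include <immintrin.h>

    __m512i
    pick_and_mix (__m512i lo_table, __m512i hi_table, __m512i idx, __m512i sel)
    {
      /* vpermi2vard: element i comes from lo_table or hi_table as idx[i] directs.  */
      __m512i gathered = _mm512_permutex2var_epi32 (lo_table, idx, hi_table);
      /* imm8 0xCA encodes "first ? second : third", evaluated bit by bit.  */
      return _mm512_ternarylogic_epi32 (sel, lo_table, gathered, 0xCA);
    }
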
V8DI_FTYPE_V8DI_V8DI_V8DI_INT_QI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vternlogv8di_maskz, "__builtin_ia32_pternlogq512_maskz", IX86_BUILTIN_VTERNLOGQ512_MASKZ, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_V8DI_INT_QI }, + + { OPTION_MASK_ISA_AVX512F, CODE_FOR_copysignv16sf3, "__builtin_ia32_copysignps512", IX86_BUILTIN_CPYSGNPS512, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_copysignv8df3, "__builtin_ia32_copysignpd512", IX86_BUILTIN_CPYSGNPD512, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_sqrtv8df2, "__builtin_ia32_sqrtpd512", IX86_BUILTIN_SQRTPD512, UNKNOWN, (int) V8DF_FTYPE_V8DF }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_sqrtv16sf2, "__builtin_ia32_sqrtps512", IX86_BUILTIN_SQRTPS_NR512, UNKNOWN, (int) V16SF_FTYPE_V16SF }, + { OPTION_MASK_ISA_AVX512ER, CODE_FOR_avx512er_exp2v16sf, "__builtin_ia32_exp2ps", IX86_BUILTIN_EXP2PS, UNKNOWN, (int) V16SF_FTYPE_V16SF }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_roundv8df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix512", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX512, UNKNOWN, (int) V16SI_FTYPE_V8DF_V8DF }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_roundpd_vec_pack_sfix512, "__builtin_ia32_floorpd_vec_pack_sfix512", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX512, (enum rtx_code) ROUND_FLOOR, (int) V16SI_FTYPE_V8DF_V8DF_ROUND }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_roundpd_vec_pack_sfix512, "__builtin_ia32_ceilpd_vec_pack_sfix512", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX512, (enum rtx_code) ROUND_CEIL, (int) V16SI_FTYPE_V8DF_V8DF_ROUND }, + + /* Mask arithmetic operations */ + { OPTION_MASK_ISA_AVX512F, CODE_FOR_andhi3, "__builtin_ia32_kandhi", IX86_BUILTIN_KAND16, UNKNOWN, (int) HI_FTYPE_HI_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_kandnhi, "__builtin_ia32_kandnhi", IX86_BUILTIN_KANDN16, UNKNOWN, (int) HI_FTYPE_HI_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_one_cmplhi2, "__builtin_ia32_knothi", IX86_BUILTIN_KNOT16, UNKNOWN, (int) HI_FTYPE_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_iorhi3, "__builtin_ia32_korhi", IX86_BUILTIN_KOR16, UNKNOWN, (int) HI_FTYPE_HI_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_kortestchi, "__builtin_ia32_kortestchi", IX86_BUILTIN_KORTESTC16, UNKNOWN, (int) HI_FTYPE_HI_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_kortestzhi, "__builtin_ia32_kortestzhi", IX86_BUILTIN_KORTESTZ16, UNKNOWN, (int) HI_FTYPE_HI_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_kunpckhi, "__builtin_ia32_kunpckhi", IX86_BUILTIN_KUNPCKBW, UNKNOWN, (int) HI_FTYPE_HI_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_kxnorhi, "__builtin_ia32_kxnorhi", IX86_BUILTIN_KXNOR16, UNKNOWN, (int) HI_FTYPE_HI_HI }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_xorhi3, "__builtin_ia32_kxorhi", IX86_BUILTIN_KXOR16, UNKNOWN, (int) HI_FTYPE_HI_HI }, + + /* SHA */ + { OPTION_MASK_ISA_SSE2, CODE_FOR_sha1msg1, 0, IX86_BUILTIN_SHA1MSG1, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sha1msg2, 0, IX86_BUILTIN_SHA1MSG2, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sha1nexte, 0, IX86_BUILTIN_SHA1NEXTE, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sha1rnds4, 0, IX86_BUILTIN_SHA1RNDS4, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_INT }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sha256msg1, 0, IX86_BUILTIN_SHA256MSG1, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sha256msg2, 0, IX86_BUILTIN_SHA256MSG2, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sha256rnds2, 0, IX86_BUILTIN_SHA256RNDS2, 
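
The kand/kor/kxor/kortest entries just above operate on the 16-bit mask registers themselves; at the source level masks usually come out of comparisons and are then combined with these operations. A hedged sketch:

    /* Combine two comparison masks and test the result (kandhi / kortestzhi).  */
    #include <immintrin.h>

    int
    any_positive_even_lane (__m512i v)
    {
      __mmask16 pos  = _mm512_cmpgt_epi32_mask (v, _mm512_setzero_si512 ());
      __mmask16 even = _mm512_cmpeq_epi32_mask (_mm512_and_si512 (v, _mm512_set1_epi32 (1)),
                                                _mm512_setzero_si512 ());
      __mmask16 both = _mm512_kand (pos, even);
      /* kortestz returns 1 when the OR of its operands is all zero.  */
      return !_mm512_kortestz (both, both);
    }
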
UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_V4SI }, +}; + +/* Builtins with rounding support. */ +static const struct builtin_description bdesc_round_args[] = +{ + /* AVX512F */ + { OPTION_MASK_ISA_AVX512F, CODE_FOR_addv8df3_mask_round, "__builtin_ia32_addpd512_mask", IX86_BUILTIN_ADDPD512, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_V8DF_QI_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_addv16sf3_mask_round, "__builtin_ia32_addps512_mask", IX86_BUILTIN_ADDPS512, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_sse2_vmaddv2df3_round, "__builtin_ia32_addsd_round", IX86_BUILTIN_ADDSD_ROUND, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_sse_vmaddv4sf3_round, "__builtin_ia32_addss_round", IX86_BUILTIN_ADDSS_ROUND, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_cmpv8df3_mask_round, "__builtin_ia32_cmppd512_mask", IX86_BUILTIN_CMPPD512, UNKNOWN, (int) QI_FTYPE_V8DF_V8DF_INT_QI_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_cmpv16sf3_mask_round, "__builtin_ia32_cmpps512_mask", IX86_BUILTIN_CMPPS512, UNKNOWN, (int) HI_FTYPE_V16SF_V16SF_INT_HI_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vmcmpv2df3_mask_round, "__builtin_ia32_cmpsd_mask", IX86_BUILTIN_CMPSD_MASK, UNKNOWN, (int) QI_FTYPE_V2DF_V2DF_INT_QI_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vmcmpv4sf3_mask_round, "__builtin_ia32_cmpss_mask", IX86_BUILTIN_CMPSS_MASK, UNKNOWN, (int) QI_FTYPE_V4SF_V4SF_INT_QI_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_sse2_comi_round, "__builtin_ia32_vcomisd", IX86_BUILTIN_COMIDF, UNKNOWN, (int) INT_FTYPE_V2DF_V2DF_INT_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_sse_comi_round, "__builtin_ia32_vcomiss", IX86_BUILTIN_COMISF, UNKNOWN, (int) INT_FTYPE_V4SF_V4SF_INT_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_floatv16siv16sf2_mask_round, "__builtin_ia32_cvtdq2ps512_mask", IX86_BUILTIN_CVTDQ2PS512, UNKNOWN, (int) V16SF_FTYPE_V16SI_V16SF_HI_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_cvtpd2dq512_mask_round, "__builtin_ia32_cvtpd2dq512_mask", IX86_BUILTIN_CVTPD2DQ512, UNKNOWN, (int) V8SI_FTYPE_V8DF_V8SI_QI_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_cvtpd2ps512_mask_round, "__builtin_ia32_cvtpd2ps512_mask", IX86_BUILTIN_CVTPD2PS512, UNKNOWN, (int) V8SF_FTYPE_V8DF_V8SF_QI_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_ufix_notruncv8dfv8si_mask_round, "__builtin_ia32_cvtpd2udq512_mask", IX86_BUILTIN_CVTPD2UDQ512, UNKNOWN, (int) V8SI_FTYPE_V8DF_V8SI_QI_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vcvtph2ps512_mask_round, "__builtin_ia32_vcvtph2ps512_mask", IX86_BUILTIN_CVTPH2PS512, UNKNOWN, (int) V16SF_FTYPE_V16HI_V16SF_HI_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_fix_notruncv16sfv16si_mask_round, "__builtin_ia32_cvtps2dq512_mask", IX86_BUILTIN_CVTPS2DQ512, UNKNOWN, (int) V16SI_FTYPE_V16SF_V16SI_HI_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_cvtps2pd512_mask_round, "__builtin_ia32_cvtps2pd512_mask", IX86_BUILTIN_CVTPS2PD512, UNKNOWN, (int) V8DF_FTYPE_V8SF_V8DF_QI_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_ufix_notruncv16sfv16si_mask_round, "__builtin_ia32_cvtps2udq512_mask", IX86_BUILTIN_CVTPS2UDQ512, UNKNOWN, (int) V16SI_FTYPE_V16SF_V16SI_HI_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_sse2_cvtsd2ss_round, "__builtin_ia32_cvtsd2ss_round", IX86_BUILTIN_CVTSD2SS_ROUND, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2DF_INT }, + { OPTION_MASK_ISA_AVX512F | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsi2sdq_round, "__builtin_ia32_cvtsi2sd64", 
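
The bdesc_round_args table that begins above differs from the preceding one in that every entry carries one extra immediate operand (the trailing _INT in the signatures) for the embedded rounding / suppress-all-exceptions control. At the user level that operand is the _MM_FROUND_* argument of the *_round intrinsics; a sketch, with the intrinsic and macro names assumed from <immintrin.h>:

    /* Per-instruction rounding override instead of the global MXCSR setting.  */
    #include <immintrin.h>

    __m512d
    add_toward_zero (__m512d a, __m512d b)
    {
      /* Round toward zero and suppress exceptions for this one addition only.  */
      return _mm512_add_round_pd (a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
    }

    __m512d
    add_current_mode (__m512d a, __m512d b)
    {
      /* _MM_FROUND_CUR_DIRECTION keeps whatever MXCSR currently specifies.  */
      return _mm512_add_round_pd (a, b, _MM_FROUND_CUR_DIRECTION);
    }
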
IX86_BUILTIN_CVTSI2SD64, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT64_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_sse_cvtsi2ss_round, "__builtin_ia32_cvtsi2ss32", IX86_BUILTIN_CVTSI2SS32, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT_INT }, + { OPTION_MASK_ISA_AVX512F | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtsi2ssq_round, "__builtin_ia32_cvtsi2ss64", IX86_BUILTIN_CVTSI2SS64, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT64_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_sse2_cvtss2sd_round, "__builtin_ia32_cvtss2sd_round", IX86_BUILTIN_CVTSS2SD_ROUND, UNKNOWN, (int) V2DF_FTYPE_V2DF_V4SF_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_fix_truncv8dfv8si2_mask_round, "__builtin_ia32_cvttpd2dq512_mask", IX86_BUILTIN_CVTTPD2DQ512, UNKNOWN, (int) V8SI_FTYPE_V8DF_V8SI_QI_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_ufix_truncv8dfv8si2_mask_round, "__builtin_ia32_cvttpd2udq512_mask", IX86_BUILTIN_CVTTPD2UDQ512, UNKNOWN, (int) V8SI_FTYPE_V8DF_V8SI_QI_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_fix_truncv16sfv16si2_mask_round, "__builtin_ia32_cvttps2dq512_mask", IX86_BUILTIN_CVTTPS2DQ512, UNKNOWN, (int) V16SI_FTYPE_V16SF_V16SI_HI_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_ufix_truncv16sfv16si2_mask_round, "__builtin_ia32_cvttps2udq512_mask", IX86_BUILTIN_CVTTPS2UDQ512, UNKNOWN, (int) V16SI_FTYPE_V16SF_V16SI_HI_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_ufloatv16siv16sf2_mask_round, "__builtin_ia32_cvtudq2ps512_mask", IX86_BUILTIN_CVTUDQ2PS512, UNKNOWN, (int) V16SF_FTYPE_V16SI_V16SF_HI_INT }, + { OPTION_MASK_ISA_AVX512F | OPTION_MASK_ISA_64BIT, CODE_FOR_cvtusi2sd64_round, "__builtin_ia32_cvtusi2sd64", IX86_BUILTIN_CVTUSI2SD64, UNKNOWN, (int) V2DF_FTYPE_V2DF_UINT64_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_cvtusi2ss32_round, "__builtin_ia32_cvtusi2ss32", IX86_BUILTIN_CVTUSI2SS32, UNKNOWN, (int) V4SF_FTYPE_V4SF_UINT_INT }, + { OPTION_MASK_ISA_AVX512F | OPTION_MASK_ISA_64BIT, CODE_FOR_cvtusi2ss64_round, "__builtin_ia32_cvtusi2ss64", IX86_BUILTIN_CVTUSI2SS64, UNKNOWN, (int) V4SF_FTYPE_V4SF_UINT64_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_divv8df3_mask_round, "__builtin_ia32_divpd512_mask", IX86_BUILTIN_DIVPD512, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_V8DF_QI_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_divv16sf3_mask_round, "__builtin_ia32_divps512_mask", IX86_BUILTIN_DIVPS512, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_sse2_vmdivv2df3_round, "__builtin_ia32_divsd_round", IX86_BUILTIN_DIVSD_ROUND, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_sse_vmdivv4sf3_round, "__builtin_ia32_divss_round", IX86_BUILTIN_DIVSS_ROUND, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_fixupimmv8df_mask_round, "__builtin_ia32_fixupimmpd512_mask", IX86_BUILTIN_FIXUPIMMPD512_MASK, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_fixupimmv8df_maskz_round, "__builtin_ia32_fixupimmpd512_maskz", IX86_BUILTIN_FIXUPIMMPD512_MASKZ, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_fixupimmv16sf_mask_round, "__builtin_ia32_fixupimmps512_mask", IX86_BUILTIN_FIXUPIMMPS512_MASK, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_fixupimmv16sf_maskz_round, "__builtin_ia32_fixupimmps512_maskz", IX86_BUILTIN_FIXUPIMMPS512_MASKZ, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_sfixupimmv2df_mask_round, 
"__builtin_ia32_fixupimmsd_mask", IX86_BUILTIN_FIXUPIMMSD128_MASK, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_sfixupimmv2df_maskz_round, "__builtin_ia32_fixupimmsd_maskz", IX86_BUILTIN_FIXUPIMMSD128_MASKZ, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_sfixupimmv4sf_mask_round, "__builtin_ia32_fixupimmss_mask", IX86_BUILTIN_FIXUPIMMSS128_MASK, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_sfixupimmv4sf_maskz_round, "__builtin_ia32_fixupimmss_maskz", IX86_BUILTIN_FIXUPIMMSS128_MASKZ, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_getexpv8df_mask_round, "__builtin_ia32_getexppd512_mask", IX86_BUILTIN_GETEXPPD512, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_QI_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_getexpv16sf_mask_round, "__builtin_ia32_getexpps512_mask", IX86_BUILTIN_GETEXPPS512, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_HI_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_sgetexpv2df_round, "__builtin_ia32_getexpsd128_round", IX86_BUILTIN_GETEXPSD128, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_sgetexpv4sf_round, "__builtin_ia32_getexpss128_round", IX86_BUILTIN_GETEXPSS128, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_getmantv8df_mask_round, "__builtin_ia32_getmantpd512_mask", IX86_BUILTIN_GETMANTPD512, UNKNOWN, (int) V8DF_FTYPE_V8DF_INT_V8DF_QI_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_getmantv16sf_mask_round, "__builtin_ia32_getmantps512_mask", IX86_BUILTIN_GETMANTPS512, UNKNOWN, (int) V16SF_FTYPE_V16SF_INT_V16SF_HI_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_getmantv2df_round, "__builtin_ia32_getmantsd_round", IX86_BUILTIN_GETMANTSD128, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_getmantv4sf_round, "__builtin_ia32_getmantss_round", IX86_BUILTIN_GETMANTSS128, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_smaxv8df3_mask_round, "__builtin_ia32_maxpd512_mask", IX86_BUILTIN_MAXPD512, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_V8DF_QI_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_smaxv16sf3_mask_round, "__builtin_ia32_maxps512_mask", IX86_BUILTIN_MAXPS512, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_sse2_vmsmaxv2df3_round, "__builtin_ia32_maxsd_round", IX86_BUILTIN_MAXSD_ROUND, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_sse_vmsmaxv4sf3_round, "__builtin_ia32_maxss_round", IX86_BUILTIN_MAXSS_ROUND, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_sminv8df3_mask_round, "__builtin_ia32_minpd512_mask", IX86_BUILTIN_MINPD512, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_V8DF_QI_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_sminv16sf3_mask_round, "__builtin_ia32_minps512_mask", IX86_BUILTIN_MINPS512, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_sse2_vmsminv2df3_round, "__builtin_ia32_minsd_round", IX86_BUILTIN_MINSD_ROUND, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_sse_vmsminv4sf3_round, "__builtin_ia32_minss_round", IX86_BUILTIN_MINSS_ROUND, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_mulv8df3_mask_round, "__builtin_ia32_mulpd512_mask", IX86_BUILTIN_MULPD512, 
UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_V8DF_QI_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_mulv16sf3_mask_round, "__builtin_ia32_mulps512_mask", IX86_BUILTIN_MULPS512, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_sse2_vmmulv2df3_round, "__builtin_ia32_mulsd_round", IX86_BUILTIN_MULSD_ROUND, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_sse_vmmulv4sf3_round, "__builtin_ia32_mulss_round", IX86_BUILTIN_MULSS_ROUND, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_rndscalev8df_mask_round, "__builtin_ia32_rndscalepd_mask", IX86_BUILTIN_RNDSCALEPD, UNKNOWN, (int) V8DF_FTYPE_V8DF_INT_V8DF_QI_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_rndscalev16sf_mask_round, "__builtin_ia32_rndscaleps_mask", IX86_BUILTIN_RNDSCALEPS, UNKNOWN, (int) V16SF_FTYPE_V16SF_INT_V16SF_HI_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_rndscalev2df_round, "__builtin_ia32_rndscalesd_round", IX86_BUILTIN_RNDSCALESD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_rndscalev4sf_round, "__builtin_ia32_rndscaless_round", IX86_BUILTIN_RNDSCALESS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_scalefv8df_mask_round, "__builtin_ia32_scalefpd512_mask", IX86_BUILTIN_SCALEFPD512, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_V8DF_QI_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_scalefv16sf_mask_round, "__builtin_ia32_scalefps512_mask", IX86_BUILTIN_SCALEFPS512, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vmscalefv2df_round, "__builtin_ia32_scalefsd_round", IX86_BUILTIN_SCALEFSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vmscalefv4sf_round, "__builtin_ia32_scalefss_round", IX86_BUILTIN_SCALEFSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_sqrtv8df2_mask_round, "__builtin_ia32_sqrtpd512_mask", IX86_BUILTIN_SQRTPD512_MASK, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_QI_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_sqrtv16sf2_mask_round, "__builtin_ia32_sqrtps512_mask", IX86_BUILTIN_SQRTPS512_MASK, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_HI_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_sse2_vmsqrtv2df2_round, "__builtin_ia32_sqrtsd_round", IX86_BUILTIN_SQRTSD_ROUND, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_sse_vmsqrtv4sf2_round, "__builtin_ia32_sqrtss_round", IX86_BUILTIN_SQRTSS_ROUND, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_subv8df3_mask_round, "__builtin_ia32_subpd512_mask", IX86_BUILTIN_SUBPD512, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_V8DF_QI_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_subv16sf3_mask_round, "__builtin_ia32_subps512_mask", IX86_BUILTIN_SUBPS512, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_sse2_vmsubv2df3_round, "__builtin_ia32_subsd_round", IX86_BUILTIN_SUBSD_ROUND, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_sse_vmsubv4sf3_round, "__builtin_ia32_subss_round", IX86_BUILTIN_SUBSS_ROUND, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_sse2_cvtsd2si_round, "__builtin_ia32_vcvtsd2si32", IX86_BUILTIN_VCVTSD2SI32, UNKNOWN, (int) INT_FTYPE_V2DF_INT }, + { OPTION_MASK_ISA_AVX512F | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsd2siq_round, "__builtin_ia32_vcvtsd2si64", 
IX86_BUILTIN_VCVTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vcvtsd2usi_round, "__builtin_ia32_vcvtsd2usi32", IX86_BUILTIN_VCVTSD2USI32, UNKNOWN, (int) UINT_FTYPE_V2DF_INT }, + { OPTION_MASK_ISA_AVX512F | OPTION_MASK_ISA_64BIT, CODE_FOR_avx512f_vcvtsd2usiq_round, "__builtin_ia32_vcvtsd2usi64", IX86_BUILTIN_VCVTSD2USI64, UNKNOWN, (int) UINT64_FTYPE_V2DF_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_sse_cvtss2si_round, "__builtin_ia32_vcvtss2si32", IX86_BUILTIN_VCVTSS2SI32, UNKNOWN, (int) INT_FTYPE_V4SF_INT }, + { OPTION_MASK_ISA_AVX512F | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtss2siq_round, "__builtin_ia32_vcvtss2si64", IX86_BUILTIN_VCVTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vcvtss2usi_round, "__builtin_ia32_vcvtss2usi32", IX86_BUILTIN_VCVTSS2USI32, UNKNOWN, (int) UINT_FTYPE_V4SF_INT }, + { OPTION_MASK_ISA_AVX512F | OPTION_MASK_ISA_64BIT, CODE_FOR_avx512f_vcvtss2usiq_round, "__builtin_ia32_vcvtss2usi64", IX86_BUILTIN_VCVTSS2USI64, UNKNOWN, (int) UINT64_FTYPE_V4SF_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_sse2_cvttsd2si_round, "__builtin_ia32_vcvttsd2si32", IX86_BUILTIN_VCVTTSD2SI32, UNKNOWN, (int) INT_FTYPE_V2DF_INT }, + { OPTION_MASK_ISA_AVX512F | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvttsd2siq_round, "__builtin_ia32_vcvttsd2si64", IX86_BUILTIN_VCVTTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vcvttsd2usi_round, "__builtin_ia32_vcvttsd2usi32", IX86_BUILTIN_VCVTTSD2USI32, UNKNOWN, (int) UINT_FTYPE_V2DF_INT }, + { OPTION_MASK_ISA_AVX512F | OPTION_MASK_ISA_64BIT, CODE_FOR_avx512f_vcvttsd2usiq_round, "__builtin_ia32_vcvttsd2usi64", IX86_BUILTIN_VCVTTSD2USI64, UNKNOWN, (int) UINT64_FTYPE_V2DF_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_sse_cvttss2si_round, "__builtin_ia32_vcvttss2si32", IX86_BUILTIN_VCVTTSS2SI32, UNKNOWN, (int) INT_FTYPE_V4SF_INT }, + { OPTION_MASK_ISA_AVX512F | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvttss2siq_round, "__builtin_ia32_vcvttss2si64", IX86_BUILTIN_VCVTTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vcvttss2usi_round, "__builtin_ia32_vcvttss2usi32", IX86_BUILTIN_VCVTTSS2USI32, UNKNOWN, (int) UINT_FTYPE_V4SF_INT }, + { OPTION_MASK_ISA_AVX512F | OPTION_MASK_ISA_64BIT, CODE_FOR_avx512f_vcvttss2usiq_round, "__builtin_ia32_vcvttss2usi64", IX86_BUILTIN_VCVTTSS2USI64, UNKNOWN, (int) UINT64_FTYPE_V4SF_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_fmadd_v8df_mask_round, "__builtin_ia32_vfmaddpd512_mask", IX86_BUILTIN_VFMADDPD512_MASK, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_V8DF_QI_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_fmadd_v8df_mask3_round, "__builtin_ia32_vfmaddpd512_mask3", IX86_BUILTIN_VFMADDPD512_MASK3, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_V8DF_QI_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_fmadd_v8df_maskz_round, "__builtin_ia32_vfmaddpd512_maskz", IX86_BUILTIN_VFMADDPD512_MASKZ, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_V8DF_QI_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_fmadd_v16sf_mask_round, "__builtin_ia32_vfmaddps512_mask", IX86_BUILTIN_VFMADDPS512_MASK, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_fmadd_v16sf_mask3_round, "__builtin_ia32_vfmaddps512_mask3", IX86_BUILTIN_VFMADDPS512_MASK3, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_fmadd_v16sf_maskz_round, "__builtin_ia32_vfmaddps512_maskz", IX86_BUILTIN_VFMADDPS512_MASKZ, 
UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_fmai_vmfmadd_v2df_round, "__builtin_ia32_vfmaddsd3_round", IX86_BUILTIN_VFMADDSD3_ROUND, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_fmai_vmfmadd_v4sf_round, "__builtin_ia32_vfmaddss3_round", IX86_BUILTIN_VFMADDSS3_ROUND, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_fmaddsub_v8df_mask_round, "__builtin_ia32_vfmaddsubpd512_mask", IX86_BUILTIN_VFMADDSUBPD512_MASK, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_V8DF_QI_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_fmaddsub_v8df_mask3_round, "__builtin_ia32_vfmaddsubpd512_mask3", IX86_BUILTIN_VFMADDSUBPD512_MASK3, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_V8DF_QI_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_fmaddsub_v8df_maskz_round, "__builtin_ia32_vfmaddsubpd512_maskz", IX86_BUILTIN_VFMADDSUBPD512_MASKZ, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_V8DF_QI_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_fmaddsub_v16sf_mask_round, "__builtin_ia32_vfmaddsubps512_mask", IX86_BUILTIN_VFMADDSUBPS512_MASK, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_fmaddsub_v16sf_mask3_round, "__builtin_ia32_vfmaddsubps512_mask3", IX86_BUILTIN_VFMADDSUBPS512_MASK3, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_fmaddsub_v16sf_maskz_round, "__builtin_ia32_vfmaddsubps512_maskz", IX86_BUILTIN_VFMADDSUBPS512_MASKZ, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_fmsubadd_v8df_mask3_round, "__builtin_ia32_vfmsubaddpd512_mask3", IX86_BUILTIN_VFMSUBADDPD512_MASK3, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_V8DF_QI_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_fmsubadd_v16sf_mask3_round, "__builtin_ia32_vfmsubaddps512_mask3", IX86_BUILTIN_VFMSUBADDPS512_MASK3, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_fmsub_v8df_mask3_round, "__builtin_ia32_vfmsubpd512_mask3", IX86_BUILTIN_VFMSUBPD512_MASK3, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_V8DF_QI_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_fmsub_v16sf_mask3_round, "__builtin_ia32_vfmsubps512_mask3", IX86_BUILTIN_VFMSUBPS512_MASK3, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_fnmadd_v8df_mask_round, "__builtin_ia32_vfnmaddpd512_mask", IX86_BUILTIN_VFNMADDPD512_MASK, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_V8DF_QI_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_fnmadd_v16sf_mask_round, "__builtin_ia32_vfnmaddps512_mask", IX86_BUILTIN_VFNMADDPS512_MASK, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_fnmsub_v8df_mask_round, "__builtin_ia32_vfnmsubpd512_mask", IX86_BUILTIN_VFNMSUBPD512_MASK, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_V8DF_QI_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_fnmsub_v8df_mask3_round, "__builtin_ia32_vfnmsubpd512_mask3", IX86_BUILTIN_VFNMSUBPD512_MASK3, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_V8DF_QI_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_fnmsub_v16sf_mask_round, "__builtin_ia32_vfnmsubps512_mask", IX86_BUILTIN_VFNMSUBPS512_MASK, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT }, + { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_fnmsub_v16sf_mask3_round, "__builtin_ia32_vfnmsubps512_mask3", IX86_BUILTIN_VFNMSUBPS512_MASK3, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT }, + + /* 
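
The vfmadd*512 entries come in _mask, _mask3 and _maskz variants according to which operand the masked-off lanes are taken from (first multiplicand, addend, or zero). A minimal sketch of the zero-masking form with an explicit rounding mode (intrinsic names assumed from <immintrin.h>):

    /* Fused multiply-add, round-to-nearest, zero-masked.  */
    #include <immintrin.h>

    __m512
    fma_selected_lanes (__mmask16 k, __m512 a, __m512 b, __m512 c)
    {
      /* Lanes with a clear mask bit become 0.0f instead of a*b+c.  */
      return _mm512_maskz_fmadd_round_ps (k, a, b, c,
                                          _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
    }
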
AVX512ER */ + { OPTION_MASK_ISA_AVX512ER, CODE_FOR_avx512er_exp2v8df_mask_round, "__builtin_ia32_exp2pd_mask", IX86_BUILTIN_EXP2PD_MASK, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_QI_INT }, + { OPTION_MASK_ISA_AVX512ER, CODE_FOR_avx512er_exp2v16sf_mask_round, "__builtin_ia32_exp2ps_mask", IX86_BUILTIN_EXP2PS_MASK, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_HI_INT }, + { OPTION_MASK_ISA_AVX512ER, CODE_FOR_avx512er_rcp28v8df_mask_round, "__builtin_ia32_rcp28pd_mask", IX86_BUILTIN_RCP28PD, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_QI_INT }, + { OPTION_MASK_ISA_AVX512ER, CODE_FOR_avx512er_rcp28v16sf_mask_round, "__builtin_ia32_rcp28ps_mask", IX86_BUILTIN_RCP28PS, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_HI_INT }, + { OPTION_MASK_ISA_AVX512ER, CODE_FOR_avx512er_rsqrt28v8df_mask_round, "__builtin_ia32_rsqrt28pd_mask", IX86_BUILTIN_RSQRT28PD, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_QI_INT }, + { OPTION_MASK_ISA_AVX512ER, CODE_FOR_avx512er_rsqrt28v16sf_mask_round, "__builtin_ia32_rsqrt28ps_mask", IX86_BUILTIN_RSQRT28PS, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_HI_INT }, }; /* FMA4 and XOP. */ @@ -29550,6 +30573,18 @@ ix86_init_mmx_sse_builtins (void) def_builtin_const (d->mask, d->name, ftype, d->code); } + /* Add all builtins with rounding. */ + for (i = 0, d = bdesc_round_args; + i < ARRAY_SIZE (bdesc_round_args); + i++, d++) + { + if (d->name == 0) + continue; + + ftype = (enum ix86_builtin_func_type) d->flag; + def_builtin_const (d->mask, d->name, ftype, d->code); + } + /* pcmpestr[im] insns. */ for (i = 0, d = bdesc_pcmpestr; i < ARRAY_SIZE (bdesc_pcmpestr); @@ -29718,6 +30753,117 @@ ix86_init_mmx_sse_builtins (void) V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT, IX86_BUILTIN_GATHERALTDIV8SI); + /* AVX512F */ + def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv16sf", + V16SF_FTYPE_V16SF_PCFLOAT_V16SI_HI_INT, + IX86_BUILTIN_GATHER3SIV16SF); + + def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv8df", + V8DF_FTYPE_V8DF_PCDOUBLE_V8SI_QI_INT, + IX86_BUILTIN_GATHER3SIV8DF); + + def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv16sf", + V8SF_FTYPE_V8SF_PCFLOAT_V8DI_QI_INT, + IX86_BUILTIN_GATHER3DIV16SF); + + def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv8df", + V8DF_FTYPE_V8DF_PCDOUBLE_V8DI_QI_INT, + IX86_BUILTIN_GATHER3DIV8DF); + + def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv16si", + V16SI_FTYPE_V16SI_PCINT_V16SI_HI_INT, + IX86_BUILTIN_GATHER3SIV16SI); + + def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv8di", + V8DI_FTYPE_V8DI_PCINT64_V8SI_QI_INT, + IX86_BUILTIN_GATHER3SIV8DI); + + def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv16si", + V8SI_FTYPE_V8SI_PCINT_V8DI_QI_INT, + IX86_BUILTIN_GATHER3DIV16SI); + + def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv8di", + V8DI_FTYPE_V8DI_PCINT64_V8DI_QI_INT, + IX86_BUILTIN_GATHER3DIV8DI); + + def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltsiv8df ", + V8DF_FTYPE_V8DF_PCDOUBLE_V16SI_QI_INT, + IX86_BUILTIN_GATHER3ALTSIV8DF); + + def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltdiv8sf ", + V16SF_FTYPE_V16SF_PCFLOAT_V8DI_HI_INT, + IX86_BUILTIN_GATHER3ALTDIV16SF); + + def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltsiv8di ", + V8DI_FTYPE_V8DI_PCINT64_V16SI_QI_INT, + IX86_BUILTIN_GATHER3ALTSIV8DI); + + def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltdiv8si ", + V16SI_FTYPE_V16SI_PCINT_V8DI_HI_INT, + IX86_BUILTIN_GATHER3ALTDIV16SI); + + def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv16sf", + 
VOID_FTYPE_PFLOAT_HI_V16SI_V16SF_INT,
+               IX86_BUILTIN_SCATTERSIV16SF);
+
+  def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv8df",
+               VOID_FTYPE_PDOUBLE_QI_V8SI_V8DF_INT,
+               IX86_BUILTIN_SCATTERSIV8DF);
+
+  def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv16sf",
+               VOID_FTYPE_PFLOAT_QI_V8DI_V8SF_INT,
+               IX86_BUILTIN_SCATTERDIV16SF);
+
+  def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv8df",
+               VOID_FTYPE_PDOUBLE_QI_V8DI_V8DF_INT,
+               IX86_BUILTIN_SCATTERDIV8DF);
+
+  def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv16si",
+               VOID_FTYPE_PINT_HI_V16SI_V16SI_INT,
+               IX86_BUILTIN_SCATTERSIV16SI);
+
+  def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv8di",
+               VOID_FTYPE_PLONGLONG_QI_V8SI_V8DI_INT,
+               IX86_BUILTIN_SCATTERSIV8DI);
+
+  def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv16si",
+               VOID_FTYPE_PINT_QI_V8DI_V8SI_INT,
+               IX86_BUILTIN_SCATTERDIV16SI);
+
+  def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv8di",
+               VOID_FTYPE_PLONGLONG_QI_V8DI_V8DI_INT,
+               IX86_BUILTIN_SCATTERDIV8DI);
+
+  /* AVX512PF */
+  def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfdps",
+               VOID_FTYPE_HI_V16SI_PCINT_INT_INT,
+               IX86_BUILTIN_GATHERPFDPS);
+  def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfqps",
+               VOID_FTYPE_QI_V8DI_PCINT_INT_INT,
+               IX86_BUILTIN_GATHERPFQPS);
+  def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfdps",
+               VOID_FTYPE_HI_V16SI_PCINT_INT_INT,
+               IX86_BUILTIN_SCATTERPFDPS);
+  def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfqps",
+               VOID_FTYPE_QI_V8DI_PCINT_INT_INT,
+               IX86_BUILTIN_SCATTERPFQPS);
+
+  /* SHA */
+  def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1msg1",
+                     V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG1);
+  def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1msg2",
+                     V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG2);
+  def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1nexte",
+                     V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1NEXTE);
+  def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1rnds4",
+                     V4SI_FTYPE_V4SI_V4SI_INT, IX86_BUILTIN_SHA1RNDS4);
+  def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256msg1",
+                     V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG1);
+  def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256msg2",
+                     V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG2);
+  def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256rnds2",
+                     V4SI_FTYPE_V4SI_V4SI_V4SI, IX86_BUILTIN_SHA256RNDS2);
+
   /* RTM.
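
The gather and scatter builtins registered above take base pointer, index vector, mask and scale explicitly; the <immintrin.h> wrappers supply an all-ones mask for the unmasked forms. A sketch of the user-level shape (illustrative only; assumes -mavx512f, valid indices and suitably sized buffers):

    /* Strided load and store through the AVX-512F gather/scatter intrinsics.  */
    #include <immintrin.h>

    void
    gather_scale_scatter (const float *table, const int *idx, float *out)
    {
      __m512i vindex = _mm512_loadu_si512 ((const void *) idx);   /* 16 indices  */
      /* gathersiv16sf: loads table[idx[i]] into lane i (scale 4 = sizeof (float)).  */
      __m512 v = _mm512_i32gather_ps (vindex, table, 4);
      v = _mm512_mul_ps (v, _mm512_set1_ps (2.0f));
      /* scattersiv16sf: stores lane i to out[idx[i]].  */
      _mm512_i32scatter_ps (out, vindex, v, 4);
    }
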
*/ def_builtin (OPTION_MASK_ISA_RTM, "__builtin_ia32_xabort", VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT); @@ -29958,16 +31104,21 @@ get_builtin_code_for_version (tree decl, tree *predicate_list) P_SSE3, P_SSSE3, P_PROC_SSSE3, - P_SSE4_a, - P_PROC_SSE4_a, + P_SSE4_A, + P_PROC_SSE4_A, P_SSE4_1, P_SSE4_2, P_PROC_SSE4_2, P_POPCNT, P_AVX, + P_PROC_AVX, + P_FMA4, + P_XOP, + P_PROC_XOP, + P_FMA, + P_PROC_FMA, P_AVX2, - P_FMA, - P_PROC_FMA + P_PROC_AVX2 }; enum feature_priority priority = P_ZERO; @@ -29986,11 +31137,15 @@ get_builtin_code_for_version (tree decl, tree *predicate_list) {"sse", P_SSE}, {"sse2", P_SSE2}, {"sse3", P_SSE3}, + {"sse4a", P_SSE4_A}, {"ssse3", P_SSSE3}, {"sse4.1", P_SSE4_1}, {"sse4.2", P_SSE4_2}, {"popcnt", P_POPCNT}, {"avx", P_AVX}, + {"fma4", P_FMA4}, + {"xop", P_XOP}, + {"fma", P_FMA}, {"avx2", P_AVX2} }; @@ -30038,30 +31193,57 @@ get_builtin_code_for_version (tree decl, tree *predicate_list) arg_str = "core2"; priority = P_PROC_SSSE3; break; - case PROCESSOR_COREI7: + case PROCESSOR_NEHALEM: + /* We translate "arch=corei7" and "arch=nehelam" to + "corei7" so that it will be mapped to M_INTEL_COREI7 + as cpu type to cover all M_INTEL_COREI7_XXXs. */ arg_str = "corei7"; priority = P_PROC_SSE4_2; break; - case PROCESSOR_COREI7_AVX: - arg_str = "corei7-avx"; - priority = P_PROC_SSE4_2; - break; - case PROCESSOR_ATOM: - arg_str = "atom"; + case PROCESSOR_SANDYBRIDGE: + arg_str = "sandybridge"; + priority = P_PROC_AVX; + break; + case PROCESSOR_HASWELL: + arg_str = "haswell"; + priority = P_PROC_AVX2; + break; + case PROCESSOR_BONNELL: + arg_str = "bonnell"; priority = P_PROC_SSSE3; break; + case PROCESSOR_SILVERMONT: + arg_str = "silvermont"; + priority = P_PROC_SSE4_2; + break; case PROCESSOR_AMDFAM10: arg_str = "amdfam10h"; - priority = P_PROC_SSE4_a; + priority = P_PROC_SSE4_A; + break; + case PROCESSOR_BTVER1: + arg_str = "btver1"; + priority = P_PROC_SSE4_A; + break; + case PROCESSOR_BTVER2: + arg_str = "btver2"; + priority = P_PROC_AVX; break; case PROCESSOR_BDVER1: arg_str = "bdver1"; - priority = P_PROC_FMA; + priority = P_PROC_XOP; break; case PROCESSOR_BDVER2: arg_str = "bdver2"; priority = P_PROC_FMA; break; + case PROCESSOR_BDVER3: + arg_str = "bdver3"; + priority = P_PROC_FMA; + break; + case PROCESSOR_BDVER4: + arg_str = "bdver4"; + priority = P_PROC_AVX2; + break; } } @@ -30819,7 +32001,7 @@ ix86_generate_version_dispatcher_body (void *node_p) push_cfun (DECL_STRUCT_FUNCTION (resolver_decl)); - stack_vec<tree, 2> fn_ver_vec; + auto_vec<tree, 2> fn_ver_vec; for (versn_info = node_version_info->next; versn_info; versn_info = versn_info->next) @@ -30926,6 +32108,10 @@ fold_builtin_cpu (tree fndecl, tree *args) F_SSE4_2, F_AVX, F_AVX2, + F_SSE4_A, + F_FMA4, + F_XOP, + F_FMA, F_MAX }; @@ -30937,12 +32123,14 @@ fold_builtin_cpu (tree fndecl, tree *args) M_INTEL = 1, M_AMD, M_CPU_TYPE_START, - M_INTEL_ATOM, + M_INTEL_BONNELL, M_INTEL_CORE2, M_INTEL_COREI7, M_AMDFAM10H, M_AMDFAM15H, - M_INTEL_SLM, + M_INTEL_SILVERMONT, + M_AMD_BTVER1, + M_AMD_BTVER2, M_CPU_SUBTYPE_START, M_INTEL_COREI7_NEHALEM, M_INTEL_COREI7_WESTMERE, @@ -30953,7 +32141,9 @@ fold_builtin_cpu (tree fndecl, tree *args) M_AMDFAM15H_BDVER1, M_AMDFAM15H_BDVER2, M_AMDFAM15H_BDVER3, - M_AMDFAM15H_BDVER4 + M_AMDFAM15H_BDVER4, + M_INTEL_COREI7_IVYBRIDGE, + M_INTEL_COREI7_HASWELL }; static struct _arch_names_table @@ -30965,22 +32155,28 @@ fold_builtin_cpu (tree fndecl, tree *args) { {"amd", M_AMD}, {"intel", M_INTEL}, - {"atom", M_INTEL_ATOM}, - {"slm", M_INTEL_SLM}, + {"atom", M_INTEL_BONNELL}, + {"slm", 
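
The _arch_names_table rows added here are what __builtin_cpu_is() matches, with "atom" and "slm" kept as aliases for the new "bonnell" and "silvermont" names. A hedged run-time dispatch sketch using only CPU names that appear in this hunk:

    /* Choosing a code path at run time via the new model names.  */
    #include <stdio.h>

    int
    main (void)
    {
      __builtin_cpu_init ();   /* initialise the CPU model/feature data  */
      if (__builtin_cpu_is ("silvermont") || __builtin_cpu_is ("bonnell"))
        puts ("small-core Intel path");
      else if (__builtin_cpu_is ("btver1") || __builtin_cpu_is ("btver2"))
        puts ("AMD btver path");
      else
        puts ("generic path");
      return 0;
    }
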
M_INTEL_SILVERMONT}, {"core2", M_INTEL_CORE2}, {"corei7", M_INTEL_COREI7}, {"nehalem", M_INTEL_COREI7_NEHALEM}, {"westmere", M_INTEL_COREI7_WESTMERE}, {"sandybridge", M_INTEL_COREI7_SANDYBRIDGE}, + {"ivybridge", M_INTEL_COREI7_IVYBRIDGE}, + {"haswell", M_INTEL_COREI7_HASWELL}, + {"bonnell", M_INTEL_BONNELL}, + {"silvermont", M_INTEL_SILVERMONT}, {"amdfam10h", M_AMDFAM10H}, {"barcelona", M_AMDFAM10H_BARCELONA}, {"shanghai", M_AMDFAM10H_SHANGHAI}, {"istanbul", M_AMDFAM10H_ISTANBUL}, + {"btver1", M_AMD_BTVER1}, {"amdfam15h", M_AMDFAM15H}, {"bdver1", M_AMDFAM15H_BDVER1}, {"bdver2", M_AMDFAM15H_BDVER2}, {"bdver3", M_AMDFAM15H_BDVER3}, {"bdver4", M_AMDFAM15H_BDVER4}, + {"btver2", M_AMD_BTVER2}, }; static struct _isa_names_table @@ -30997,9 +32193,13 @@ fold_builtin_cpu (tree fndecl, tree *args) {"sse2", F_SSE2}, {"sse3", F_SSE3}, {"ssse3", F_SSSE3}, + {"sse4a", F_SSE4_A}, {"sse4.1", F_SSE4_1}, {"sse4.2", F_SSE4_2}, {"avx", F_AVX}, + {"fma4", F_FMA4}, + {"xop", F_XOP}, + {"fma", F_FMA}, {"avx2", F_AVX2} }; @@ -32066,12 +33266,13 @@ ix86_expand_args_builtin (const struct builtin_description *d, rtx pat, real_target; unsigned int i, nargs; unsigned int nargs_constant = 0; + unsigned int mask_pos = 0; int num_memory = 0; struct { rtx op; enum machine_mode mode; - } args[4]; + } args[6]; bool last_arg_count = false; enum insn_code icode = d->icode; const struct insn_data_d *insn_p = &insn_data[icode]; @@ -32091,6 +33292,7 @@ ix86_expand_args_builtin (const struct builtin_description *d, return ix86_expand_sse_round (d, exp, target); case V4SI_FTYPE_V2DF_V2DF_ROUND: case V8SI_FTYPE_V4DF_V4DF_ROUND: + case V16SI_FTYPE_V8DF_V8DF_ROUND: return ix86_expand_sse_round_vec_pack_sfix (d, exp, target); case INT_FTYPE_V8SF_V8SF_PTEST: case INT_FTYPE_V4DI_V4DI_PTEST: @@ -32169,6 +33371,32 @@ ix86_expand_args_builtin (const struct builtin_description *d, case V4DI_FTYPE_V8HI: case V4DI_FTYPE_V4SI: case V4DI_FTYPE_V2DI: + case HI_FTYPE_HI: + case UINT_FTYPE_V2DF: + case UINT_FTYPE_V4SF: + case UINT64_FTYPE_V2DF: + case UINT64_FTYPE_V4SF: + case V16QI_FTYPE_V8DI: + case V16HI_FTYPE_V16SI: + case V16SI_FTYPE_HI: + case V16SI_FTYPE_V16SI: + case V16SI_FTYPE_INT: + case V16SF_FTYPE_FLOAT: + case V16SF_FTYPE_V4SF: + case V16SF_FTYPE_V16SF: + case V8HI_FTYPE_V8DI: + case V8UHI_FTYPE_V8UHI: + case V8SI_FTYPE_V8DI: + case V8USI_FTYPE_V8USI: + case V8SF_FTYPE_V8DF: + case V8DI_FTYPE_QI: + case V8DI_FTYPE_INT64: + case V8DI_FTYPE_V4DI: + case V8DI_FTYPE_V8DI: + case V8DF_FTYPE_DOUBLE: + case V8DF_FTYPE_V4DF: + case V8DF_FTYPE_V8DF: + case V8DF_FTYPE_V8SI: nargs = 1; break; case V4SF_FTYPE_V4SF_VEC_MERGE: @@ -32177,6 +33405,9 @@ ix86_expand_args_builtin (const struct builtin_description *d, case FLOAT128_FTYPE_FLOAT128_FLOAT128: case V16QI_FTYPE_V16QI_V16QI: case V16QI_FTYPE_V8HI_V8HI: + case V16SI_FTYPE_V16SI_V16SI: + case V16SF_FTYPE_V16SF_V16SF: + case V16SF_FTYPE_V16SF_V16SI: case V8QI_FTYPE_V8QI_V8QI: case V8QI_FTYPE_V4HI_V4HI: case V8HI_FTYPE_V8HI_V8HI: @@ -32184,6 +33415,9 @@ ix86_expand_args_builtin (const struct builtin_description *d, case V8HI_FTYPE_V4SI_V4SI: case V8SF_FTYPE_V8SF_V8SF: case V8SF_FTYPE_V8SF_V8SI: + case V8DI_FTYPE_V8DI_V8DI: + case V8DF_FTYPE_V8DF_V8DF: + case V8DF_FTYPE_V8DF_V8DI: case V4SI_FTYPE_V4SI_V4SI: case V4SI_FTYPE_V8HI_V8HI: case V4SI_FTYPE_V4SF_V4SF: @@ -32197,6 +33431,8 @@ ix86_expand_args_builtin (const struct builtin_description *d, case V4SF_FTYPE_V4SF_V4SI: case V4SF_FTYPE_V4SF_V2SI: case V4SF_FTYPE_V4SF_V2DF: + case V4SF_FTYPE_V4SF_UINT: + case V4SF_FTYPE_V4SF_UINT64: case 
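Illustrative sketch (editorial aside, not part of the upstream patch): the strings added above to arch_names_table and isa_names_table ("bonnell", "silvermont", "btver1", "btver2", "sse4a", "fma4", "xop", "fma") become accepted arguments of GCC's __builtin_cpu_is and __builtin_cpu_supports, which fold_builtin_cpu lowers into checks against the runtime __cpu_model data. A minimal user-level example, assuming a compiler that carries this patch:

#include <stdio.h>

int
main (void)
{
  __builtin_cpu_init ();                  /* populate __cpu_model */
  if (__builtin_cpu_is ("silvermont"))    /* new M_INTEL_SILVERMONT entry */
    puts ("running on a Silvermont core");
  if (__builtin_cpu_supports ("sse4a")    /* new F_SSE4_A entry */
      && __builtin_cpu_supports ("xop"))  /* new F_XOP entry */
    puts ("SSE4A and XOP are available");
  return 0;
}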
V4SF_FTYPE_V4SF_DI: case V4SF_FTYPE_V4SF_SI: case V2DI_FTYPE_V2DI_V2DI: @@ -32213,6 +33449,8 @@ ix86_expand_args_builtin (const struct builtin_description *d, case V2DF_FTYPE_V2DF_V2DI: case V2DF_FTYPE_V2DF_DI: case V2DF_FTYPE_V2DF_SI: + case V2DF_FTYPE_V2DF_UINT: + case V2DF_FTYPE_V2DF_UINT64: case V2SF_FTYPE_V2SF_V2SF: case V1DI_FTYPE_V1DI_V1DI: case V1DI_FTYPE_V8QI_V8QI: @@ -32228,6 +33466,8 @@ ix86_expand_args_builtin (const struct builtin_description *d, case V4DI_FTYPE_V4DI_V4DI: case V4DI_FTYPE_V8SI_V8SI: case V4UDI_FTYPE_V8USI_V8USI: + case QI_FTYPE_V8DI_V8DI: + case HI_FTYPE_V16SI_V16SI: if (comparison == UNKNOWN) return ix86_expand_binop_builtin (icode, exp, target); nargs = 2; @@ -32265,6 +33505,8 @@ ix86_expand_args_builtin (const struct builtin_description *d, case UINT_FTYPE_UINT_UCHAR: case UINT16_FTYPE_UINT16_INT: case UINT8_FTYPE_UINT8_INT: + case HI_FTYPE_HI_HI: + case V16SI_FTYPE_V8DF_V8DF: nargs = 2; break; case V2DI_FTYPE_V2DI_INT_CONVERT: @@ -32279,12 +33521,16 @@ ix86_expand_args_builtin (const struct builtin_description *d, break; case V8HI_FTYPE_V8HI_INT: case V8HI_FTYPE_V8SF_INT: + case V16HI_FTYPE_V16SF_INT: case V8HI_FTYPE_V4SF_INT: case V8SF_FTYPE_V8SF_INT: + case V4SF_FTYPE_V16SF_INT: + case V16SF_FTYPE_V16SF_INT: case V4SI_FTYPE_V4SI_INT: case V4SI_FTYPE_V8SI_INT: case V4HI_FTYPE_V4HI_INT: case V4DF_FTYPE_V4DF_INT: + case V4DF_FTYPE_V8DF_INT: case V4SF_FTYPE_V4SF_INT: case V4SF_FTYPE_V8SF_INT: case V2DI_FTYPE_V2DI_INT: @@ -32292,8 +33538,12 @@ ix86_expand_args_builtin (const struct builtin_description *d, case V2DF_FTYPE_V4DF_INT: case V16HI_FTYPE_V16HI_INT: case V8SI_FTYPE_V8SI_INT: + case V16SI_FTYPE_V16SI_INT: + case V4SI_FTYPE_V16SI_INT: case V4DI_FTYPE_V4DI_INT: case V2DI_FTYPE_V4DI_INT: + case V4DI_FTYPE_V8DI_INT: + case HI_FTYPE_HI_INT: nargs = 2; nargs_constant = 1; break; @@ -32303,6 +33553,47 @@ ix86_expand_args_builtin (const struct builtin_description *d, case V4SF_FTYPE_V4SF_V4SF_V4SF: case V2DF_FTYPE_V2DF_V2DF_V2DF: case V32QI_FTYPE_V32QI_V32QI_V32QI: + case HI_FTYPE_V16SI_V16SI_HI: + case QI_FTYPE_V8DI_V8DI_QI: + case V16HI_FTYPE_V16SI_V16HI_HI: + case V16QI_FTYPE_V16SI_V16QI_HI: + case V16QI_FTYPE_V8DI_V16QI_QI: + case V16SF_FTYPE_V16SF_V16SF_HI: + case V16SF_FTYPE_V16SF_V16SF_V16SF: + case V16SF_FTYPE_V16SF_V16SI_V16SF: + case V16SF_FTYPE_V16SI_V16SF_HI: + case V16SF_FTYPE_V16SI_V16SF_V16SF: + case V16SF_FTYPE_V4SF_V16SF_HI: + case V16SI_FTYPE_SI_V16SI_HI: + case V16SI_FTYPE_V16HI_V16SI_HI: + case V16SI_FTYPE_V16QI_V16SI_HI: + case V16SI_FTYPE_V16SF_V16SI_HI: + case V16SI_FTYPE_V16SI_V16SI_HI: + case V16SI_FTYPE_V16SI_V16SI_V16SI: + case V16SI_FTYPE_V4SI_V16SI_HI: + case V2DI_FTYPE_V2DI_V2DI_V2DI: + case V4DI_FTYPE_V4DI_V4DI_V4DI: + case V8DF_FTYPE_V2DF_V8DF_QI: + case V8DF_FTYPE_V4DF_V8DF_QI: + case V8DF_FTYPE_V8DF_V8DF_QI: + case V8DF_FTYPE_V8DF_V8DF_V8DF: + case V8DF_FTYPE_V8DF_V8DI_V8DF: + case V8DF_FTYPE_V8DI_V8DF_V8DF: + case V8DF_FTYPE_V8SF_V8DF_QI: + case V8DF_FTYPE_V8SI_V8DF_QI: + case V8DI_FTYPE_DI_V8DI_QI: + case V8DI_FTYPE_V16QI_V8DI_QI: + case V8DI_FTYPE_V2DI_V8DI_QI: + case V8DI_FTYPE_V4DI_V8DI_QI: + case V8DI_FTYPE_V8DI_V8DI_QI: + case V8DI_FTYPE_V8DI_V8DI_V8DI: + case V8DI_FTYPE_V8HI_V8DI_QI: + case V8DI_FTYPE_V8SI_V8DI_QI: + case V8HI_FTYPE_V8DI_V8HI_QI: + case V8SF_FTYPE_V8DF_V8SF_QI: + case V8SI_FTYPE_V8DF_V8SI_QI: + case V8SI_FTYPE_V8DI_V8SI_QI: + case V4SI_FTYPE_V4SI_V4SI_V4SI: nargs = 3; break; case V32QI_FTYPE_V32QI_V32QI_INT: @@ -32316,11 +33607,20 @@ ix86_expand_args_builtin (const struct builtin_description *d, case 
V8SF_FTYPE_V8SF_V4SF_INT: case V4SI_FTYPE_V4SI_V4SI_INT: case V4DF_FTYPE_V4DF_V4DF_INT: + case V16SF_FTYPE_V16SF_V16SF_INT: + case V16SF_FTYPE_V16SF_V4SF_INT: + case V16SI_FTYPE_V16SI_V4SI_INT: case V4DF_FTYPE_V4DF_V2DF_INT: case V4SF_FTYPE_V4SF_V4SF_INT: case V2DI_FTYPE_V2DI_V2DI_INT: case V4DI_FTYPE_V4DI_V2DI_INT: case V2DF_FTYPE_V2DF_V2DF_INT: + case QI_FTYPE_V8DI_V8DI_INT: + case QI_FTYPE_V8DF_V8DF_INT: + case QI_FTYPE_V2DF_V2DF_INT: + case QI_FTYPE_V4SF_V4SF_INT: + case HI_FTYPE_V16SI_V16SI_INT: + case HI_FTYPE_V16SF_V16SF_INT: nargs = 3; nargs_constant = 1; break; @@ -32343,11 +33643,36 @@ ix86_expand_args_builtin (const struct builtin_description *d, nargs = 3; nargs_constant = 2; break; + case V16SF_FTYPE_V16SF_V16SF_V16SF_HI: + case V16SF_FTYPE_V16SF_V16SI_V16SF_HI: + case V16SF_FTYPE_V16SI_V16SF_V16SF_HI: + case V16SI_FTYPE_V16SI_V16SI_V16SI_HI: + case V16SI_FTYPE_V16SI_V4SI_V16SI_HI: + case V2DF_FTYPE_V2DF_V2DF_V2DF_QI: + case V2DF_FTYPE_V2DF_V4SF_V2DF_QI: + case V4SF_FTYPE_V4SF_V2DF_V4SF_QI: + case V4SF_FTYPE_V4SF_V4SF_V4SF_QI: + case V8DF_FTYPE_V8DF_V8DF_V8DF_QI: + case V8DF_FTYPE_V8DF_V8DI_V8DF_QI: + case V8DF_FTYPE_V8DI_V8DF_V8DF_QI: + case V8DI_FTYPE_V16SI_V16SI_V8DI_QI: + case V8DI_FTYPE_V8DI_SI_V8DI_V8DI: + case V8DI_FTYPE_V8DI_V2DI_V8DI_QI: + case V8DI_FTYPE_V8DI_V8DI_V8DI_QI: + nargs = 4; + break; case V2DF_FTYPE_V2DF_V2DF_V2DI_INT: case V4DF_FTYPE_V4DF_V4DF_V4DI_INT: case V4SF_FTYPE_V4SF_V4SF_V4SI_INT: case V8SF_FTYPE_V8SF_V8SF_V8SI_INT: + case V16SF_FTYPE_V16SF_V16SF_V16SI_INT: + nargs = 4; + nargs_constant = 1; + break; + case QI_FTYPE_V2DF_V2DF_INT_QI: + case QI_FTYPE_V4SF_V4SF_INT_QI: nargs = 4; + mask_pos = 1; nargs_constant = 1; break; case V2DI_FTYPE_V2DI_V2DI_UINT_UINT: @@ -32358,6 +33683,51 @@ ix86_expand_args_builtin (const struct builtin_description *d, case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG: nargs = 4; break; + case QI_FTYPE_V8DI_V8DI_INT_QI: + case HI_FTYPE_V16SI_V16SI_INT_HI: + case QI_FTYPE_V8DF_V8DF_INT_QI: + case HI_FTYPE_V16SF_V16SF_INT_HI: + mask_pos = 1; + nargs = 4; + nargs_constant = 1; + break; + case V8DF_FTYPE_V8DF_INT_V8DF_QI: + case V16SF_FTYPE_V16SF_INT_V16SF_HI: + case V16HI_FTYPE_V16SF_INT_V16HI_HI: + case V16SI_FTYPE_V16SI_INT_V16SI_HI: + case V4SI_FTYPE_V16SI_INT_V4SI_QI: + case V4DI_FTYPE_V8DI_INT_V4DI_QI: + case V4DF_FTYPE_V8DF_INT_V4DF_QI: + case V4SF_FTYPE_V16SF_INT_V4SF_QI: + case V8DI_FTYPE_V8DI_INT_V8DI_QI: + nargs = 4; + mask_pos = 2; + nargs_constant = 1; + break; + case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_HI: + case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_HI: + case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI: + case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_QI: + case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI: + case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_HI: + case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI: + case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI: + case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_QI: + case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_QI: + nargs = 5; + mask_pos = 2; + nargs_constant = 1; + break; + case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_QI: + case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI: + case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_HI: + case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI: + case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI: + nargs = 5; + mask_pos = 1; + nargs_constant = 1; + break; + default: gcc_unreachable (); } @@ -32404,7 +33774,8 @@ ix86_expand_args_builtin (const struct builtin_description *d, op = copy_to_reg (op); } } - else if ((nargs - i) <= nargs_constant) + else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) || + (!mask_pos && (nargs - i) <= nargs_constant)) { if 
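Worked example (editorial note) for the mask_pos bookkeeping above: for V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI the switch sets nargs = 5, mask_pos = 2 and nargs_constant = 1, so the test nargs - i - mask_pos == nargs_constant holds only at i == 2; the INT operand in the middle of the signature is the one that must be an immediate, while the trailing merge-source and write-mask operands are still loaded as ordinary register operands.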
(!match) switch (icode) @@ -32414,6 +33785,13 @@ ix86_expand_args_builtin (const struct builtin_description *d, error ("the last argument must be an 1-bit immediate"); return const0_rtx; + case CODE_FOR_avx512f_cmpv8di3_mask: + case CODE_FOR_avx512f_cmpv16si3_mask: + case CODE_FOR_avx512f_ucmpv8di3_mask: + case CODE_FOR_avx512f_ucmpv16si3_mask: + error ("the last argument must be a 3-bit immediate"); + return const0_rtx; + case CODE_FOR_sse4_1_roundsd: case CODE_FOR_sse4_1_roundss: @@ -32430,15 +33808,22 @@ ix86_expand_args_builtin (const struct builtin_description *d, case CODE_FOR_sse4_1_blendps: case CODE_FOR_avx_blendpd256: case CODE_FOR_avx_vpermilv4df: + case CODE_FOR_avx512f_getmantv8df_mask: + case CODE_FOR_avx512f_getmantv16sf_mask: error ("the last argument must be a 4-bit immediate"); return const0_rtx; + case CODE_FOR_sha1rnds4: case CODE_FOR_sse4_1_blendpd: case CODE_FOR_avx_vpermilv2df: case CODE_FOR_xop_vpermil2v2df3: case CODE_FOR_xop_vpermil2v4sf3: case CODE_FOR_xop_vpermil2v4df3: case CODE_FOR_xop_vpermil2v8sf3: + case CODE_FOR_avx512f_vinsertf32x4_mask: + case CODE_FOR_avx512f_vinserti32x4_mask: + case CODE_FOR_avx512f_vextractf32x4_mask: + case CODE_FOR_avx512f_vextracti32x4_mask: error ("the last argument must be a 2-bit immediate"); return const0_rtx; @@ -32448,6 +33833,10 @@ ix86_expand_args_builtin (const struct builtin_description *d, case CODE_FOR_avx_vinsertf128v4df: case CODE_FOR_avx_vinsertf128v8sf: case CODE_FOR_avx_vinsertf128v8si: + case CODE_FOR_avx512f_vinsertf64x4_mask: + case CODE_FOR_avx512f_vinserti64x4_mask: + case CODE_FOR_avx512f_vextractf64x4_mask: + case CODE_FOR_avx512f_vextracti64x4_mask: error ("the last argument must be a 1-bit immediate"); return const0_rtx; @@ -32457,14 +33846,19 @@ ix86_expand_args_builtin (const struct builtin_description *d, case CODE_FOR_avx_cmpv4sf3: case CODE_FOR_avx_cmpv4df3: case CODE_FOR_avx_cmpv8sf3: + case CODE_FOR_avx512f_cmpv8df3_mask: + case CODE_FOR_avx512f_cmpv16sf3_mask: + case CODE_FOR_avx512f_vmcmpv2df3_mask: + case CODE_FOR_avx512f_vmcmpv4sf3_mask: error ("the last argument must be a 5-bit immediate"); return const0_rtx; - default: + default: switch (nargs_constant) { case 2: - if ((nargs - i) == nargs_constant) + if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) || + (!mask_pos && (nargs - i) == nargs_constant)) { error ("the next to last argument must be an 8-bit immediate"); break; @@ -32520,6 +33914,14 @@ ix86_expand_args_builtin (const struct builtin_description *d, pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op, args[2].op, args[3].op); break; + case 5: + pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op, + args[2].op, args[3].op, args[4].op); + case 6: + pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op, + args[2].op, args[3].op, args[4].op, + args[5].op); + break; default: gcc_unreachable (); } @@ -32531,6 +33933,376 @@ ix86_expand_args_builtin (const struct builtin_description *d, return target; } +/* Transform pattern of following layout: + (parallel [ + set (A B) + (unspec [C] UNSPEC_EMBEDDED_ROUNDING)]) + ]) + into: + (set (A B)) + + Or: + (parallel [ A B + ... + (unspec [C] UNSPEC_EMBEDDED_ROUNDING) + ... + ]) + into: + (parallel [ A B ... 
]) */ + +static rtx +ix86_erase_embedded_rounding (rtx pat) +{ + if (GET_CODE (pat) == INSN) + pat = PATTERN (pat); + + gcc_assert (GET_CODE (pat) == PARALLEL); + + if (XVECLEN (pat, 0) == 2) + { + rtx p0 = XVECEXP (pat, 0, 0); + rtx p1 = XVECEXP (pat, 0, 1); + + gcc_assert (GET_CODE (p0) == SET + && GET_CODE (p1) == UNSPEC + && XINT (p1, 1) == UNSPEC_EMBEDDED_ROUNDING); + + return p0; + } + else + { + rtx *res = XALLOCAVEC (rtx, XVECLEN (pat, 0)); + int i = 0; + int j = 0; + + for (; i < XVECLEN (pat, 0); ++i) + { + rtx elem = XVECEXP (pat, 0, i); + if (GET_CODE (elem) != UNSPEC + || XINT (elem, 1) != UNSPEC_EMBEDDED_ROUNDING) + res [j++] = elem; + } + + /* No more than 1 occurence was removed. */ + gcc_assert (j >= XVECLEN (pat, 0) - 1); + + return gen_rtx_PARALLEL (GET_MODE (pat), gen_rtvec_v (j, res)); + } +} + +/* Subroutine of ix86_expand_round_builtin to take care of comi insns + with rounding. */ +static rtx +ix86_expand_sse_comi_round (const struct builtin_description *d, + tree exp, rtx target) +{ + rtx pat, set_dst; + tree arg0 = CALL_EXPR_ARG (exp, 0); + tree arg1 = CALL_EXPR_ARG (exp, 1); + tree arg2 = CALL_EXPR_ARG (exp, 2); + tree arg3 = CALL_EXPR_ARG (exp, 3); + rtx op0 = expand_normal (arg0); + rtx op1 = expand_normal (arg1); + rtx op2 = expand_normal (arg2); + rtx op3 = expand_normal (arg3); + enum insn_code icode = d->icode; + const struct insn_data_d *insn_p = &insn_data[icode]; + enum machine_mode mode0 = insn_p->operand[0].mode; + enum machine_mode mode1 = insn_p->operand[1].mode; + enum rtx_code comparison = UNEQ; + bool need_ucomi = false; + + /* See avxintrin.h for values. */ + enum rtx_code comi_comparisons[32] = + { + UNEQ, GT, GE, UNORDERED, LTGT, UNLE, UNLT, ORDERED, UNEQ, UNLT, + UNLE, LT, LTGT, GE, GT, LT, UNEQ, GT, GE, UNORDERED, LTGT, UNLE, + UNLT, ORDERED, UNEQ, UNLT, UNLE, LT, LTGT, GE, GT, LT + }; + bool need_ucomi_values[32] = + { + true, false, false, true, true, false, false, true, + true, false, false, true, true, false, false, true, + false, true, true, false, false, true, true, false, + false, true, true, false, false, true, true, false + }; + + if (!CONST_INT_P (op2)) + { + error ("the third argument must be comparison constant"); + return const0_rtx; + } + if (INTVAL (op2) < 0 || INTVAL (op2) >= 32) + { + error ("incorect comparison mode"); + return const0_rtx; + } + + if (!insn_p->operand[2].predicate (op3, SImode)) + { + error ("incorrect rounding operand"); + return const0_rtx; + } + + comparison = comi_comparisons[INTVAL (op2)]; + need_ucomi = need_ucomi_values[INTVAL (op2)]; + + if (VECTOR_MODE_P (mode0)) + op0 = safe_vector_operand (op0, mode0); + if (VECTOR_MODE_P (mode1)) + op1 = safe_vector_operand (op1, mode1); + + target = gen_reg_rtx (SImode); + emit_move_insn (target, const0_rtx); + target = gen_rtx_SUBREG (QImode, target, 0); + + if ((optimize && !register_operand (op0, mode0)) + || !insn_p->operand[0].predicate (op0, mode0)) + op0 = copy_to_mode_reg (mode0, op0); + if ((optimize && !register_operand (op1, mode1)) + || !insn_p->operand[1].predicate (op1, mode1)) + op1 = copy_to_mode_reg (mode1, op1); + + if (need_ucomi) + icode = icode == CODE_FOR_sse_comi_round + ? CODE_FOR_sse_ucomi_round + : CODE_FOR_sse2_ucomi_round; + + pat = GEN_FCN (icode) (op0, op1, op3); + if (! pat) + return 0; + + /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point. */ + if (INTVAL (op3) == NO_ROUND) + { + pat = ix86_erase_embedded_rounding (pat); + if (! 
pat) + return 0; + + set_dst = SET_DEST (pat); + } + else + { + gcc_assert (GET_CODE (XVECEXP (pat, 0, 0)) == SET); + set_dst = SET_DEST (XVECEXP (pat, 0, 0)); + } + + emit_insn (pat); + emit_insn (gen_rtx_SET (VOIDmode, + gen_rtx_STRICT_LOW_PART (VOIDmode, target), + gen_rtx_fmt_ee (comparison, QImode, + set_dst, + const0_rtx))); + + return SUBREG_REG (target); +} + +static rtx +ix86_expand_round_builtin (const struct builtin_description *d, + tree exp, rtx target) +{ + rtx pat; + unsigned int i, nargs; + struct + { + rtx op; + enum machine_mode mode; + } args[6]; + enum insn_code icode = d->icode; + const struct insn_data_d *insn_p = &insn_data[icode]; + enum machine_mode tmode = insn_p->operand[0].mode; + unsigned int nargs_constant = 0; + unsigned int redundant_embed_rnd = 0; + + switch ((enum ix86_builtin_func_type) d->flag) + { + case UINT64_FTYPE_V2DF_INT: + case UINT64_FTYPE_V4SF_INT: + case UINT_FTYPE_V2DF_INT: + case UINT_FTYPE_V4SF_INT: + case INT64_FTYPE_V2DF_INT: + case INT64_FTYPE_V4SF_INT: + case INT_FTYPE_V2DF_INT: + case INT_FTYPE_V4SF_INT: + nargs = 2; + break; + case V4SF_FTYPE_V4SF_UINT_INT: + case V4SF_FTYPE_V4SF_UINT64_INT: + case V2DF_FTYPE_V2DF_UINT64_INT: + case V4SF_FTYPE_V4SF_INT_INT: + case V4SF_FTYPE_V4SF_INT64_INT: + case V2DF_FTYPE_V2DF_INT64_INT: + case V4SF_FTYPE_V4SF_V4SF_INT: + case V2DF_FTYPE_V2DF_V2DF_INT: + case V4SF_FTYPE_V4SF_V2DF_INT: + case V2DF_FTYPE_V2DF_V4SF_INT: + nargs = 3; + break; + case V8SF_FTYPE_V8DF_V8SF_QI_INT: + case V8DF_FTYPE_V8DF_V8DF_QI_INT: + case V8SI_FTYPE_V8DF_V8SI_QI_INT: + case V16SF_FTYPE_V16SF_V16SF_HI_INT: + case V16SF_FTYPE_V16SI_V16SF_HI_INT: + case V16SI_FTYPE_V16SF_V16SI_HI_INT: + case V8DF_FTYPE_V8SF_V8DF_QI_INT: + case V16SF_FTYPE_V16HI_V16SF_HI_INT: + case V2DF_FTYPE_V2DF_V2DF_V2DF_INT: + case V4SF_FTYPE_V4SF_V4SF_V4SF_INT: + nargs = 4; + break; + case V4SF_FTYPE_V4SF_V4SF_INT_INT: + case V2DF_FTYPE_V2DF_V2DF_INT_INT: + nargs_constant = 2; + nargs = 4; + break; + case INT_FTYPE_V4SF_V4SF_INT_INT: + case INT_FTYPE_V2DF_V2DF_INT_INT: + return ix86_expand_sse_comi_round (d, exp, target); + case V8DF_FTYPE_V8DF_V8DF_V8DF_QI_INT: + case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT: + case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT: + case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT: + case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT: + case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT: + nargs = 5; + break; + case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT: + case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT: + nargs_constant = 4; + nargs = 5; + break; + case QI_FTYPE_V8DF_V8DF_INT_QI_INT: + case QI_FTYPE_V2DF_V2DF_INT_QI_INT: + case HI_FTYPE_V16SF_V16SF_INT_HI_INT: + case QI_FTYPE_V4SF_V4SF_INT_QI_INT: + nargs_constant = 3; + nargs = 5; + break; + case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT: + case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT: + nargs = 6; + nargs_constant = 4; + break; + case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT: + case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT: + case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT: + case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT: + nargs = 6; + nargs_constant = 3; + break; + default: + gcc_unreachable (); + } + gcc_assert (nargs <= ARRAY_SIZE (args)); + + if (optimize + || target == 0 + || GET_MODE (target) != tmode + || !insn_p->operand[0].predicate (target, tmode)) + target = gen_reg_rtx (tmode); + + for (i = 0; i < nargs; i++) + { + tree arg = CALL_EXPR_ARG (exp, i); + rtx op = expand_normal (arg); + enum machine_mode mode = insn_p->operand[i + 1].mode; + bool match = insn_p->operand[i + 1].predicate (op, mode); + + if (i == nargs - nargs_constant) + { + if 
(!match) + { + switch (icode) + { + case CODE_FOR_avx512f_getmantv8df_mask_round: + case CODE_FOR_avx512f_getmantv16sf_mask_round: + case CODE_FOR_avx512f_getmantv2df_round: + case CODE_FOR_avx512f_getmantv4sf_round: + error ("the immediate argument must be a 4-bit immediate"); + return const0_rtx; + case CODE_FOR_avx512f_cmpv8df3_mask_round: + case CODE_FOR_avx512f_cmpv16sf3_mask_round: + case CODE_FOR_avx512f_vmcmpv2df3_mask_round: + case CODE_FOR_avx512f_vmcmpv4sf3_mask_round: + error ("the immediate argument must be a 5-bit immediate"); + return const0_rtx; + default: + error ("the immediate argument must be an 8-bit immediate"); + return const0_rtx; + } + } + } + else if (i == nargs-1) + { + if (!insn_p->operand[nargs].predicate (op, SImode)) + { + error ("incorrect rounding operand"); + return const0_rtx; + } + + /* If there is no rounding use normal version of the pattern. */ + if (INTVAL (op) == NO_ROUND) + redundant_embed_rnd = 1; + } + else + { + if (VECTOR_MODE_P (mode)) + op = safe_vector_operand (op, mode); + + if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode) + { + if (optimize || !match) + op = copy_to_mode_reg (mode, op); + } + else + { + op = copy_to_reg (op); + op = simplify_gen_subreg (mode, op, GET_MODE (op), 0); + } + } + + args[i].op = op; + args[i].mode = mode; + } + + switch (nargs) + { + case 1: + pat = GEN_FCN (icode) (target, args[0].op); + break; + case 2: + pat = GEN_FCN (icode) (target, args[0].op, args[1].op); + break; + case 3: + pat = GEN_FCN (icode) (target, args[0].op, args[1].op, + args[2].op); + break; + case 4: + pat = GEN_FCN (icode) (target, args[0].op, args[1].op, + args[2].op, args[3].op); + break; + case 5: + pat = GEN_FCN (icode) (target, args[0].op, args[1].op, + args[2].op, args[3].op, args[4].op); + case 6: + pat = GEN_FCN (icode) (target, args[0].op, args[1].op, + args[2].op, args[3].op, args[4].op, + args[5].op); + break; + default: + gcc_unreachable (); + } + + if (!pat) + return 0; + + if (redundant_embed_rnd) + pat = ix86_erase_embedded_rounding (pat); + + emit_insn (pat); + return target; +} + /* Subroutine of ix86_expand_builtin to take care of special insns with variable number of operands. */ @@ -32584,6 +34356,10 @@ ix86_expand_special_args_builtin (const struct builtin_description *d, case V4DF_FTYPE_PCDOUBLE: case V2DF_FTYPE_PCDOUBLE: case VOID_FTYPE_PVOID: + case V16SI_FTYPE_PV4SI: + case V16SF_FTYPE_PV4SF: + case V8DI_FTYPE_PV4DI: + case V8DF_FTYPE_PV4DF: nargs = 1; klass = load; memory = 0; @@ -32598,12 +34374,15 @@ ix86_expand_special_args_builtin (const struct builtin_description *d, } break; case VOID_FTYPE_PV2SF_V4SF: + case VOID_FTYPE_PV8DI_V8DI: case VOID_FTYPE_PV4DI_V4DI: case VOID_FTYPE_PV2DI_V2DI: case VOID_FTYPE_PCHAR_V32QI: case VOID_FTYPE_PCHAR_V16QI: + case VOID_FTYPE_PFLOAT_V16SF: case VOID_FTYPE_PFLOAT_V8SF: case VOID_FTYPE_PFLOAT_V4SF: + case VOID_FTYPE_PDOUBLE_V8DF: case VOID_FTYPE_PDOUBLE_V4DF: case VOID_FTYPE_PDOUBLE_V2DF: case VOID_FTYPE_PLONGLONG_LONGLONG: @@ -32660,11 +34439,27 @@ ix86_expand_special_args_builtin (const struct builtin_description *d, case VOID_FTYPE_PV4DI_V4DI_V4DI: case VOID_FTYPE_PV4SI_V4SI_V4SI: case VOID_FTYPE_PV2DI_V2DI_V2DI: + case VOID_FTYPE_PV8DF_V8DF_QI: + case VOID_FTYPE_PV16SF_V16SF_HI: + case VOID_FTYPE_PV8DI_V8DI_QI: + case VOID_FTYPE_PV16SI_V16SI_HI: + case VOID_FTYPE_PDOUBLE_V2DF_QI: + case VOID_FTYPE_PFLOAT_V4SF_QI: nargs = 2; klass = store; /* Reserve memory operand for target. 
*/ memory = ARRAY_SIZE (args); break; + case V16SF_FTYPE_PCV16SF_V16SF_HI: + case V16SI_FTYPE_PCV16SI_V16SI_HI: + case V8DF_FTYPE_PCV8DF_V8DF_QI: + case V8DI_FTYPE_PCV8DI_V8DI_QI: + case V2DF_FTYPE_PCDOUBLE_V2DF_QI: + case V4SF_FTYPE_PCFLOAT_V4SF_QI: + nargs = 3; + klass = load; + memory = 0; + break; case VOID_FTYPE_UINT_UINT_UINT: case VOID_FTYPE_UINT64_UINT_UINT: case UCHAR_FTYPE_UINT_UINT_UINT: @@ -32763,9 +34558,13 @@ ix86_expand_special_args_builtin (const struct builtin_description *d, if (VECTOR_MODE_P (mode)) op = safe_vector_operand (op, mode); - gcc_assert (GET_MODE (op) == mode - || GET_MODE (op) == VOIDmode); - op = copy_to_mode_reg (mode, op); + if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode) + op = copy_to_mode_reg (mode, op); + else + { + op = copy_to_reg (op); + op = simplify_gen_subreg (mode, op, GET_MODE (op), 0); + } } } @@ -33506,6 +35305,38 @@ addcarryx: emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG))); return 0; + case IX86_BUILTIN_KORTESTC16: + icode = CODE_FOR_kortestchi; + mode0 = HImode; + mode1 = CCCmode; + goto kortest; + + case IX86_BUILTIN_KORTESTZ16: + icode = CODE_FOR_kortestzhi; + mode0 = HImode; + mode1 = CCZmode; + + kortest: + arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1. */ + arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2. */ + op0 = expand_normal (arg0); + op1 = expand_normal (arg1); + + op0 = copy_to_reg (op0); + op0 = simplify_gen_subreg (mode0, op0, GET_MODE (op0), 0); + op1 = copy_to_reg (op1); + op1 = simplify_gen_subreg (mode0, op1, GET_MODE (op1), 0); + + target = gen_reg_rtx (QImode); + emit_insn (gen_rtx_SET (mode0, target, const0_rtx)); + + /* Emit kortest. */ + emit_insn (GEN_FCN (icode) (op0, op1)); + /* And use setcc to return result from flags. */ + ix86_expand_setcc (target, EQ, + gen_rtx_REG (mode1, FLAGS_REG), const0_rtx); + return target; + case IX86_BUILTIN_GATHERSIV2DF: icode = CODE_FOR_avx2_gathersiv2df; goto gather_gen; @@ -33566,8 +35397,83 @@ addcarryx: case IX86_BUILTIN_GATHERALTDIV8SI: icode = CODE_FOR_avx2_gatherdiv8si; goto gather_gen; + case IX86_BUILTIN_GATHER3SIV16SF: + icode = CODE_FOR_avx512f_gathersiv16sf; + goto gather_gen; + case IX86_BUILTIN_GATHER3SIV8DF: + icode = CODE_FOR_avx512f_gathersiv8df; + goto gather_gen; + case IX86_BUILTIN_GATHER3DIV16SF: + icode = CODE_FOR_avx512f_gatherdiv16sf; + goto gather_gen; + case IX86_BUILTIN_GATHER3DIV8DF: + icode = CODE_FOR_avx512f_gatherdiv8df; + goto gather_gen; + case IX86_BUILTIN_GATHER3SIV16SI: + icode = CODE_FOR_avx512f_gathersiv16si; + goto gather_gen; + case IX86_BUILTIN_GATHER3SIV8DI: + icode = CODE_FOR_avx512f_gathersiv8di; + goto gather_gen; + case IX86_BUILTIN_GATHER3DIV16SI: + icode = CODE_FOR_avx512f_gatherdiv16si; + goto gather_gen; + case IX86_BUILTIN_GATHER3DIV8DI: + icode = CODE_FOR_avx512f_gatherdiv8di; + goto gather_gen; + case IX86_BUILTIN_GATHER3ALTSIV8DF: + icode = CODE_FOR_avx512f_gathersiv8df; + goto gather_gen; + case IX86_BUILTIN_GATHER3ALTDIV16SF: + icode = CODE_FOR_avx512f_gatherdiv16sf; + goto gather_gen; + case IX86_BUILTIN_GATHER3ALTSIV8DI: + icode = CODE_FOR_avx512f_gathersiv8di; + goto gather_gen; + case IX86_BUILTIN_GATHER3ALTDIV16SI: + icode = CODE_FOR_avx512f_gatherdiv16si; + goto gather_gen; + case IX86_BUILTIN_SCATTERSIV16SF: + icode = CODE_FOR_avx512f_scattersiv16sf; + goto scatter_gen; + case IX86_BUILTIN_SCATTERSIV8DF: + icode = CODE_FOR_avx512f_scattersiv8df; + goto scatter_gen; + case IX86_BUILTIN_SCATTERDIV16SF: + icode = CODE_FOR_avx512f_scatterdiv16sf; + goto scatter_gen; + case IX86_BUILTIN_SCATTERDIV8DF: + 
icode = CODE_FOR_avx512f_scatterdiv8df; + goto scatter_gen; + case IX86_BUILTIN_SCATTERSIV16SI: + icode = CODE_FOR_avx512f_scattersiv16si; + goto scatter_gen; + case IX86_BUILTIN_SCATTERSIV8DI: + icode = CODE_FOR_avx512f_scattersiv8di; + goto scatter_gen; + case IX86_BUILTIN_SCATTERDIV16SI: + icode = CODE_FOR_avx512f_scatterdiv16si; + goto scatter_gen; + case IX86_BUILTIN_SCATTERDIV8DI: + icode = CODE_FOR_avx512f_scatterdiv8di; + goto scatter_gen; + case IX86_BUILTIN_GATHERPFDPS: + icode = CODE_FOR_avx512pf_gatherpfv16si; + goto vec_prefetch_gen; + case IX86_BUILTIN_GATHERPFQPS: + icode = CODE_FOR_avx512pf_gatherpfv8di; + goto vec_prefetch_gen; + case IX86_BUILTIN_SCATTERPFDPS: + icode = CODE_FOR_avx512pf_scatterpfv16si; + goto vec_prefetch_gen; + case IX86_BUILTIN_SCATTERPFQPS: + icode = CODE_FOR_avx512pf_scatterpfv8di; + goto vec_prefetch_gen; gather_gen: + rtx half; + rtx (*gen) (rtx, rtx); + arg0 = CALL_EXPR_ARG (exp, 0); arg1 = CALL_EXPR_ARG (exp, 1); arg2 = CALL_EXPR_ARG (exp, 2); @@ -33590,20 +35496,46 @@ addcarryx: else subtarget = target; - if (fcode == IX86_BUILTIN_GATHERALTSIV4DF - || fcode == IX86_BUILTIN_GATHERALTSIV4DI) + switch (fcode) { - rtx half = gen_reg_rtx (V4SImode); + case IX86_BUILTIN_GATHER3ALTSIV8DF: + case IX86_BUILTIN_GATHER3ALTSIV8DI: + half = gen_reg_rtx (V8SImode); + if (!nonimmediate_operand (op2, V16SImode)) + op2 = copy_to_mode_reg (V16SImode, op2); + emit_insn (gen_vec_extract_lo_v16si (half, op2)); + op2 = half; + break; + case IX86_BUILTIN_GATHERALTSIV4DF: + case IX86_BUILTIN_GATHERALTSIV4DI: + half = gen_reg_rtx (V4SImode); if (!nonimmediate_operand (op2, V8SImode)) op2 = copy_to_mode_reg (V8SImode, op2); emit_insn (gen_vec_extract_lo_v8si (half, op2)); op2 = half; - } - else if (fcode == IX86_BUILTIN_GATHERALTDIV8SF - || fcode == IX86_BUILTIN_GATHERALTDIV8SI) - { - rtx (*gen) (rtx, rtx); - rtx half = gen_reg_rtx (mode0); + break; + case IX86_BUILTIN_GATHER3ALTDIV16SF: + case IX86_BUILTIN_GATHER3ALTDIV16SI: + half = gen_reg_rtx (mode0); + if (mode0 == V8SFmode) + gen = gen_vec_extract_lo_v16sf; + else + gen = gen_vec_extract_lo_v16si; + if (!nonimmediate_operand (op0, GET_MODE (op0))) + op0 = copy_to_mode_reg (GET_MODE (op0), op0); + emit_insn (gen (half, op0)); + op0 = half; + if (GET_MODE (op3) != VOIDmode) + { + if (!nonimmediate_operand (op3, GET_MODE (op3))) + op3 = copy_to_mode_reg (GET_MODE (op3), op3); + emit_insn (gen (half, op3)); + op3 = half; + } + break; + case IX86_BUILTIN_GATHERALTDIV8SF: + case IX86_BUILTIN_GATHERALTDIV8SI: + half = gen_reg_rtx (mode0); if (mode0 == V4SFmode) gen = gen_vec_extract_lo_v8sf; else @@ -33612,10 +35544,16 @@ addcarryx: op0 = copy_to_mode_reg (GET_MODE (op0), op0); emit_insn (gen (half, op0)); op0 = half; - if (!nonimmediate_operand (op3, GET_MODE (op3))) - op3 = copy_to_mode_reg (GET_MODE (op3), op3); - emit_insn (gen (half, op3)); - op3 = half; + if (GET_MODE (op3) != VOIDmode) + { + if (!nonimmediate_operand (op3, GET_MODE (op3))) + op3 = copy_to_mode_reg (GET_MODE (op3), op3); + emit_insn (gen (half, op3)); + op3 = half; + } + break; + default: + break; } /* Force memory operand only with base register here. 
But we @@ -33629,11 +35567,19 @@ addcarryx: op1 = copy_to_mode_reg (Pmode, op1); if (!insn_data[icode].operand[3].predicate (op2, mode2)) op2 = copy_to_mode_reg (mode2, op2); - if (!insn_data[icode].operand[4].predicate (op3, mode3)) - op3 = copy_to_mode_reg (mode3, op3); + if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode) + { + if (!insn_data[icode].operand[4].predicate (op3, mode3)) + op3 = copy_to_mode_reg (mode3, op3); + } + else + { + op3 = copy_to_reg (op3); + op3 = simplify_gen_subreg (mode3, op3, GET_MODE (op3), 0); + } if (!insn_data[icode].operand[5].predicate (op4, mode4)) { - error ("last argument must be scale 1, 2, 4, 8"); + error ("the last argument must be scale 1, 2, 4, 8"); return const0_rtx; } @@ -33643,7 +35589,12 @@ addcarryx: previous contents. */ if (optimize) { - if (TREE_CODE (arg3) == VECTOR_CST) + if (TREE_CODE (arg3) == INTEGER_CST) + { + if (integer_all_onesp (arg3)) + op0 = pc_rtx; + } + else if (TREE_CODE (arg3) == VECTOR_CST) { unsigned int negative = 0; for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i) @@ -33659,7 +35610,8 @@ addcarryx: if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3))) op0 = pc_rtx; } - else if (TREE_CODE (arg3) == SSA_NAME) + else if (TREE_CODE (arg3) == SSA_NAME + && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE) { /* Recognize also when mask is like: __v2df src = _mm_setzero_pd (); @@ -33704,22 +35656,146 @@ addcarryx: return const0_rtx; emit_insn (pat); - if (fcode == IX86_BUILTIN_GATHERDIV8SF - || fcode == IX86_BUILTIN_GATHERDIV8SI) + switch (fcode) { - enum machine_mode tmode = GET_MODE (subtarget) == V8SFmode - ? V4SFmode : V4SImode; + case IX86_BUILTIN_GATHER3DIV16SF: if (target == NULL_RTX) - target = gen_reg_rtx (tmode); - if (tmode == V4SFmode) - emit_insn (gen_vec_extract_lo_v8sf (target, subtarget)); - else - emit_insn (gen_vec_extract_lo_v8si (target, subtarget)); + target = gen_reg_rtx (V8SFmode); + emit_insn (gen_vec_extract_lo_v16sf (target, subtarget)); + break; + case IX86_BUILTIN_GATHER3DIV16SI: + if (target == NULL_RTX) + target = gen_reg_rtx (V8SImode); + emit_insn (gen_vec_extract_lo_v16si (target, subtarget)); + break; + case IX86_BUILTIN_GATHERDIV8SF: + if (target == NULL_RTX) + target = gen_reg_rtx (V4SFmode); + emit_insn (gen_vec_extract_lo_v8sf (target, subtarget)); + break; + case IX86_BUILTIN_GATHERDIV8SI: + if (target == NULL_RTX) + target = gen_reg_rtx (V4SImode); + emit_insn (gen_vec_extract_lo_v8si (target, subtarget)); + break; + default: + target = subtarget; + break; + } + return target; + + scatter_gen: + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); + arg2 = CALL_EXPR_ARG (exp, 2); + arg3 = CALL_EXPR_ARG (exp, 3); + arg4 = CALL_EXPR_ARG (exp, 4); + op0 = expand_normal (arg0); + op1 = expand_normal (arg1); + op2 = expand_normal (arg2); + op3 = expand_normal (arg3); + op4 = expand_normal (arg4); + mode1 = insn_data[icode].operand[1].mode; + mode2 = insn_data[icode].operand[2].mode; + mode3 = insn_data[icode].operand[3].mode; + mode4 = insn_data[icode].operand[4].mode; + + /* Force memory operand only with base register here. But we + don't want to do it on memory operand for other builtin + functions. 
*/ + op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1)); + + if (!insn_data[icode].operand[0].predicate (op0, Pmode)) + op0 = copy_to_mode_reg (Pmode, op0); + + if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode) + { + if (!insn_data[icode].operand[1].predicate (op1, mode1)) + op1 = copy_to_mode_reg (mode1, op1); } else - target = subtarget; + { + op1 = copy_to_reg (op1); + op1 = simplify_gen_subreg (mode1, op1, GET_MODE (op1), 0); + } - return target; + if (!insn_data[icode].operand[2].predicate (op2, mode2)) + op2 = copy_to_mode_reg (mode2, op2); + + if (!insn_data[icode].operand[3].predicate (op3, mode3)) + op3 = copy_to_mode_reg (mode3, op3); + + if (!insn_data[icode].operand[4].predicate (op4, mode4)) + { + error ("the last argument must be scale 1, 2, 4, 8"); + return const0_rtx; + } + + pat = GEN_FCN (icode) (op0, op1, op2, op3, op4); + if (! pat) + return const0_rtx; + + emit_insn (pat); + return 0; + + vec_prefetch_gen: + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); + arg2 = CALL_EXPR_ARG (exp, 2); + arg3 = CALL_EXPR_ARG (exp, 3); + arg4 = CALL_EXPR_ARG (exp, 4); + op0 = expand_normal (arg0); + op1 = expand_normal (arg1); + op2 = expand_normal (arg2); + op3 = expand_normal (arg3); + op4 = expand_normal (arg4); + mode0 = insn_data[icode].operand[0].mode; + mode1 = insn_data[icode].operand[1].mode; + mode3 = insn_data[icode].operand[3].mode; + mode4 = insn_data[icode].operand[4].mode; + + if (GET_MODE (op0) == mode0 + || (GET_MODE (op0) == VOIDmode && op0 != constm1_rtx)) + { + if (!insn_data[icode].operand[0].predicate (op0, mode0)) + op0 = copy_to_mode_reg (mode0, op0); + } + else if (op0 != constm1_rtx) + { + op0 = copy_to_reg (op0); + op0 = simplify_gen_subreg (mode0, op0, GET_MODE (op0), 0); + } + + if (!insn_data[icode].operand[1].predicate (op1, mode1)) + op1 = copy_to_mode_reg (mode1, op1); + + /* Force memory operand only with base register here. But we + don't want to do it on memory operand for other builtin + functions. */ + op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1)); + + if (!insn_data[icode].operand[2].predicate (op2, Pmode)) + op2 = copy_to_mode_reg (Pmode, op2); + + if (!insn_data[icode].operand[3].predicate (op3, mode3)) + { + error ("the forth argument must be scale 1, 2, 4, 8"); + return const0_rtx; + } + + if (!insn_data[icode].operand[4].predicate (op4, mode4)) + { + error ("the last argument must be hint 0 or 1"); + return const0_rtx; + } + + pat = GEN_FCN (icode) (op0, op1, op2, op3, op4); + if (! 
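Illustrative sketch (editorial aside, not part of the upstream patch): one of the scatter builtins expanded by scatter_gen above, used through the conventional AVX-512F wrapper _mm512_i32scatter_ps from <immintrin.h> (the wrapper name and argument order are assumed here, they are not shown in this hunk). The scale argument must be 1, 2, 4 or 8, matching the check above. Compile with -mavx512f.

#include <immintrin.h>

/* Store vals[i] to base[idx[i]]; scale 4 == sizeof (float).  */
void
scatter_floats (float *base, __m512i idx, __m512 vals)
{
  _mm512_i32scatter_ps (base, idx, vals, 4);
}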
pat) + return const0_rtx; + + emit_insn (pat); + + return 0; case IX86_BUILTIN_XABORT: icode = CODE_FOR_xabort; @@ -33763,6 +35839,10 @@ addcarryx: if (d->code == fcode) return ix86_expand_sse_comi (d, exp, target); + for (i = 0, d = bdesc_round_args; i < ARRAY_SIZE (bdesc_round_args); i++, d++) + if (d->code == fcode) + return ix86_expand_round_builtin (d, exp, target); + for (i = 0, d = bdesc_pcmpestr; i < ARRAY_SIZE (bdesc_pcmpestr); i++, d++) @@ -33840,6 +35920,16 @@ ix86_builtin_vectorized_function (tree fndecl, tree type_out, return ix86_get_builtin (IX86_BUILTIN_SQRTPD); else if (out_n == 4 && in_n == 4) return ix86_get_builtin (IX86_BUILTIN_SQRTPD256); + else if (out_n == 8 && in_n == 8) + return ix86_get_builtin (IX86_BUILTIN_SQRTPD512); + } + break; + + case BUILT_IN_EXP2F: + if (out_mode == SFmode && in_mode == SFmode) + { + if (out_n == 16 && in_n == 16) + return ix86_get_builtin (IX86_BUILTIN_EXP2PS); } break; @@ -33850,6 +35940,8 @@ ix86_builtin_vectorized_function (tree fndecl, tree type_out, return ix86_get_builtin (IX86_BUILTIN_SQRTPS_NR); else if (out_n == 8 && in_n == 8) return ix86_get_builtin (IX86_BUILTIN_SQRTPS_NR256); + else if (out_n == 16 && in_n == 16) + return ix86_get_builtin (IX86_BUILTIN_SQRTPS_NR512); } break; @@ -33866,6 +35958,8 @@ ix86_builtin_vectorized_function (tree fndecl, tree type_out, return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX); else if (out_n == 8 && in_n == 4) return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256); + else if (out_n == 16 && in_n == 8) + return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX512); } break; @@ -33898,6 +35992,8 @@ ix86_builtin_vectorized_function (tree fndecl, tree type_out, return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX); else if (out_n == 8 && in_n == 4) return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256); + else if (out_n == 16 && in_n == 8) + return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX512); } break; @@ -33954,6 +36050,8 @@ ix86_builtin_vectorized_function (tree fndecl, tree type_out, return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX); else if (out_n == 8 && in_n == 4) return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256); + else if (out_n == 16 && in_n == 8) + return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX512); } break; @@ -33980,6 +36078,8 @@ ix86_builtin_vectorized_function (tree fndecl, tree type_out, return ix86_get_builtin (IX86_BUILTIN_CPYSGNPD); else if (out_n == 4 && in_n == 4) return ix86_get_builtin (IX86_BUILTIN_CPYSGNPD256); + else if (out_n == 8 && in_n == 8) + return ix86_get_builtin (IX86_BUILTIN_CPYSGNPD512); } break; @@ -33990,6 +36090,8 @@ ix86_builtin_vectorized_function (tree fndecl, tree type_out, return ix86_get_builtin (IX86_BUILTIN_CPYSGNPS); else if (out_n == 8 && in_n == 8) return ix86_get_builtin (IX86_BUILTIN_CPYSGNPS256); + else if (out_n == 16 && in_n == 16) + return ix86_get_builtin (IX86_BUILTIN_CPYSGNPS512); } break; @@ -34425,6 +36527,34 @@ ix86_vectorize_builtin_gather (const_tree mem_vectype, case V8SImode: code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI; break; +#if 0 + /* FIXME: Commented until vectorizer can work with (mask_type != src_type) + PR59617. */ + case V8DFmode: + if (TARGET_AVX512F) + code = si ? IX86_BUILTIN_GATHER3ALTSIV8DF : IX86_BUILTIN_GATHER3DIV8DF; + else + return NULL_TREE; + break; + case V8DImode: + if (TARGET_AVX512F) + code = si ? 
IX86_BUILTIN_GATHER3ALTSIV8DI : IX86_BUILTIN_GATHER3DIV8DI; + else + return NULL_TREE; + break; + case V16SFmode: + if (TARGET_AVX512F) + code = si ? IX86_BUILTIN_GATHER3SIV16SF : IX86_BUILTIN_GATHER3ALTDIV16SF; + else + return NULL_TREE; + break; + case V16SImode: + if (TARGET_AVX512F) + code = si ? IX86_BUILTIN_GATHER3SIV16SI : IX86_BUILTIN_GATHER3ALTDIV16SI; + else + return NULL_TREE; + break; +#endif default: return NULL_TREE; } @@ -34480,7 +36610,7 @@ avx_vpermilp_parallel (rtx par, enum machine_mode mode) { unsigned i, nelt = GET_MODE_NUNITS (mode); unsigned mask = 0; - unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */ + unsigned char ipar[16] = {}; /* Silence -Wuninitialized warning. */ if (XVECLEN (par, 0) != (int) nelt) return 0; @@ -34503,6 +36633,24 @@ avx_vpermilp_parallel (rtx par, enum machine_mode mode) switch (mode) { + case V8DFmode: + /* In the 512-bit DFmode case, we can only move elements within + a 128-bit lane. First fill the second part of the mask, + then fallthru. */ + for (i = 4; i < 6; ++i) + { + if (ipar[i] < 4 || ipar[i] >= 6) + return 0; + mask |= (ipar[i] - 4) << i; + } + for (i = 6; i < 8; ++i) + { + if (ipar[i] < 6) + return 0; + mask |= (ipar[i] - 6) << i; + } + /* FALLTHRU */ + case V4DFmode: /* In the 256-bit DFmode case, we can only move elements within a 128-bit lane. */ @@ -34520,10 +36668,18 @@ avx_vpermilp_parallel (rtx par, enum machine_mode mode) } break; + case V16SFmode: + /* In 512 bit SFmode case, permutation in the upper 256 bits + must mirror the permutation in the lower 256-bits. */ + for (i = 0; i < 8; ++i) + if (ipar[i] + 8 != ipar[i + 8]) + return 0; + /* FALLTHRU */ + case V8SFmode: - /* In the 256-bit SFmode case, we have full freedom of movement - within the low 128-bit lane, but the high 128-bit lane must - mirror the exact same pattern. */ + /* In 256 bit SFmode case, we have full freedom of + movement within the low 128-bit lane, but the high 128-bit + lane must mirror the exact same pattern. */ for (i = 0; i < 4; ++i) if (ipar[i] + 4 != ipar[i + 4]) return 0; @@ -35474,6 +37630,7 @@ static bool ix86_rtx_costs (rtx x, int code_i, int outer_code_i, int opno, int *total, bool speed) { + rtx mask; enum rtx_code code = (enum rtx_code) code_i; enum rtx_code outer_code = (enum rtx_code) outer_code_i; enum machine_mode mode = GET_MODE (x); @@ -35950,13 +38107,21 @@ ix86_rtx_costs (rtx x, int code_i, int outer_code_i, int opno, int *total, case VEC_SELECT: case VEC_CONCAT: - case VEC_MERGE: case VEC_DUPLICATE: /* ??? Assume all of these vector manipulation patterns are recognizable. In which case they all pretty much have the same cost. */ *total = cost->fabs; return true; + case VEC_MERGE: + mask = XEXP (x, 2); + /* This is masked instruction, assume the same cost, + as nonmasked variant. */ + if (TARGET_AVX512F && register_operand (mask, GET_MODE (mask))) + *total = rtx_cost (XEXP (x, 0), outer_code, opno, speed); + else + *total = cost->fabs; + return true; default: return false; @@ -36660,7 +38825,10 @@ ix86_avoid_jump_mispredicts (void) The smallest offset in the page INSN can start is the case where START ends on the offset 0. Offset of INSN is then NBYTES - sizeof (INSN). We add p2align to 16byte window with maxskip 15 - NBYTES + sizeof (INSN). - */ + + Don't consider asm goto as jump, while it can contain a jump, it doesn't + have to, control transfer to label(s) can be performed through other + means, and also we estimate minimum length of all asm stmts as 0. 
*/ for (insn = start; insn; insn = NEXT_INSN (insn)) { int min_size; @@ -36687,7 +38855,8 @@ ix86_avoid_jump_mispredicts (void) while (nbytes + max_skip >= 16) { start = NEXT_INSN (start); - if (JUMP_P (start) || CALL_P (start)) + if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0) + || CALL_P (start)) njumps--, isjump = 1; else isjump = 0; @@ -36702,7 +38871,8 @@ ix86_avoid_jump_mispredicts (void) if (dump_file) fprintf (dump_file, "Insn %i estimated to %i bytes\n", INSN_UID (insn), min_size); - if (JUMP_P (insn) || CALL_P (insn)) + if ((JUMP_P (insn) && asm_noperands (PATTERN (insn)) < 0) + || CALL_P (insn)) njumps++; else continue; @@ -36710,7 +38880,8 @@ ix86_avoid_jump_mispredicts (void) while (njumps > 3) { start = NEXT_INSN (start); - if (JUMP_P (start) || CALL_P (start)) + if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0) + || CALL_P (start)) njumps--, isjump = 1; else isjump = 0; @@ -37122,6 +39293,36 @@ get_mode_wider_vector (enum machine_mode o) return n; } +/* A subroutine of ix86_expand_vector_init_duplicate. Tries to + fill target with val via vec_duplicate. */ + +static bool +ix86_vector_duplicate_value (enum machine_mode mode, rtx target, rtx val) +{ + bool ok; + rtx insn, dup; + + /* First attempt to recognize VAL as-is. */ + dup = gen_rtx_VEC_DUPLICATE (mode, val); + insn = emit_insn (gen_rtx_SET (VOIDmode, target, dup)); + if (recog_memoized (insn) < 0) + { + rtx seq; + /* If that fails, force VAL into a register. */ + + start_sequence (); + XEXP (dup, 0) = force_reg (GET_MODE_INNER (mode), val); + seq = get_insns (); + end_sequence (); + if (seq) + emit_insn_before (seq, insn); + + ok = recog_memoized (insn) >= 0; + gcc_assert (ok); + } + return true; +} + /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector with all elements equal to VAR. Return true if successful. */ @@ -37147,29 +39348,11 @@ ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode, case V2DImode: case V4SFmode: case V4SImode: - { - rtx insn, dup; - - /* First attempt to recognize VAL as-is. */ - dup = gen_rtx_VEC_DUPLICATE (mode, val); - insn = emit_insn (gen_rtx_SET (VOIDmode, target, dup)); - if (recog_memoized (insn) < 0) - { - rtx seq; - /* If that fails, force VAL into a register. 
*/ - - start_sequence (); - XEXP (dup, 0) = force_reg (GET_MODE_INNER (mode), val); - seq = get_insns (); - end_sequence (); - if (seq) - emit_insn_before (seq, insn); - - ok = recog_memoized (insn) >= 0; - gcc_assert (ok); - } - } - return true; + case V16SImode: + case V8DImode: + case V16SFmode: + case V8DFmode: + return ix86_vector_duplicate_value (mode, target, val); case V4HImode: if (!mmx_ok) @@ -37519,8 +39702,8 @@ static void ix86_expand_vector_init_concat (enum machine_mode mode, rtx target, rtx *ops, int n) { - enum machine_mode cmode, hmode = VOIDmode; - rtx first[8], second[4]; + enum machine_mode cmode, hmode = VOIDmode, gmode = VOIDmode; + rtx first[16], second[8], third[4]; rtvec v; int i, j; @@ -37529,6 +39712,18 @@ ix86_expand_vector_init_concat (enum machine_mode mode, case 2: switch (mode) { + case V16SImode: + cmode = V8SImode; + break; + case V16SFmode: + cmode = V8SFmode; + break; + case V8DImode: + cmode = V4DImode; + break; + case V8DFmode: + cmode = V4DFmode; + break; case V8SImode: cmode = V4SImode; break; @@ -37595,6 +39790,14 @@ ix86_expand_vector_init_concat (enum machine_mode mode, case 8: switch (mode) { + case V8DImode: + cmode = V2DImode; + hmode = V4DImode; + break; + case V8DFmode: + cmode = V2DFmode; + hmode = V4DFmode; + break; case V8SImode: cmode = V2SImode; hmode = V4SImode; @@ -37608,6 +39811,24 @@ ix86_expand_vector_init_concat (enum machine_mode mode, } goto half; + case 16: + switch (mode) + { + case V16SImode: + cmode = V2SImode; + hmode = V4SImode; + gmode = V8SImode; + break; + case V16SFmode: + cmode = V2SFmode; + hmode = V4SFmode; + gmode = V8SFmode; + break; + default: + gcc_unreachable (); + } + goto half; + half: /* FIXME: We process inputs backward to help RA. PR 36222. */ i = n - 1; @@ -37621,7 +39842,27 @@ half: } n >>= 1; - if (n > 2) + if (n > 4) + { + gcc_assert (hmode != VOIDmode); + gcc_assert (gmode != VOIDmode); + for (i = j = 0; i < n; i += 2, j++) + { + second[j] = gen_reg_rtx (hmode); + ix86_expand_vector_init_concat (hmode, second [j], + &first [i], 2); + } + n >>= 1; + for (i = j = 0; i < n; i += 2, j++) + { + third[j] = gen_reg_rtx (gmode); + ix86_expand_vector_init_concat (gmode, third[j], + &second[i], 2); + } + n >>= 1; + ix86_expand_vector_init_concat (mode, target, third, n); + } + else if (n > 2) { gcc_assert (hmode != VOIDmode); for (i = j = 0; i < n; i += 2, j++) @@ -37764,7 +40005,7 @@ static void ix86_expand_vector_init_general (bool mmx_ok, enum machine_mode mode, rtx target, rtx vals) { - rtx ops[32], op0, op1; + rtx ops[64], op0, op1; enum machine_mode half_mode = VOIDmode; int n, i; @@ -37776,6 +40017,10 @@ ix86_expand_vector_init_general (bool mmx_ok, enum machine_mode mode, break; /* FALLTHRU */ + case V16SImode: + case V16SFmode: + case V8DFmode: + case V8DImode: case V8SFmode: case V8SImode: case V4DFmode: @@ -38401,6 +40646,42 @@ ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt) } break; + case V16SFmode: + tmp = gen_reg_rtx (V8SFmode); + if (elt < 8) + emit_insn (gen_vec_extract_lo_v16sf (tmp, vec)); + else + emit_insn (gen_vec_extract_hi_v16sf (tmp, vec)); + ix86_expand_vector_extract (false, target, tmp, elt & 7); + return; + + case V8DFmode: + tmp = gen_reg_rtx (V4DFmode); + if (elt < 4) + emit_insn (gen_vec_extract_lo_v8df (tmp, vec)); + else + emit_insn (gen_vec_extract_hi_v8df (tmp, vec)); + ix86_expand_vector_extract (false, target, tmp, elt & 3); + return; + + case V16SImode: + tmp = gen_reg_rtx (V8SImode); + if (elt < 8) + emit_insn (gen_vec_extract_lo_v16si (tmp, vec)); + 
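Worked example (editorial note) for the new 16-element path in ix86_expand_vector_init_concat above: a V16SImode vector built from 16 scalar operands is assembled in three halving rounds, 16 scalars into 8 V2SImode pairs (cmode), those into 4 V4SImode registers (hmode), those into 2 V8SImode registers (gmode), and a final concat of the two V8SImode halves into the V16SImode target.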
else + emit_insn (gen_vec_extract_hi_v16si (tmp, vec)); + ix86_expand_vector_extract (false, target, tmp, elt & 7); + return; + + case V8DImode: + tmp = gen_reg_rtx (V4DImode); + if (elt < 4) + emit_insn (gen_vec_extract_lo_v8di (tmp, vec)); + else + emit_insn (gen_vec_extract_hi_v8di (tmp, vec)); + ix86_expand_vector_extract (false, target, tmp, elt & 3); + return; + case V8QImode: /* ??? Could extract the appropriate HImode element and shift. */ default: @@ -38493,6 +40774,44 @@ emit_reduc_half (rtx dest, rtx src, int i) GEN_INT (i / 2)); } break; + case V16SImode: + case V16SFmode: + case V8DImode: + case V8DFmode: + if (i > 128) + tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest), + gen_lowpart (V16SImode, src), + gen_lowpart (V16SImode, src), + GEN_INT (0x4 + (i == 512 ? 4 : 0)), + GEN_INT (0x5 + (i == 512 ? 4 : 0)), + GEN_INT (0x6 + (i == 512 ? 4 : 0)), + GEN_INT (0x7 + (i == 512 ? 4 : 0)), + GEN_INT (0xC), GEN_INT (0xD), + GEN_INT (0xE), GEN_INT (0xF), + GEN_INT (0x10), GEN_INT (0x11), + GEN_INT (0x12), GEN_INT (0x13), + GEN_INT (0x14), GEN_INT (0x15), + GEN_INT (0x16), GEN_INT (0x17)); + else + tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest), + gen_lowpart (V16SImode, src), + GEN_INT (i == 128 ? 0x2 : 0x1), + GEN_INT (0x3), + GEN_INT (0x3), + GEN_INT (0x3), + GEN_INT (i == 128 ? 0x6 : 0x5), + GEN_INT (0x7), + GEN_INT (0x7), + GEN_INT (0x7), + GEN_INT (i == 128 ? 0xA : 0x9), + GEN_INT (0xB), + GEN_INT (0xB), + GEN_INT (0xB), + GEN_INT (i == 128 ? 0xE : 0xD), + GEN_INT (0xF), + GEN_INT (0xF), + GEN_INT (0xF)); + break; default: gcc_unreachable (); } @@ -38557,6 +40876,8 @@ ix86_vector_mode_supported_p (enum machine_mode mode) return true; if (TARGET_AVX && VALID_AVX256_REG_MODE (mode)) return true; + if (TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode)) + return true; if (TARGET_MMX && VALID_MMX_REG_MODE (mode)) return true; if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode)) @@ -38870,9 +41191,15 @@ void ix86_emit_swdivsf (rtx res, rtx a, rtx b, enum machine_mode mode) b = force_reg (mode, b); /* x0 = rcp(b) estimate */ - emit_insn (gen_rtx_SET (VOIDmode, x0, - gen_rtx_UNSPEC (mode, gen_rtvec (1, b), - UNSPEC_RCP))); + if (mode == V16SFmode || mode == V8DFmode) + emit_insn (gen_rtx_SET (VOIDmode, x0, + gen_rtx_UNSPEC (mode, gen_rtvec (1, b), + UNSPEC_RCP14))); + else + emit_insn (gen_rtx_SET (VOIDmode, x0, + gen_rtx_UNSPEC (mode, gen_rtvec (1, b), + UNSPEC_RCP))); + /* e0 = x0 * b */ emit_insn (gen_rtx_SET (VOIDmode, e0, gen_rtx_MULT (mode, x0, b))); @@ -38902,6 +41229,7 @@ void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode, { rtx x0, e0, e1, e2, e3, mthree, mhalf; REAL_VALUE_TYPE r; + int unspec; x0 = gen_reg_rtx (mode); e0 = gen_reg_rtx (mode); @@ -38914,11 +41242,15 @@ void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode, real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL); mhalf = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode); + unspec = UNSPEC_RSQRT; if (VECTOR_MODE_P (mode)) { mthree = ix86_build_const_vector (mode, true, mthree); mhalf = ix86_build_const_vector (mode, true, mhalf); + /* There is no 512-bit rsqrt. There is however rsqrt14. 
*/ + if (GET_MODE_SIZE (mode) == 64) + unspec = UNSPEC_RSQRT14; } /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) @@ -38929,7 +41261,7 @@ void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode, /* x0 = rsqrt(a) estimate */ emit_insn (gen_rtx_SET (VOIDmode, x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a), - UNSPEC_RSQRT))); + unspec))); /* If (a == 0.0) Filter out infinity to prevent NaN for sqrt(0.0). */ if (!recip) @@ -38940,11 +41272,23 @@ void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode, mask = gen_reg_rtx (mode); zero = force_reg (mode, CONST0_RTX(mode)); - emit_insn (gen_rtx_SET (VOIDmode, mask, - gen_rtx_NE (mode, zero, a))); - emit_insn (gen_rtx_SET (VOIDmode, x0, - gen_rtx_AND (mode, x0, mask))); + /* Handle masked compare. */ + if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64) + { + mask = gen_reg_rtx (HImode); + /* Imm value 0x4 corresponds to not-equal comparison. */ + emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4))); + emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask)); + } + else + { + emit_insn (gen_rtx_SET (VOIDmode, mask, + gen_rtx_NE (mode, zero, a))); + + emit_insn (gen_rtx_SET (VOIDmode, x0, + gen_rtx_AND (mode, x0, mask))); + } } /* e0 = x0 * a */ @@ -40466,6 +42810,19 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d) if (expand_vec_perm_pshufb (d)) return true; + /* Try the AVX512F vpermi2 instructions. */ + rtx vec[64]; + enum machine_mode mode = d->vmode; + if (mode == V8DFmode) + mode = V8DImode; + else if (mode == V16SFmode) + mode = V16SImode; + for (i = 0; i < nelt; ++i) + vec[i] = GEN_INT (d->perm[i]); + rtx mask = gen_rtx_CONST_VECTOR (mode, gen_rtvec_v (nelt, vec)); + if (ix86_expand_vec_perm_vpermi2 (d->target, d->op0, mask, d->op1)) + return true; + return false; } @@ -42073,6 +44430,10 @@ ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode, /* Given sufficient ISA support we can just return true here for selected vector modes. */ + if (d.vmode == V16SImode || d.vmode == V16SFmode + || d.vmode == V8DFmode || d.vmode == V8DImode) + /* All implementable with a single vpermi2 insn. */ + return true; if (GET_MODE_SIZE (d.vmode) == 16) { /* All implementable with a single vpperm insn. */ @@ -42315,7 +44676,7 @@ ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2, op2 = force_reg (mode, op2); /* We only play even/odd games with vectors of SImode. */ - gcc_assert (mode == V4SImode || mode == V8SImode); + gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode); /* If we're looking for the odd results, shift those members down to the even slots. For some cpus this is faster than a PSHUFD. */ @@ -42341,7 +44702,14 @@ ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2, op2 = gen_lowpart (mode, op2); } - if (mode == V8SImode) + if (mode == V16SImode) + { + if (uns_p) + x = gen_vec_widen_umult_even_v16si (dest, op1, op2); + else + x = gen_vec_widen_smult_even_v16si (dest, op1, op2); + } + else if (mode == V8SImode) { if (uns_p) x = gen_vec_widen_umult_even_v8si (dest, op1, op2); @@ -42561,6 +44929,11 @@ ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2) umul = gen_vec_widen_umult_even_v8si; nmode = V8SImode; } + else if (mode == V8DImode) + { + umul = gen_vec_widen_umult_even_v16si; + nmode = V16SImode; + } else gcc_unreachable (); @@ -43707,12 +46080,16 @@ ix86_preferred_simd_mode (enum machine_mode mode) case HImode: return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V16HImode : V8HImode; case SImode: - return (TARGET_AVX && !TARGET_PREFER_AVX128) ? 
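Illustrative sketch (editorial aside, not part of the upstream patch): a scalar C rendering of the Newton-Raphson step that ix86_emit_swsqrtsf emits, following the formula in the comment above. With x0 close to 1/sqrt(a), a*x0*x0 is close to 1, the factor (a*x0*x0 - 3.0) is close to -2, and the product collapses to roughly a*x0, i.e. sqrt(a), while roughly squaring the relative error of the estimate.

/* One refinement step: an improved sqrt (a) from the rsqrt estimate x0,
   mirroring sqrt(a) = -0.5 * a * x0 * (a * x0 * x0 - 3.0).  */
static float
nr_sqrt_step (float a, float x0)
{
  float e0 = a * x0;            /* e0 = a * x0             */
  float e1 = e0 * x0 - 3.0f;    /* e1 = a * x0 * x0 - 3.0  */
  return -0.5f * e0 * e1;
}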
V8SImode : V4SImode; + return TARGET_AVX512F ? V16SImode : + (TARGET_AVX && !TARGET_PREFER_AVX128) ? V8SImode : V4SImode; case DImode: - return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V4DImode : V2DImode; + return TARGET_AVX512F ? V8DImode : + (TARGET_AVX && !TARGET_PREFER_AVX128) ? V4DImode : V2DImode; case SFmode: - if (TARGET_AVX && !TARGET_PREFER_AVX128) + if (TARGET_AVX512F) + return V16SFmode; + else if (TARGET_AVX && !TARGET_PREFER_AVX128) return V8SFmode; else return V4SFmode; @@ -43720,6 +46097,8 @@ ix86_preferred_simd_mode (enum machine_mode mode) case DFmode: if (!TARGET_VECTORIZE_DOUBLE) return word_mode; + else if (TARGET_AVX512F) + return V8DFmode; else if (TARGET_AVX && !TARGET_PREFER_AVX128) return V4DFmode; else if (TARGET_SSE2) @@ -43732,12 +46111,14 @@ ix86_preferred_simd_mode (enum machine_mode mode) } /* If AVX is enabled then try vectorizing with both 256bit and 128bit - vectors. */ + vectors. If AVX512F is enabled then try vectorizing with 512bit, + 256bit and 128bit vectors. */ static unsigned int ix86_autovectorize_vector_sizes (void) { - return (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0; + return TARGET_AVX512F ? 64 | 32 | 16 : + (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0; } @@ -43860,7 +46241,7 @@ ix86_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node, || (clonei->simdlen & (clonei->simdlen - 1)) != 0)) { warning_at (DECL_SOURCE_LOCATION (node->decl), 0, - "unsupported simdlen %d\n", clonei->simdlen); + "unsupported simdlen %d", clonei->simdlen); return 0; } @@ -44020,6 +46401,64 @@ ix86_simd_clone_usable (struct cgraph_node *node) } } +/* This function gives out the number of memory references. + This value determines the unrolling factor for + bdver3 and bdver4 architectures. */ + +static int +ix86_loop_memcount (rtx *x, unsigned *mem_count) +{ + if (*x != NULL_RTX && MEM_P (*x)) + { + enum machine_mode mode; + unsigned int n_words; + + mode = GET_MODE (*x); + n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD; + + if (n_words > 4) + (*mem_count)+=2; + else + (*mem_count)+=1; + } + return 0; +} + +/* This function adjusts the unroll factor based on + the hardware capabilities. For ex, bdver3 has + a loop buffer which makes unrolling of smaller + loops less important. This function decides the + unroll factor using number of memory references + (value 32 is used) as a heuristic. */ + +static unsigned +ix86_loop_unroll_adjust (unsigned nunroll, struct loop *loop) +{ + basic_block *bbs; + rtx insn; + unsigned i; + unsigned mem_count = 0; + + if (!TARGET_ADJUST_UNROLL) + return nunroll; + + /* Count the number of memory references within the loop body. */ + bbs = get_loop_body (loop); + for (i = 0; i < loop->num_nodes; i++) + { + for (insn = BB_HEAD (bbs[i]); insn != BB_END (bbs[i]); insn = NEXT_INSN (insn)) + if (NONDEBUG_INSN_P (insn)) + for_each_rtx (&insn, (rtx_function) ix86_loop_memcount, &mem_count); + } + free (bbs); + + if (mem_count && mem_count <=32) + return 32/mem_count; + + return nunroll; +} + + /* Implement TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P. 
*/ static bool @@ -44505,6 +46944,9 @@ ix86_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update) #define TARGET_INIT_LIBFUNCS darwin_rename_builtins #endif +#undef TARGET_LOOP_UNROLL_ADJUST +#define TARGET_LOOP_UNROLL_ADJUST ix86_loop_unroll_adjust + #undef TARGET_SPILL_CLASS #define TARGET_SPILL_CLASS ix86_spill_class diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 7efd1e01f4e..cdaab3684e1 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -1,5 +1,5 @@ /* Definitions of target machine for GCC for IA-32. - Copyright (C) 1988-2013 Free Software Foundation, Inc. + Copyright (C) 1988-2014 Free Software Foundation, Inc. This file is part of GCC. @@ -102,6 +102,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see #define TARGET_CRC32_P(x) TARGET_ISA_CRC32_P(x) #define TARGET_AES TARGET_ISA_AES #define TARGET_AES_P(x) TARGET_ISA_AES_P(x) +#define TARGET_SHA TARGET_ISA_SHA +#define TARGET_SHA_P(x) TARGET_ISA_SHA_P(x) #define TARGET_PCLMUL TARGET_ISA_PCLMUL #define TARGET_PCLMUL_P(x) TARGET_ISA_PCLMUL_P(x) #define TARGET_CMPXCHG16B TARGET_ISA_CX16 @@ -247,10 +249,10 @@ extern const struct processor_costs ix86_size_cost; /* Macros used in the machine description to test the flags. */ -/* configure can arrange to make this 2, to force a 486. */ +/* configure can arrange to change it. */ #ifndef TARGET_CPU_DEFAULT -#define TARGET_CPU_DEFAULT TARGET_CPU_DEFAULT_generic +#define TARGET_CPU_DEFAULT PROCESSOR_GENERIC #endif #ifndef TARGET_FPMATH_DEFAULT @@ -301,9 +303,11 @@ extern const struct processor_costs ix86_size_cost; #define TARGET_ATHLON_K8 (TARGET_K8 || TARGET_ATHLON) #define TARGET_NOCONA (ix86_tune == PROCESSOR_NOCONA) #define TARGET_CORE2 (ix86_tune == PROCESSOR_CORE2) -#define TARGET_COREI7 (ix86_tune == PROCESSOR_COREI7) -#define TARGET_COREI7_AVX (ix86_tune == PROCESSOR_COREI7_AVX) +#define TARGET_NEHALEM (ix86_tune == PROCESSOR_NEHALEM) +#define TARGET_SANDYBRIDGE (ix86_tune == PROCESSOR_SANDYBRIDGE) #define TARGET_HASWELL (ix86_tune == PROCESSOR_HASWELL) +#define TARGET_BONNELL (ix86_tune == PROCESSOR_BONNELL) +#define TARGET_SILVERMONT (ix86_tune == PROCESSOR_SILVERMONT) #define TARGET_GENERIC (ix86_tune == PROCESSOR_GENERIC) #define TARGET_AMDFAM10 (ix86_tune == PROCESSOR_AMDFAM10) #define TARGET_BDVER1 (ix86_tune == PROCESSOR_BDVER1) @@ -312,8 +316,6 @@ extern const struct processor_costs ix86_size_cost; #define TARGET_BDVER4 (ix86_tune == PROCESSOR_BDVER4) #define TARGET_BTVER1 (ix86_tune == PROCESSOR_BTVER1) #define TARGET_BTVER2 (ix86_tune == PROCESSOR_BTVER2) -#define TARGET_ATOM (ix86_tune == PROCESSOR_ATOM) -#define TARGET_SLM (ix86_tune == PROCESSOR_SLM) /* Feature tests against the various tunings. */ enum ix86_tune_indices { @@ -443,6 +445,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST]; ix86_tune_features[X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE] #define TARGET_SPLIT_MEM_OPND_FOR_FP_CONVERTS \ ix86_tune_features[X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS] +#define TARGET_ADJUST_UNROLL \ + ix86_tune_features[X86_TUNE_ADJUST_UNROLL] /* Feature tests against the various architecture variations. */ enum ix86_arch_indices { @@ -605,47 +609,6 @@ extern const char *host_detect_local_cpu (int argc, const char **argv); /* Target Pragmas. 
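The new ix86_loop_memcount / ix86_loop_unroll_adjust pair above, registered through TARGET_LOOP_UNROLL_ADJUST and gated on the X86_TUNE_ADJUST_UNROLL tuning flag, derives the RTL unroll factor from the number of memory references in the loop body, aiming at roughly 32 counted references after unrolling (accesses wider than four machine words count double). A minimal C sketch of the heuristic, with a hypothetical count_mem_refs() in place of the for_each_rtx walk over the loop's basic blocks:

/* Hypothetical stand-in for ix86_loop_memcount: every memory reference
   counts once, references wider than four machine words count twice.  */
extern unsigned count_mem_refs (const void *loop_body);

/* Same shape as ix86_loop_unroll_adjust: leave the factor alone unless
   the tuning flag asks for adjustment, otherwise derive it from the
   memory-reference count so the unrolled body has about 32 of them.  */
static unsigned
adjust_unroll (unsigned nunroll, const void *loop_body, int adjust_enabled)
{
  if (!adjust_enabled)
    return nunroll;

  unsigned mem_count = count_mem_refs (loop_body);
  if (mem_count != 0 && mem_count <= 32)
    return 32 / mem_count;      /* e.g. 4 references -> unroll by 8 */

  return nunroll;
}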
*/ #define REGISTER_TARGET_PRAGMAS() ix86_register_pragmas () -enum target_cpu_default -{ - TARGET_CPU_DEFAULT_generic = 0, - - TARGET_CPU_DEFAULT_i386, - TARGET_CPU_DEFAULT_i486, - TARGET_CPU_DEFAULT_pentium, - TARGET_CPU_DEFAULT_pentium_mmx, - TARGET_CPU_DEFAULT_pentiumpro, - TARGET_CPU_DEFAULT_pentium2, - TARGET_CPU_DEFAULT_pentium3, - TARGET_CPU_DEFAULT_pentium4, - TARGET_CPU_DEFAULT_pentium_m, - TARGET_CPU_DEFAULT_prescott, - TARGET_CPU_DEFAULT_nocona, - TARGET_CPU_DEFAULT_core2, - TARGET_CPU_DEFAULT_corei7, - TARGET_CPU_DEFAULT_corei7_avx, - TARGET_CPU_DEFAULT_haswell, - TARGET_CPU_DEFAULT_atom, - TARGET_CPU_DEFAULT_slm, - TARGET_CPU_DEFAULT_intel, - - TARGET_CPU_DEFAULT_geode, - TARGET_CPU_DEFAULT_k6, - TARGET_CPU_DEFAULT_k6_2, - TARGET_CPU_DEFAULT_k6_3, - TARGET_CPU_DEFAULT_athlon, - TARGET_CPU_DEFAULT_athlon_sse, - TARGET_CPU_DEFAULT_k8, - TARGET_CPU_DEFAULT_amdfam10, - TARGET_CPU_DEFAULT_bdver1, - TARGET_CPU_DEFAULT_bdver2, - TARGET_CPU_DEFAULT_bdver3, - TARGET_CPU_DEFAULT_bdver4, - TARGET_CPU_DEFAULT_btver1, - TARGET_CPU_DEFAULT_btver2, - - TARGET_CPU_DEFAULT_max -}; - #ifndef CC1_SPEC #define CC1_SPEC "%(cc1_cpu) " #endif @@ -2203,25 +2166,28 @@ do { \ with x86-64 medium memory model */ #define DEFAULT_LARGE_SECTION_THRESHOLD 65536 -/* Which processor to tune code generation for. */ +/* Which processor to tune code generation for. These must be in sync + with processor_target_table in i386.c. */ enum processor_type { - PROCESSOR_I386 = 0, /* 80386 */ + PROCESSOR_GENERIC = 0, + PROCESSOR_I386, /* 80386 */ PROCESSOR_I486, /* 80486DX, 80486SX, 80486DX[24] */ PROCESSOR_PENTIUM, PROCESSOR_PENTIUMPRO, - PROCESSOR_GEODE, - PROCESSOR_K6, - PROCESSOR_ATHLON, PROCESSOR_PENTIUM4, - PROCESSOR_K8, PROCESSOR_NOCONA, PROCESSOR_CORE2, - PROCESSOR_COREI7, - PROCESSOR_COREI7_AVX, + PROCESSOR_NEHALEM, + PROCESSOR_SANDYBRIDGE, PROCESSOR_HASWELL, - PROCESSOR_GENERIC, + PROCESSOR_BONNELL, + PROCESSOR_SILVERMONT, + PROCESSOR_GEODE, + PROCESSOR_K6, + PROCESSOR_ATHLON, + PROCESSOR_K8, PROCESSOR_AMDFAM10, PROCESSOR_BDVER1, PROCESSOR_BDVER2, @@ -2229,8 +2195,6 @@ enum processor_type PROCESSOR_BDVER4, PROCESSOR_BTVER1, PROCESSOR_BTVER2, - PROCESSOR_ATOM, - PROCESSOR_SLM, PROCESSOR_max }; diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index ab5b33f6399..de0b2dd771b 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -1,5 +1,5 @@ ;; GCC machine description for IA-32 and x86-64. -;; Copyright (C) 1988-2013 Free Software Foundation, Inc. +;; Copyright (C) 1988-2014 Free Software Foundation, Inc. ;; Mostly by William Schelter. ;; x86_64 support added by Jan Hubicka ;; @@ -241,6 +241,16 @@ (ROUND_NO_EXC 0x8) ]) +;; Constants to represent AVX512F embeded rounding +(define_constants + [(ROUND_NEAREST_INT 0) + (ROUND_NEG_INF 1) + (ROUND_POS_INF 2) + (ROUND_ZERO 3) + (NO_ROUND 4) + (ROUND_SAE 5) + ]) + ;; Constants to represent pcomtrue/pcomfalse variants (define_constants [(PCOM_FALSE 0) @@ -354,7 +364,7 @@ ;; Processor type. -(define_attr "cpu" "none,pentium,pentiumpro,geode,k6,athlon,k8,core2,corei7, +(define_attr "cpu" "none,pentium,pentiumpro,geode,k6,athlon,k8,core2,nehalem, atom,slm,generic,amdfam10,bdver1,bdver2,bdver3,bdver4, btver1,btver2" (const (symbol_ref "ix86_schedule"))) @@ -904,6 +914,20 @@ (define_mode_iterator DWI [(DI "!TARGET_64BIT") (TI "TARGET_64BIT")]) +;; GET_MODE_SIZE for selected modes. As GET_MODE_SIZE is not +;; compile time constant, it is faster to use <MODE_SIZE> than +;; GET_MODE_SIZE (<MODE>mode). 
For XFmode which depends on +;; command line options just use GET_MODE_SIZE macro. +(define_mode_attr MODE_SIZE [(QI "1") (HI "2") (SI "4") (DI "8") (TI "16") + (SF "4") (DF "8") (XF "GET_MODE_SIZE (XFmode)") + (V16QI "16") (V32QI "32") (V64QI "64") + (V8HI "16") (V16HI "32") (V32HI "64") + (V4SI "16") (V8SI "32") (V16SI "64") + (V2DI "16") (V4DI "32") (V8DI "64") + (V1TI "16") (V2TI "32") (V4TI "64") + (V2DF "16") (V4DF "32") (V8DF "64") + (V4SF "16") (V8SF "32") (V16SF "64")]) + ;; Double word integer modes as mode attribute. (define_mode_attr DWI [(QI "HI") (HI "SI") (SI "DI") (DI "TI")]) (define_mode_attr dwi [(QI "hi") (HI "si") (SI "di") (DI "ti")]) @@ -2724,7 +2748,7 @@ "reload_completed" [(set (reg:P SP_REG) (plus:P (reg:P SP_REG) (match_dup 2))) (set (mem:SF (reg:P SP_REG)) (match_dup 1))] - "operands[2] = GEN_INT (-GET_MODE_SIZE (<P:MODE>mode));") + "operands[2] = GEN_INT (-<P:MODE_SIZE>);") (define_split [(set (match_operand:SF 0 "push_operand") @@ -5760,7 +5784,7 @@ enum machine_mode mode = <MODE>mode; rtx pat; - if (GET_MODE_SIZE (mode) < GET_MODE_SIZE (SImode)) + if (<MODE_SIZE> < GET_MODE_SIZE (SImode)) { mode = SImode; operands[0] = gen_lowpart (mode, operands[0]); @@ -17393,7 +17417,7 @@ [(parallel [(set (match_dup 0) (const_int -1)) (clobber (reg:CC FLAGS_REG))])] { - if (GET_MODE_SIZE (<MODE>mode) < GET_MODE_SIZE (SImode)) + if (<MODE_SIZE> < GET_MODE_SIZE (SImode)) operands[0] = gen_lowpart (SImode, operands[0]); }) diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt index 1704c526746..26cd8bb7b07 100644 --- a/gcc/config/i386/i386.opt +++ b/gcc/config/i386/i386.opt @@ -1,6 +1,6 @@ ; Options for the IA-32 and AMD64 ports of the compiler. -; Copyright (C) 2005-2013 Free Software Foundation, Inc. +; Copyright (C) 2005-2014 Free Software Foundation, Inc. ; ; This file is part of GCC. ; @@ -725,6 +725,10 @@ maes Target Report Mask(ISA_AES) Var(ix86_isa_flags) Save Support AES built-in functions and code generation +msha +Target Report Mask(ISA_SHA) Var(ix86_isa_flags) Save +Support SHA1 and SHA256 built-in functions and code generation + mpclmul Target Report Mask(ISA_PCLMUL) Var(ix86_isa_flags) Save Support PCLMUL built-in functions and code generation diff --git a/gcc/config/i386/i386elf.h b/gcc/config/i386/i386elf.h index e3adf203dd1..73e119dddd3 100644 --- a/gcc/config/i386/i386elf.h +++ b/gcc/config/i386/i386elf.h @@ -1,5 +1,5 @@ /* Target definitions for GCC for Intel 80386 using ELF - Copyright (C) 1988-2013 Free Software Foundation, Inc. + Copyright (C) 1988-2014 Free Software Foundation, Inc. Derived from sysv4.h written by Ron Guilmette (rfg@netcom.com). diff --git a/gcc/config/i386/ia32intrin.h b/gcc/config/i386/ia32intrin.h index 65642e46023..5e7c893fe85 100644 --- a/gcc/config/i386/ia32intrin.h +++ b/gcc/config/i386/ia32intrin.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2009-2013 Free Software Foundation, Inc. +/* Copyright (C) 2009-2014 Free Software Foundation, Inc. This file is part of GCC. diff --git a/gcc/config/i386/immintrin.h b/gcc/config/i386/immintrin.h index e825c34a256..73b48599277 100644 --- a/gcc/config/i386/immintrin.h +++ b/gcc/config/i386/immintrin.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2008-2013 Free Software Foundation, Inc. +/* Copyright (C) 2008-2014 Free Software Foundation, Inc. This file is part of GCC. 
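The ROUND_* constants added to i386.md above name the AVX-512 embedded rounding-control and suppress-all-exceptions (SAE) encodings that the <round_name>/<round_saeonly_name> substitutions use throughout the sse.md changes further down. At the source level this machinery is reached through the *_round_* intrinsics; a hedged sketch, assuming _mm512_add_round_ps and the standard _MM_FROUND_* macros are available from <immintrin.h> when compiling with -mavx512f:

#include <immintrin.h>

/* Add two 512-bit vectors with the rounding mode encoded in the EVEX
   prefix itself (round toward zero) instead of being read from MXCSR;
   _MM_FROUND_NO_EXC additionally suppresses FP exceptions, which is the
   SAE-only form the compare/comi patterns accept.  */
__m512 add_toward_zero (__m512 a, __m512 b)
{
  return _mm512_add_round_ps (a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
}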
@@ -42,6 +42,16 @@ #include <avx2intrin.h> +#include <avx512fintrin.h> + +#include <avx512erintrin.h> + +#include <avx512pfintrin.h> + +#include <avx512cdintrin.h> + +#include <shaintrin.h> + #include <lzcntintrin.h> #include <bmiintrin.h> diff --git a/gcc/config/i386/interix.opt b/gcc/config/i386/interix.opt index af0c2381bca..a8c7230a26a 100644 --- a/gcc/config/i386/interix.opt +++ b/gcc/config/i386/interix.opt @@ -1,6 +1,6 @@ ; Interix-specific options. -; Copyright (C) 2005-2013 Free Software Foundation, Inc. +; Copyright (C) 2005-2014 Free Software Foundation, Inc. ; ; This file is part of GCC. ; diff --git a/gcc/config/i386/k6.md b/gcc/config/i386/k6.md index 226aa396d50..dadb39201fd 100644 --- a/gcc/config/i386/k6.md +++ b/gcc/config/i386/k6.md @@ -1,5 +1,5 @@ ;; AMD K6/K6-2 Scheduling -;; Copyright (C) 2002-2013 Free Software Foundation, Inc. +;; Copyright (C) 2002-2014 Free Software Foundation, Inc. ;; ;; This file is part of GCC. ;; diff --git a/gcc/config/i386/kfreebsd-gnu.h b/gcc/config/i386/kfreebsd-gnu.h index 4891285a5c9..e487205a747 100644 --- a/gcc/config/i386/kfreebsd-gnu.h +++ b/gcc/config/i386/kfreebsd-gnu.h @@ -1,5 +1,5 @@ /* Definitions for Intel 386 running kFreeBSD-based GNU systems with ELF format - Copyright (C) 2011-2013 Free Software Foundation, Inc. + Copyright (C) 2011-2014 Free Software Foundation, Inc. Contributed by Robert Millan. This file is part of GCC. diff --git a/gcc/config/i386/kfreebsd-gnu64.h b/gcc/config/i386/kfreebsd-gnu64.h index c407139b152..1c75c8eb587 100644 --- a/gcc/config/i386/kfreebsd-gnu64.h +++ b/gcc/config/i386/kfreebsd-gnu64.h @@ -1,5 +1,5 @@ /* Definitions for AMD x86-64 running kFreeBSD-based GNU systems with ELF format - Copyright (C) 2011-2013 Free Software Foundation, Inc. + Copyright (C) 2011-2014 Free Software Foundation, Inc. Contributed by Robert Millan. This file is part of GCC. diff --git a/gcc/config/i386/knetbsd-gnu.h b/gcc/config/i386/knetbsd-gnu.h index 515854bd084..23bf1292236 100644 --- a/gcc/config/i386/knetbsd-gnu.h +++ b/gcc/config/i386/knetbsd-gnu.h @@ -1,5 +1,5 @@ /* Definitions for Intel 386 running kNetBSD-based GNU systems with ELF format - Copyright (C) 2004-2013 Free Software Foundation, Inc. + Copyright (C) 2004-2014 Free Software Foundation, Inc. Contributed by Robert Millan. This file is part of GCC. diff --git a/gcc/config/i386/kopensolaris-gnu.h b/gcc/config/i386/kopensolaris-gnu.h index 5b128f1f5f2..73ca5518a3b 100644 --- a/gcc/config/i386/kopensolaris-gnu.h +++ b/gcc/config/i386/kopensolaris-gnu.h @@ -1,5 +1,5 @@ /* Definitions for Intel 386 running kOpenSolaris-based GNU systems with ELF format - Copyright (C) 2009-2013 Free Software Foundation, Inc. + Copyright (C) 2009-2014 Free Software Foundation, Inc. Contributed by Robert Millan. This file is part of GCC. diff --git a/gcc/config/i386/linux-common.h b/gcc/config/i386/linux-common.h index 52f0baf202e..1eaf024a6b5 100644 --- a/gcc/config/i386/linux-common.h +++ b/gcc/config/i386/linux-common.h @@ -1,5 +1,5 @@ /* Definitions for Intel 386 running Linux-based GNU systems with ELF format. - Copyright (C) 2012-2013 Free Software Foundation, Inc. + Copyright (C) 2012-2014 Free Software Foundation, Inc. Contributed by Ilya Enkovich. This file is part of GCC. diff --git a/gcc/config/i386/linux.h b/gcc/config/i386/linux.h index 3c95ee00a45..1fb1e032177 100644 --- a/gcc/config/i386/linux.h +++ b/gcc/config/i386/linux.h @@ -1,5 +1,5 @@ /* Definitions for Intel 386 running Linux-based GNU systems with ELF format. 
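With <avx512fintrin.h>, <avx512erintrin.h>, <avx512pfintrin.h>, <avx512cdintrin.h> and <shaintrin.h> now pulled in by the umbrella header, user code keeps including only <immintrin.h>; the sub-headers reject direct inclusion through their #error guards, and their #pragma GCC target blocks allow per-function use of the new instructions. A minimal usage sketch (the function-level target attribute is one option; building the whole file with -mavx512f works as well):

#include <immintrin.h>  /* never <avx512fintrin.h> or <shaintrin.h> directly */

/* Only this function is compiled for AVX-512F; the rest of the file can
   keep the plain x86-64 baseline.  Computes a*a + b element-wise.  */
__attribute__ ((target ("avx512f")))
__m512 square_add (__m512 a, __m512 b)
{
  return _mm512_add_ps (_mm512_mul_ps (a, a), b);
}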
- Copyright (C) 1994-2013 Free Software Foundation, Inc. + Copyright (C) 1994-2014 Free Software Foundation, Inc. Contributed by Eric Youngdale. Modified for stabs-in-ELF by H.J. Lu. diff --git a/gcc/config/i386/linux64.h b/gcc/config/i386/linux64.h index b793e0826c9..a90171e8c54 100644 --- a/gcc/config/i386/linux64.h +++ b/gcc/config/i386/linux64.h @@ -1,5 +1,5 @@ /* Definitions for AMD x86-64 running Linux-based GNU systems with ELF format. - Copyright (C) 2001-2013 Free Software Foundation, Inc. + Copyright (C) 2001-2014 Free Software Foundation, Inc. Contributed by Jan Hubicka <jh@suse.cz>, based on linux.h. This file is part of GCC. diff --git a/gcc/config/i386/lwpintrin.h b/gcc/config/i386/lwpintrin.h index 64ba7321fd9..1cd046a9965 100644 --- a/gcc/config/i386/lwpintrin.h +++ b/gcc/config/i386/lwpintrin.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2013 Free Software Foundation, Inc. +/* Copyright (C) 2007-2014 Free Software Foundation, Inc. This file is part of GCC. diff --git a/gcc/config/i386/lynx.h b/gcc/config/i386/lynx.h index bb48d96e66f..910930e71fd 100644 --- a/gcc/config/i386/lynx.h +++ b/gcc/config/i386/lynx.h @@ -1,5 +1,5 @@ /* Definitions for LynxOS on i386. - Copyright (C) 1993-2013 Free Software Foundation, Inc. + Copyright (C) 1993-2014 Free Software Foundation, Inc. This file is part of GCC. diff --git a/gcc/config/i386/lzcntintrin.h b/gcc/config/i386/lzcntintrin.h index 22b9ee7999e..b680a353910 100644 --- a/gcc/config/i386/lzcntintrin.h +++ b/gcc/config/i386/lzcntintrin.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2009-2013 Free Software Foundation, Inc. +/* Copyright (C) 2009-2014 Free Software Foundation, Inc. This file is part of GCC. diff --git a/gcc/config/i386/mingw-pthread.h b/gcc/config/i386/mingw-pthread.h index d8fe4fc6748..99753cf0381 100644 --- a/gcc/config/i386/mingw-pthread.h +++ b/gcc/config/i386/mingw-pthread.h @@ -1,6 +1,6 @@ /* Defines that pthread library shall be enabled by default for target. - Copyright (C) 2011-2013 Free Software Foundation, Inc. + Copyright (C) 2011-2014 Free Software Foundation, Inc. This file is part of GCC. diff --git a/gcc/config/i386/mingw-stdint.h b/gcc/config/i386/mingw-stdint.h index 3d1bed4989b..1589d96bf6e 100644 --- a/gcc/config/i386/mingw-stdint.h +++ b/gcc/config/i386/mingw-stdint.h @@ -1,5 +1,5 @@ /* Definitions for <stdint.h> types on systems using mingw. - Copyright (C) 2009-2013 Free Software Foundation, Inc. + Copyright (C) 2009-2014 Free Software Foundation, Inc. This file is part of GCC. diff --git a/gcc/config/i386/mingw-w64.h b/gcc/config/i386/mingw-w64.h index 633009baee4..b7436be0484 100644 --- a/gcc/config/i386/mingw-w64.h +++ b/gcc/config/i386/mingw-w64.h @@ -1,7 +1,7 @@ /* Operating system specific defines to be used when targeting GCC for hosting on Windows 32/64 via mingw-w64 runtime, using GNU tools and the Windows API Library. - Copyright (C) 2009-2013 Free Software Foundation, Inc. + Copyright (C) 2009-2014 Free Software Foundation, Inc. This file is part of GCC. diff --git a/gcc/config/i386/mingw-w64.opt b/gcc/config/i386/mingw-w64.opt index a54449e372f..90e01f3628b 100644 --- a/gcc/config/i386/mingw-w64.opt +++ b/gcc/config/i386/mingw-w64.opt @@ -1,6 +1,6 @@ ; MinGW-w64-specific options. -; Copyright (C) 2009-2013 Free Software Foundation, Inc. +; Copyright (C) 2009-2014 Free Software Foundation, Inc. ; ; This file is part of GCC. 
; diff --git a/gcc/config/i386/mingw.opt b/gcc/config/i386/mingw.opt index 03419043d51..44fecb0cce9 100644 --- a/gcc/config/i386/mingw.opt +++ b/gcc/config/i386/mingw.opt @@ -1,6 +1,6 @@ ; MinGW-specific options. -; Copyright (C) 2008-2013 Free Software Foundation, Inc. +; Copyright (C) 2008-2014 Free Software Foundation, Inc. ; ; This file is part of GCC. ; diff --git a/gcc/config/i386/mingw32.h b/gcc/config/i386/mingw32.h index 1ac55441725..f5638209504 100644 --- a/gcc/config/i386/mingw32.h +++ b/gcc/config/i386/mingw32.h @@ -1,6 +1,6 @@ /* Operating system specific defines to be used when targeting GCC for hosting on Windows32, using GNU tools and the Windows32 API Library. - Copyright (C) 1997-2013 Free Software Foundation, Inc. + Copyright (C) 1997-2014 Free Software Foundation, Inc. This file is part of GCC. diff --git a/gcc/config/i386/mm3dnow.h b/gcc/config/i386/mm3dnow.h index 093d5e77932..bf847f939fb 100644 --- a/gcc/config/i386/mm3dnow.h +++ b/gcc/config/i386/mm3dnow.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2004-2013 Free Software Foundation, Inc. +/* Copyright (C) 2004-2014 Free Software Foundation, Inc. This file is part of GCC. diff --git a/gcc/config/i386/mmintrin.h b/gcc/config/i386/mmintrin.h index c0729709373..b351200e569 100644 --- a/gcc/config/i386/mmintrin.h +++ b/gcc/config/i386/mmintrin.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2002-2013 Free Software Foundation, Inc. +/* Copyright (C) 2002-2014 Free Software Foundation, Inc. This file is part of GCC. diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md index cc0db3a9d06..239e5db7105 100644 --- a/gcc/config/i386/mmx.md +++ b/gcc/config/i386/mmx.md @@ -1,5 +1,5 @@ ;; GCC machine description for MMX and 3dNOW! instructions -;; Copyright (C) 2005-2013 Free Software Foundation, Inc. +;; Copyright (C) 2005-2014 Free Software Foundation, Inc. ;; ;; This file is part of GCC. ;; diff --git a/gcc/config/i386/msformat-c.c b/gcc/config/i386/msformat-c.c index 34381050990..304d48f2090 100644 --- a/gcc/config/i386/msformat-c.c +++ b/gcc/config/i386/msformat-c.c @@ -1,5 +1,5 @@ /* Check calls to formatted I/O functions (-Wformat). - Copyright (C) 1992-2013 Free Software Foundation, Inc. + Copyright (C) 1992-2014 Free Software Foundation, Inc. This file is part of GCC. diff --git a/gcc/config/i386/netbsd-elf.h b/gcc/config/i386/netbsd-elf.h index 14e86328ba7..e575b39cbde 100644 --- a/gcc/config/i386/netbsd-elf.h +++ b/gcc/config/i386/netbsd-elf.h @@ -1,6 +1,6 @@ /* Definitions of target machine for GCC, for i386/ELF NetBSD systems. - Copyright (C) 2001-2013 Free Software Foundation, Inc. + Copyright (C) 2001-2014 Free Software Foundation, Inc. Contributed by matthew green <mrg@eterna.com.au> This file is part of GCC. diff --git a/gcc/config/i386/netbsd64.h b/gcc/config/i386/netbsd64.h index 7882db6c4b8..f990835bd0f 100644 --- a/gcc/config/i386/netbsd64.h +++ b/gcc/config/i386/netbsd64.h @@ -1,6 +1,6 @@ /* Definitions of target machine for GCC, for x86-64/ELF NetBSD systems. - Copyright (C) 2002-2013 Free Software Foundation, Inc. + Copyright (C) 2002-2014 Free Software Foundation, Inc. Contributed by Wasabi Systems, Inc. This file is part of GCC. diff --git a/gcc/config/i386/nmmintrin.h b/gcc/config/i386/nmmintrin.h index aefe3ef9e90..9fc71073605 100644 --- a/gcc/config/i386/nmmintrin.h +++ b/gcc/config/i386/nmmintrin.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2013 Free Software Foundation, Inc. +/* Copyright (C) 2007-2014 Free Software Foundation, Inc. This file is part of GCC. 
diff --git a/gcc/config/i386/nto.h b/gcc/config/i386/nto.h index e17af0185b1..2abb9875151 100644 --- a/gcc/config/i386/nto.h +++ b/gcc/config/i386/nto.h @@ -1,5 +1,5 @@ /* Definitions for Intel 386 running QNX/Neutrino. - Copyright (C) 2002-2013 Free Software Foundation, Inc. + Copyright (C) 2002-2014 Free Software Foundation, Inc. This file is part of GCC. diff --git a/gcc/config/i386/nto.opt b/gcc/config/i386/nto.opt index f26964b8853..007894201c6 100644 --- a/gcc/config/i386/nto.opt +++ b/gcc/config/i386/nto.opt @@ -1,6 +1,6 @@ ; QNX options. -; Copyright (C) 2011-2013 Free Software Foundation, Inc. +; Copyright (C) 2011-2014 Free Software Foundation, Inc. ; ; This file is part of GCC. ; diff --git a/gcc/config/i386/openbsd.h b/gcc/config/i386/openbsd.h index 97debef06eb..f313d5cd1d5 100644 --- a/gcc/config/i386/openbsd.h +++ b/gcc/config/i386/openbsd.h @@ -1,5 +1,5 @@ /* Configuration for an OpenBSD i386 target. - Copyright (C) 1999-2013 Free Software Foundation, Inc. + Copyright (C) 1999-2014 Free Software Foundation, Inc. This file is part of GCC. diff --git a/gcc/config/i386/openbsdelf.h b/gcc/config/i386/openbsdelf.h index ab8fa549c34..46ae0b6cdad 100644 --- a/gcc/config/i386/openbsdelf.h +++ b/gcc/config/i386/openbsdelf.h @@ -1,6 +1,6 @@ /* Configuration for an OpenBSD i386 target. - Copyright (C) 2005-2013 Free Software Foundation, Inc. + Copyright (C) 2005-2014 Free Software Foundation, Inc. This file is part of GCC. diff --git a/gcc/config/i386/pentium.md b/gcc/config/i386/pentium.md index b9da7a2f95d..97fc55e2aa0 100644 --- a/gcc/config/i386/pentium.md +++ b/gcc/config/i386/pentium.md @@ -1,5 +1,5 @@ ;; Pentium Scheduling -;; Copyright (C) 2002-2013 Free Software Foundation, Inc. +;; Copyright (C) 2002-2014 Free Software Foundation, Inc. ;; ;; This file is part of GCC. ;; diff --git a/gcc/config/i386/pmm_malloc.h b/gcc/config/i386/pmm_malloc.h index d64cb38a2e8..3be2f3545d8 100644 --- a/gcc/config/i386/pmm_malloc.h +++ b/gcc/config/i386/pmm_malloc.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2004-2013 Free Software Foundation, Inc. +/* Copyright (C) 2004-2014 Free Software Foundation, Inc. This file is part of GCC. diff --git a/gcc/config/i386/pmmintrin.h b/gcc/config/i386/pmmintrin.h index 2447d5aa31b..6a795005c8a 100644 --- a/gcc/config/i386/pmmintrin.h +++ b/gcc/config/i386/pmmintrin.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2003-2013 Free Software Foundation, Inc. +/* Copyright (C) 2003-2014 Free Software Foundation, Inc. This file is part of GCC. diff --git a/gcc/config/i386/popcntintrin.h b/gcc/config/i386/popcntintrin.h index ee3a8e0d076..41845d86827 100644 --- a/gcc/config/i386/popcntintrin.h +++ b/gcc/config/i386/popcntintrin.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2009-2013 Free Software Foundation, Inc. +/* Copyright (C) 2009-2014 Free Software Foundation, Inc. This file is part of GCC. diff --git a/gcc/config/i386/ppro.md b/gcc/config/i386/ppro.md index b53c2a7edac..25b2a546c69 100644 --- a/gcc/config/i386/ppro.md +++ b/gcc/config/i386/ppro.md @@ -1,5 +1,5 @@ ;; Scheduling for the Intel P6 family of processors -;; Copyright (C) 2004-2013 Free Software Foundation, Inc. +;; Copyright (C) 2004-2014 Free Software Foundation, Inc. ;; ;; This file is part of GCC. ;; diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md index b86201924c3..9e6ea250401 100644 --- a/gcc/config/i386/predicates.md +++ b/gcc/config/i386/predicates.md @@ -1,5 +1,5 @@ ;; Predicate definitions for IA-32 and x86-64. -;; Copyright (C) 2004-2013 Free Software Foundation, Inc. 
+;; Copyright (C) 2004-2014 Free Software Foundation, Inc. ;; ;; This file is part of GCC. ;; diff --git a/gcc/config/i386/prfchwintrin.h b/gcc/config/i386/prfchwintrin.h index 73aa4cac7af..5c07c8606b5 100644 --- a/gcc/config/i386/prfchwintrin.h +++ b/gcc/config/i386/prfchwintrin.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2012-2013 Free Software Foundation, Inc. +/* Copyright (C) 2012-2014 Free Software Foundation, Inc. This file is part of GCC. diff --git a/gcc/config/i386/rdos.h b/gcc/config/i386/rdos.h index b7242bbc445..e8370c6c63f 100644 --- a/gcc/config/i386/rdos.h +++ b/gcc/config/i386/rdos.h @@ -1,5 +1,5 @@ /* Definitions for RDOS on i386. - Copyright (C) 2013 Free Software Foundation, Inc. + Copyright (C) 2013-2014 Free Software Foundation, Inc. This file is part of GCC. diff --git a/gcc/config/i386/rdos64.h b/gcc/config/i386/rdos64.h index 8522ad48c18..e6f089a008c 100644 --- a/gcc/config/i386/rdos64.h +++ b/gcc/config/i386/rdos64.h @@ -1,5 +1,5 @@ /* Definitions for RDOS on x86_64. - Copyright (C) 2013 Free Software Foundation, Inc. + Copyright (C) 2013-2014 Free Software Foundation, Inc. This file is part of GCC. diff --git a/gcc/config/i386/rdseedintrin.h b/gcc/config/i386/rdseedintrin.h index 3d040ab3af6..0ab18e55296 100644 --- a/gcc/config/i386/rdseedintrin.h +++ b/gcc/config/i386/rdseedintrin.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2012-2013 Free Software Foundation, Inc. +/* Copyright (C) 2012-2014 Free Software Foundation, Inc. This file is part of GCC. diff --git a/gcc/config/i386/rtemself.h b/gcc/config/i386/rtemself.h index 087179191cb..7c3a19ce6eb 100644 --- a/gcc/config/i386/rtemself.h +++ b/gcc/config/i386/rtemself.h @@ -1,5 +1,5 @@ /* Definitions for rtems targeting an ix86 using ELF. - Copyright (C) 1996-2013 Free Software Foundation, Inc. + Copyright (C) 1996-2014 Free Software Foundation, Inc. Contributed by Joel Sherrill (joel@OARcorp.com). This file is part of GCC. diff --git a/gcc/config/i386/rtmintrin.h b/gcc/config/i386/rtmintrin.h index eb2812fd82e..ac40d228a4c 100644 --- a/gcc/config/i386/rtmintrin.h +++ b/gcc/config/i386/rtmintrin.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2012-2013 Free Software Foundation, Inc. +/* Copyright (C) 2012-2014 Free Software Foundation, Inc. This file is part of GCC. diff --git a/gcc/config/i386/shaintrin.h b/gcc/config/i386/shaintrin.h new file mode 100644 index 00000000000..d8a3da3dafd --- /dev/null +++ b/gcc/config/i386/shaintrin.h @@ -0,0 +1,98 @@ +/* Copyright (C) 2013-2014 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef _IMMINTRIN_H_INCLUDED +#error "Never use <shaintrin.h> directly; include <immintrin.h> instead." 
+#endif + +#ifndef _SHAINTRIN_H_INCLUDED +#define _SHAINTRIN_H_INCLUDED + +#ifndef __SHA__ +#pragma GCC push_options +#pragma GCC target("sha") +#define __DISABLE_SHA__ +#endif /* __SHA__ */ + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sha1msg1_epu32 (__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_sha1msg1 ((__v4si) __A, (__v4si) __B); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sha1msg2_epu32 (__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_sha1msg2 ((__v4si) __A, (__v4si) __B); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sha1nexte_epu32 (__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_sha1nexte ((__v4si) __A, (__v4si) __B); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sha1rnds4_epu32 (__m128i __A, __m128i __B, const int __I) +{ + return (__m128i) __builtin_ia32_sha1rnds4 ((__v4si) __A, (__v4si) __B, __I); +} +#else +#define _mm_sha1rnds4_epu32(A, B, I) \ + ((__m128i) __builtin_ia32_sha1rnds4 ((__v4si)(__m128i)A, \ + (__v4si)(__m128i)B, (int)I)) +#endif + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sha256msg1_epu32 (__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_sha256msg1 ((__v4si) __A, (__v4si) __B); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sha256msg2_epu32 (__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_sha256msg2 ((__v4si) __A, (__v4si) __B); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sha256rnds2_epu32 (__m128i __A, __m128i __B, __m128i __C) +{ + return (__m128i) __builtin_ia32_sha256rnds2 ((__v4si) __A, (__v4si) __B, + (__v4si) __C); +} + +#ifdef __DISABLE_SHA__ +#undef __DISABLE_SHA__ +#pragma GCC pop_options +#endif /* __DISABLE_SHA__ */ + +#endif /* _SHAINTRIN_H_INCLUDED */ diff --git a/gcc/config/i386/slm.md b/gcc/config/i386/slm.md index 3ac919e372c..e3a8328c4e2 100644 --- a/gcc/config/i386/slm.md +++ b/gcc/config/i386/slm.md @@ -1,5 +1,5 @@ ;; Slivermont(SLM) Scheduling -;; Copyright (C) 2009, 2010 Free Software Foundation, Inc. +;; Copyright (C) 2009-2014 Free Software Foundation, Inc. ;; ;; This file is part of GCC. ;; diff --git a/gcc/config/i386/smmintrin.h b/gcc/config/i386/smmintrin.h index 20fa2ca2f94..886ace43f3b 100644 --- a/gcc/config/i386/smmintrin.h +++ b/gcc/config/i386/smmintrin.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2013 Free Software Foundation, Inc. +/* Copyright (C) 2007-2014 Free Software Foundation, Inc. This file is part of GCC. diff --git a/gcc/config/i386/sol2-bi.h b/gcc/config/i386/sol2-bi.h index 22c972b801f..66d17801f03 100644 --- a/gcc/config/i386/sol2-bi.h +++ b/gcc/config/i386/sol2-bi.h @@ -1,5 +1,5 @@ /* Definitions of target machine for GCC, for bi-arch Solaris 2/x86. - Copyright (C) 2004-2013 Free Software Foundation, Inc. + Copyright (C) 2004-2014 Free Software Foundation, Inc. Contributed by CodeSourcery, LLC. This file is part of GCC. diff --git a/gcc/config/i386/sol2.h b/gcc/config/i386/sol2.h index 3bf86d14be7..8a21a59109d 100644 --- a/gcc/config/i386/sol2.h +++ b/gcc/config/i386/sol2.h @@ -1,5 +1,5 @@ /* Target definitions for GCC for Intel 80386 running Solaris 2 - Copyright (C) 1993-2013 Free Software Foundation, Inc. 
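The shaintrin.h wrappers above map one-to-one onto the new SHA built-ins, enabled by the -msha option added in i386.opt. A small calling-convention sketch; it is not a complete SHA-1 implementation, only a demonstration that the round-selector argument of _mm_sha1rnds4_epu32 must be a compile-time constant (note the __OPTIMIZE__ macro fallback above):

#include <immintrin.h>

/* Perform two groups of four SHA-1 rounds on the packed state ABCD.
   The immediate picks the round-function/constant group (0-3) and has
   to be a literal; wk would normally hold the prepared message words
   plus E for that group (reusing it twice here is purely illustrative).  */
__attribute__ ((target ("sha")))
__m128i sha1_rounds_demo (__m128i abcd, __m128i wk)
{
  abcd = _mm_sha1rnds4_epu32 (abcd, wk, 0);   /* rounds group 0 */
  abcd = _mm_sha1rnds4_epu32 (abcd, wk, 1);   /* rounds group 1 */
  return abcd;
}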
+ Copyright (C) 1993-2014 Free Software Foundation, Inc. Contributed by Fred Fish (fnf@cygnus.com). This file is part of GCC. diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 30895c67c09..405f9988d9b 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -1,5 +1,5 @@ ;; GCC machine description for SSE instructions -;; Copyright (C) 2005-2013 Free Software Foundation, Inc. +;; Copyright (C) 2005-2014 Free Software Foundation, Inc. ;; ;; This file is part of GCC. ;; @@ -119,6 +119,15 @@ UNSPEC_EXP2 UNSPEC_RCP28 UNSPEC_RSQRT28 + + ;; For SHA support + UNSPEC_SHA1MSG1 + UNSPEC_SHA1MSG2 + UNSPEC_SHA1NEXTE + UNSPEC_SHA1RNDS4 + UNSPEC_SHA256MSG1 + UNSPEC_SHA256MSG2 + UNSPEC_SHA256RNDS2 ]) (define_c_enum "unspecv" [ @@ -660,24 +669,26 @@ /* There is no evex-encoded vmov* for sizes smaller than 64-bytes in avx512f, so we need to use workarounds, to access sse registers 16-31, which are evex-only. */ - if (TARGET_AVX512F && GET_MODE_SIZE (<MODE>mode) < 64 - && (EXT_REX_SSE_REGNO_P (REGNO (operands[0])) - || EXT_REX_SSE_REGNO_P (REGNO (operands[1])))) + if (TARGET_AVX512F && <MODE_SIZE> < 64 + && ((REG_P (operands[0]) + && EXT_REX_SSE_REGNO_P (REGNO (operands[0]))) + || (REG_P (operands[1]) + && EXT_REX_SSE_REGNO_P (REGNO (operands[1]))))) { if (memory_operand (operands[0], <MODE>mode)) { - if (GET_MODE_SIZE (<MODE>mode) == 32) + if (<MODE_SIZE> == 32) return "vextract<shuffletype>64x4\t{$0x0, %g1, %0|%0, %g1, 0x0}"; - else if (GET_MODE_SIZE (<MODE>mode) == 16) + else if (<MODE_SIZE> == 16) return "vextract<shuffletype>32x4\t{$0x0, %g1, %0|%0, %g1, 0x0}"; else gcc_unreachable (); } else if (memory_operand (operands[1], <MODE>mode)) { - if (GET_MODE_SIZE (<MODE>mode) == 32) + if (<MODE_SIZE> == 32) return "vbroadcast<shuffletype>64x4\t{%1, %g0|%g0, %1}"; - else if (GET_MODE_SIZE (<MODE>mode) == 16) + else if (<MODE_SIZE> == 16) return "vbroadcast<shuffletype>32x4\t{%1, %g0|%g0, %1}"; else gcc_unreachable (); @@ -748,8 +759,9 @@ (set (attr "mode") (cond [(match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL") (const_string "<ssePSmode>") - (and (eq_attr "alternative" "2") - (match_test "TARGET_SSE_TYPELESS_STORES")) + (and (match_test "<MODE_SIZE> == 16") + (and (eq_attr "alternative" "2") + (match_test "TARGET_SSE_TYPELESS_STORES"))) (const_string "<ssePSmode>") (match_test "TARGET_AVX") (const_string "<sseinsnmode>") @@ -912,7 +924,28 @@ DONE; }) -(define_insn "<sse>_loadu<ssemodesuffix><avxsizesuffix><mask_name>" +(define_expand "<sse>_loadu<ssemodesuffix><avxsizesuffix><mask_name>" + [(set (match_operand:VF 0 "register_operand") + (unspec:VF [(match_operand:VF 1 "nonimmediate_operand")] + UNSPEC_LOADU))] + "TARGET_SSE && <mask_mode512bit_condition>" +{ + /* For AVX, normal *mov<mode>_internal pattern will handle unaligned loads + just fine if misaligned_operand is true, and without the UNSPEC it can + be combined with arithmetic instructions. If misaligned_operand is + false, still emit UNSPEC_LOADU insn to honor user's request for + misaligned load. */ + if (TARGET_AVX + && misaligned_operand (operands[1], <MODE>mode) + /* FIXME: Revisit after AVX512F merge is completed. 
*/ + && !<mask_applied>) + { + emit_insn (gen_rtx_SET (VOIDmode, operands[0], operands[1])); + DONE; + } +}) + +(define_insn "*<sse>_loadu<ssemodesuffix><avxsizesuffix><mask_name>" [(set (match_operand:VF 0 "register_operand" "=v") (unspec:VF [(match_operand:VF 1 "nonimmediate_operand" "vm")] @@ -965,8 +998,9 @@ (set_attr "ssememalign" "8") (set_attr "prefix" "maybe_vex") (set (attr "mode") - (cond [(ior (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL") - (match_test "TARGET_SSE_TYPELESS_STORES")) + (cond [(and (match_test "<MODE_SIZE> == 16") + (ior (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL") + (match_test "TARGET_SSE_TYPELESS_STORES"))) (const_string "<ssePSmode>") (match_test "TARGET_AVX") (const_string "<MODE>") @@ -999,7 +1033,29 @@ (set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) -(define_insn "<sse2_avx_avx512f>_loaddqu<mode><mask_name>" +(define_expand "<sse2_avx_avx512f>_loaddqu<mode><mask_name>" + [(set (match_operand:VI_UNALIGNED_LOADSTORE 0 "register_operand") + (unspec:VI_UNALIGNED_LOADSTORE + [(match_operand:VI_UNALIGNED_LOADSTORE 1 "nonimmediate_operand")] + UNSPEC_LOADU))] + "TARGET_SSE2 && <mask_mode512bit_condition>" +{ + /* For AVX, normal *mov<mode>_internal pattern will handle unaligned loads + just fine if misaligned_operand is true, and without the UNSPEC it can + be combined with arithmetic instructions. If misaligned_operand is + false, still emit UNSPEC_LOADU insn to honor user's request for + misaligned load. */ + if (TARGET_AVX + && misaligned_operand (operands[1], <MODE>mode) + /* FIXME: Revisit after AVX512F merge is completed. */ + && !<mask_applied>) + { + emit_insn (gen_rtx_SET (VOIDmode, operands[0], operands[1])); + DONE; + } +}) + +(define_insn "*<sse2_avx_avx512f>_loaddqu<mode><mask_name>" [(set (match_operand:VI_UNALIGNED_LOADSTORE 0 "register_operand" "=v") (unspec:VI_UNALIGNED_LOADSTORE [(match_operand:VI_UNALIGNED_LOADSTORE 1 "nonimmediate_operand" "vm")] @@ -1048,6 +1104,7 @@ { switch (get_attr_mode (insn)) { + case MODE_V16SF: case MODE_V8SF: case MODE_V4SF: return "%vmovups\t{%1, %0|%0, %1}"; @@ -1070,8 +1127,9 @@ (const_string "1"))) (set_attr "prefix" "maybe_vex") (set (attr "mode") - (cond [(ior (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL") - (match_test "TARGET_SSE_TYPELESS_STORES")) + (cond [(and (match_test "<MODE_SIZE> == 16") + (ior (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL") + (match_test "TARGET_SSE_TYPELESS_STORES"))) (const_string "<ssePSmode>") (match_test "TARGET_AVX") (const_string "<sseinsnmode>") @@ -1229,83 +1287,83 @@ } [(set_attr "isa" "noavx,noavx,avx,avx")]) -(define_expand "<plusminus_insn><mode>3<mask_name>" +(define_expand "<plusminus_insn><mode>3<mask_name><round_name>" [(set (match_operand:VF 0 "register_operand") (plusminus:VF - (match_operand:VF 1 "nonimmediate_operand") - (match_operand:VF 2 "nonimmediate_operand")))] - "TARGET_SSE && <mask_mode512bit_condition>" + (match_operand:VF 1 "<round_nimm_predicate>") + (match_operand:VF 2 "<round_nimm_predicate>")))] + "TARGET_SSE && <mask_mode512bit_condition> && <round_mode512bit_condition>" "ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);") -(define_insn "*<plusminus_insn><mode>3<mask_name>" +(define_insn "*<plusminus_insn><mode>3<mask_name><round_name>" [(set (match_operand:VF 0 "register_operand" "=x,v") (plusminus:VF - (match_operand:VF 1 "nonimmediate_operand" "<comm>0,v") - (match_operand:VF 2 "nonimmediate_operand" "xm,vm")))] - "TARGET_SSE && ix86_binary_operator_ok (<CODE>, <MODE>mode, operands) && 
<mask_mode512bit_condition>" + (match_operand:VF 1 "<round_nimm_predicate>" "<comm>0,v") + (match_operand:VF 2 "<round_nimm_predicate>" "xm,<round_constraint>")))] + "TARGET_SSE && ix86_binary_operator_ok (<CODE>, <MODE>mode, operands) && <mask_mode512bit_condition> && <round_mode512bit_condition>" "@ <plusminus_mnemonic><ssemodesuffix>\t{%2, %0|%0, %2} - v<plusminus_mnemonic><ssemodesuffix>\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}" + v<plusminus_mnemonic><ssemodesuffix>\t{<round_mask_op3>%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2<round_mask_op3>}" [(set_attr "isa" "noavx,avx") (set_attr "type" "sseadd") (set_attr "prefix" "<mask_prefix3>") (set_attr "mode" "<MODE>")]) -(define_insn "<sse>_vm<plusminus_insn><mode>3" +(define_insn "<sse>_vm<plusminus_insn><mode>3<round_name>" [(set (match_operand:VF_128 0 "register_operand" "=x,v") (vec_merge:VF_128 (plusminus:VF_128 (match_operand:VF_128 1 "register_operand" "0,v") - (match_operand:VF_128 2 "nonimmediate_operand" "xm,vm")) + (match_operand:VF_128 2 "nonimmediate_operand" "xm,<round_constraint>")) (match_dup 1) (const_int 1)))] "TARGET_SSE" "@ <plusminus_mnemonic><ssescalarmodesuffix>\t{%2, %0|%0, %<iptr>2} - v<plusminus_mnemonic><ssescalarmodesuffix>\t{%2, %1, %0|%0, %1, %<iptr>2}" + v<plusminus_mnemonic><ssescalarmodesuffix>\t{<round_op3>%2, %1, %0|%0, %1, %<iptr>2<round_op3>}" [(set_attr "isa" "noavx,avx") (set_attr "type" "sseadd") - (set_attr "prefix" "orig,vex") + (set_attr "prefix" "<round_prefix>") (set_attr "mode" "<ssescalarmode>")]) -(define_expand "mul<mode>3<mask_name>" +(define_expand "mul<mode>3<mask_name><round_name>" [(set (match_operand:VF 0 "register_operand") (mult:VF - (match_operand:VF 1 "nonimmediate_operand") - (match_operand:VF 2 "nonimmediate_operand")))] - "TARGET_SSE && <mask_mode512bit_condition>" + (match_operand:VF 1 "<round_nimm_predicate>") + (match_operand:VF 2 "<round_nimm_predicate>")))] + "TARGET_SSE && <mask_mode512bit_condition> && <round_mode512bit_condition>" "ix86_fixup_binary_operands_no_copy (MULT, <MODE>mode, operands);") -(define_insn "*mul<mode>3<mask_name>" +(define_insn "*mul<mode>3<mask_name><round_name>" [(set (match_operand:VF 0 "register_operand" "=x,v") (mult:VF - (match_operand:VF 1 "nonimmediate_operand" "%0,v") - (match_operand:VF 2 "nonimmediate_operand" "xm,vm")))] - "TARGET_SSE && ix86_binary_operator_ok (MULT, <MODE>mode, operands) && <mask_mode512bit_condition>" + (match_operand:VF 1 "<round_nimm_predicate>" "%0,v") + (match_operand:VF 2 "<round_nimm_predicate>" "xm,<round_constraint>")))] + "TARGET_SSE && ix86_binary_operator_ok (MULT, <MODE>mode, operands) && <mask_mode512bit_condition> && <round_mode512bit_condition>" "@ mul<ssemodesuffix>\t{%2, %0|%0, %2} - vmul<ssemodesuffix>\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}" + vmul<ssemodesuffix>\t{<round_mask_op3>%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2<round_mask_op3>}" [(set_attr "isa" "noavx,avx") (set_attr "type" "ssemul") (set_attr "prefix" "<mask_prefix3>") (set_attr "btver2_decode" "direct,double") (set_attr "mode" "<MODE>")]) -(define_insn "<sse>_vm<multdiv_mnemonic><mode>3" +(define_insn "<sse>_vm<multdiv_mnemonic><mode>3<round_name>" [(set (match_operand:VF_128 0 "register_operand" "=x,v") (vec_merge:VF_128 (multdiv:VF_128 (match_operand:VF_128 1 "register_operand" "0,v") - (match_operand:VF_128 2 "nonimmediate_operand" "xm,vm")) + (match_operand:VF_128 2 "nonimmediate_operand" "xm,<round_constraint>")) (match_dup 1) (const_int 1)))] "TARGET_SSE" "@ 
<multdiv_mnemonic><ssescalarmodesuffix>\t{%2, %0|%0, %<iptr>2} - v<multdiv_mnemonic><ssescalarmodesuffix>\t{%2, %1, %0|%0, %1, %<iptr>2}" + v<multdiv_mnemonic><ssescalarmodesuffix>\t{<round_op3>%2, %1, %0|%0, %1, %<iptr>2<round_op3>}" [(set_attr "isa" "noavx,avx") (set_attr "type" "sse<multdiv_mnemonic>") - (set_attr "prefix" "orig,vex") + (set_attr "prefix" "<round_prefix>") (set_attr "btver2_decode" "direct,double") (set_attr "mode" "<ssescalarmode>")]) @@ -1335,15 +1393,15 @@ } }) -(define_insn "<sse>_div<mode>3<mask_name>" +(define_insn "<sse>_div<mode>3<mask_name><round_name>" [(set (match_operand:VF 0 "register_operand" "=x,v") (div:VF (match_operand:VF 1 "register_operand" "0,v") - (match_operand:VF 2 "nonimmediate_operand" "xm,vm")))] - "TARGET_SSE && <mask_mode512bit_condition>" + (match_operand:VF 2 "<round_nimm_predicate>" "xm,<round_constraint>")))] + "TARGET_SSE && <mask_mode512bit_condition> && <round_mode512bit_condition>" "@ div<ssemodesuffix>\t{%2, %0|%0, %2} - vdiv<ssemodesuffix>\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}" + vdiv<ssemodesuffix>\t{<round_mask_op3>%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2<round_mask_op3>}" [(set_attr "isa" "noavx,avx") (set_attr "type" "ssediv") (set_attr "prefix" "<mask_prefix3>") @@ -1391,7 +1449,7 @@ (set_attr "prefix" "evex") (set_attr "mode" "<MODE>")]) -(define_insn "*srcp14<mode>" +(define_insn "srcp14<mode>" [(set (match_operand:VF_128 0 "register_operand" "=v") (vec_merge:VF_128 (unspec:VF_128 @@ -1401,7 +1459,7 @@ (match_dup 1) (const_int 1)))] "TARGET_AVX512F" - "vrcp14<ssescalarmodesuffix>\t{%2, %1, %0|%0, %1, %2}" + "vrcp14<ssescalarmodesuffix>\t{%2, %1, %0|, %1, %2}" [(set_attr "type" "sse") (set_attr "prefix" "evex") (set_attr "mode" "<MODE>")]) @@ -1427,32 +1485,32 @@ } }) -(define_insn "<sse>_sqrt<mode>2<mask_name>" +(define_insn "<sse>_sqrt<mode>2<mask_name><round_name>" [(set (match_operand:VF 0 "register_operand" "=v") - (sqrt:VF (match_operand:VF 1 "nonimmediate_operand" "vm")))] - "TARGET_SSE && <mask_mode512bit_condition>" - "%vsqrt<ssemodesuffix>\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}" + (sqrt:VF (match_operand:VF 1 "<round_nimm_predicate>" "<round_constraint>")))] + "TARGET_SSE && <mask_mode512bit_condition> && <round_mode512bit_condition>" + "%vsqrt<ssemodesuffix>\t{<round_mask_op2>%1, %0<mask_operand2>|%0<mask_operand2>, %1<round_mask_op2>}" [(set_attr "type" "sse") (set_attr "atom_sse_attr" "sqrt") (set_attr "btver2_sse_attr" "sqrt") (set_attr "prefix" "maybe_vex") (set_attr "mode" "<MODE>")]) -(define_insn "<sse>_vmsqrt<mode>2" +(define_insn "<sse>_vmsqrt<mode>2<round_name>" [(set (match_operand:VF_128 0 "register_operand" "=x,v") (vec_merge:VF_128 (sqrt:VF_128 - (match_operand:VF_128 1 "nonimmediate_operand" "xm,vm")) + (match_operand:VF_128 1 "nonimmediate_operand" "xm,<round_constraint>")) (match_operand:VF_128 2 "register_operand" "0,v") (const_int 1)))] "TARGET_SSE" "@ sqrt<ssescalarmodesuffix>\t{%1, %0|%0, %<iptr>1} - vsqrt<ssescalarmodesuffix>\t{%1, %2, %0|%0, %2, %<iptr>1}" + vsqrt<ssescalarmodesuffix>\t{<round_op3>%1, %2, %0|%0, %2, %<iptr>1<round_op3>}" [(set_attr "isa" "noavx,avx") (set_attr "type" "sse") (set_attr "atom_sse_attr" "sqrt") - (set_attr "prefix" "orig,vex") + (set_attr "prefix" "<round_prefix>") (set_attr "btver2_sse_attr" "sqrt") (set_attr "mode" "<ssescalarmode>")]) @@ -1487,7 +1545,7 @@ (set_attr "prefix" "evex") (set_attr "mode" "<MODE>")]) -(define_insn "*rsqrt14<mode>" +(define_insn "rsqrt14<mode>" [(set (match_operand:VF_128 0 "register_operand" "=v") 
(vec_merge:VF_128 (unspec:VF_128 @@ -1523,67 +1581,67 @@ ;; isn't really correct, as those rtl operators aren't defined when ;; applied to NaNs. Hopefully the optimizers won't get too smart on us. -(define_expand "<code><mode>3<mask_name>" +(define_expand "<code><mode>3<mask_name><round_saeonly_name>" [(set (match_operand:VF 0 "register_operand") (smaxmin:VF - (match_operand:VF 1 "nonimmediate_operand") - (match_operand:VF 2 "nonimmediate_operand")))] - "TARGET_SSE && <mask_mode512bit_condition>" + (match_operand:VF 1 "<round_saeonly_nimm_predicate>") + (match_operand:VF 2 "<round_saeonly_nimm_predicate>")))] + "TARGET_SSE && <mask_mode512bit_condition> && <round_saeonly_mode512bit_condition>" { if (!flag_finite_math_only) operands[1] = force_reg (<MODE>mode, operands[1]); ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands); }) -(define_insn "*<code><mode>3_finite<mask_name>" +(define_insn "*<code><mode>3_finite<mask_name><round_saeonly_name>" [(set (match_operand:VF 0 "register_operand" "=x,v") (smaxmin:VF - (match_operand:VF 1 "nonimmediate_operand" "%0,v") - (match_operand:VF 2 "nonimmediate_operand" "xm,vm")))] + (match_operand:VF 1 "<round_saeonly_nimm_predicate>" "%0,v") + (match_operand:VF 2 "<round_saeonly_nimm_predicate>" "xm,<round_saeonly_constraint>")))] "TARGET_SSE && flag_finite_math_only && ix86_binary_operator_ok (<CODE>, <MODE>mode, operands) - && <mask_mode512bit_condition>" + && <mask_mode512bit_condition> && <round_saeonly_mode512bit_condition>" "@ <maxmin_float><ssemodesuffix>\t{%2, %0|%0, %2} - v<maxmin_float><ssemodesuffix>\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}" + v<maxmin_float><ssemodesuffix>\t{<round_saeonly_mask_op3>%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2<round_saeonly_mask_op3>}" [(set_attr "isa" "noavx,avx") (set_attr "type" "sseadd") (set_attr "btver2_sse_attr" "maxmin") (set_attr "prefix" "<mask_prefix3>") (set_attr "mode" "<MODE>")]) -(define_insn "*<code><mode>3<mask_name>" +(define_insn "*<code><mode>3<mask_name><round_saeonly_name>" [(set (match_operand:VF 0 "register_operand" "=x,v") (smaxmin:VF (match_operand:VF 1 "register_operand" "0,v") - (match_operand:VF 2 "nonimmediate_operand" "xm,vm")))] + (match_operand:VF 2 "<round_saeonly_nimm_predicate>" "xm,<round_saeonly_constraint>")))] "TARGET_SSE && !flag_finite_math_only - && <mask_mode512bit_condition>" + && <mask_mode512bit_condition> && <round_saeonly_mode512bit_condition>" "@ <maxmin_float><ssemodesuffix>\t{%2, %0|%0, %2} - v<maxmin_float><ssemodesuffix>\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}" + v<maxmin_float><ssemodesuffix>\t{<round_saeonly_mask_op3>%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2<round_saeonly_mask_op3>}" [(set_attr "isa" "noavx,avx") (set_attr "type" "sseadd") (set_attr "btver2_sse_attr" "maxmin") (set_attr "prefix" "<mask_prefix3>") (set_attr "mode" "<MODE>")]) -(define_insn "<sse>_vm<code><mode>3" +(define_insn "<sse>_vm<code><mode>3<round_saeonly_name>" [(set (match_operand:VF_128 0 "register_operand" "=x,v") (vec_merge:VF_128 (smaxmin:VF_128 (match_operand:VF_128 1 "register_operand" "0,v") - (match_operand:VF_128 2 "nonimmediate_operand" "xm,vm")) + (match_operand:VF_128 2 "nonimmediate_operand" "xm,<round_saeonly_constraint>")) (match_dup 1) (const_int 1)))] "TARGET_SSE" "@ <maxmin_float><ssescalarmodesuffix>\t{%2, %0|%0, %<iptr>2} - v<maxmin_float><ssescalarmodesuffix>\t{%2, %1, %0|%0, %1, %<iptr>2}" + v<maxmin_float><ssescalarmodesuffix>\t{<round_saeonly_op3>%2, %1, %0|%0, %1, %<iptr>2<round_saeonly_op3>}" 
[(set_attr "isa" "noavx,avx") (set_attr "type" "sse") (set_attr "btver2_sse_attr" "maxmin") - (set_attr "prefix" "orig,vex") + (set_attr "prefix" "<round_saeonly_prefix>") (set_attr "mode" "<ssescalarmode>")]) ;; These versions of the min/max patterns implement exactly the operations @@ -2099,21 +2157,21 @@ [(V16SF "const_0_to_31_operand") (V8DF "const_0_to_31_operand") (V16SI "const_0_to_7_operand") (V8DI "const_0_to_7_operand")]) -(define_insn "avx512f_cmp<mode>3" +(define_insn "avx512f_cmp<mode>3<mask_scalar_merge_name><round_saeonly_name>" [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k") (unspec:<avx512fmaskmode> [(match_operand:VI48F_512 1 "register_operand" "v") - (match_operand:VI48F_512 2 "nonimmediate_operand" "vm") + (match_operand:VI48F_512 2 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>") (match_operand:SI 3 "<cmp_imm_predicate>" "n")] UNSPEC_PCMP))] - "TARGET_AVX512F" - "v<sseintprefix>cmp<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}" + "TARGET_AVX512F && <round_saeonly_mode512bit_condition>" + "v<sseintprefix>cmp<ssemodesuffix>\t{%3, <round_saeonly_mask_scalar_merge_op4>%2, %1, %0<mask_scalar_merge_operand4>|%0<mask_scalar_merge_operand4>, %1, %2<round_saeonly_mask_scalar_merge_op4>, %3}" [(set_attr "type" "ssecmp") (set_attr "length_immediate" "1") (set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) -(define_insn "avx512f_ucmp<mode>3" +(define_insn "avx512f_ucmp<mode>3<mask_scalar_merge_name>" [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k") (unspec:<avx512fmaskmode> [(match_operand:VI48_512 1 "register_operand" "v") @@ -2121,41 +2179,41 @@ (match_operand:SI 3 "const_0_to_7_operand" "n")] UNSPEC_UNSIGNED_PCMP))] "TARGET_AVX512F" - "vpcmpu<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}" + "vpcmpu<ssemodesuffix>\t{%3, %2, %1, %0<mask_scalar_merge_operand4>|%0<mask_scalar_merge_operand4>, %1, %2, %3}" [(set_attr "type" "ssecmp") (set_attr "length_immediate" "1") (set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) -(define_insn "avx512f_vmcmp<mode>3" +(define_insn "avx512f_vmcmp<mode>3<round_saeonly_name>" [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k") (and:<avx512fmaskmode> (unspec:<avx512fmaskmode> [(match_operand:VF_128 1 "register_operand" "v") - (match_operand:VF_128 2 "nonimmediate_operand" "vm") + (match_operand:VF_128 2 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>") (match_operand:SI 3 "const_0_to_31_operand" "n")] UNSPEC_PCMP) (const_int 1)))] "TARGET_AVX512F" - "vcmp<ssescalarmodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}" + "vcmp<ssescalarmodesuffix>\t{%3, <round_saeonly_op4>%2, %1, %0|%0, %1, %2<round_saeonly_op4>, %3}" [(set_attr "type" "ssecmp") (set_attr "length_immediate" "1") (set_attr "prefix" "evex") (set_attr "mode" "<ssescalarmode>")]) -(define_insn "avx512f_vmcmp<mode>3_mask" +(define_insn "avx512f_vmcmp<mode>3_mask<round_saeonly_name>" [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k") (and:<avx512fmaskmode> (unspec:<avx512fmaskmode> [(match_operand:VF_128 1 "register_operand" "v") - (match_operand:VF_128 2 "nonimmediate_operand" "vm") + (match_operand:VF_128 2 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>") (match_operand:SI 3 "const_0_to_31_operand" "n")] UNSPEC_PCMP) (and:<avx512fmaskmode> (match_operand:<avx512fmaskmode> 4 "register_operand" "k") (const_int 1))))] "TARGET_AVX512F" - "vcmp<ssescalarmodesuffix>\t{%3, %2, %1, %0%{%4%}|%0%{%4%}, %1, %2, %3}" + "vcmp<ssescalarmodesuffix>\t{%3, <round_saeonly_op5>%2, %1, 
%0%{%4%}|%0%{%4%}, %1, %2<round_saeonly_op5>, %3}" [(set_attr "type" "ssecmp") (set_attr "length_immediate" "1") (set_attr "prefix" "evex") @@ -2173,17 +2231,17 @@ (set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) -(define_insn "<sse>_comi" +(define_insn "<sse>_comi<round_saeonly_name>" [(set (reg:CCFP FLAGS_REG) (compare:CCFP (vec_select:MODEF (match_operand:<ssevecmode> 0 "register_operand" "v") (parallel [(const_int 0)])) (vec_select:MODEF - (match_operand:<ssevecmode> 1 "nonimmediate_operand" "vm") + (match_operand:<ssevecmode> 1 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>") (parallel [(const_int 0)]))))] "SSE_FLOAT_MODE_P (<MODE>mode)" - "%vcomi<ssemodesuffix>\t{%1, %0|%0, %<iptr>1}" + "%vcomi<ssemodesuffix>\t{<round_saeonly_op2>%1, %0|%0, %<iptr>1<round_saeonly_op2>}" [(set_attr "type" "ssecomi") (set_attr "prefix" "maybe_vex") (set_attr "prefix_rep" "0") @@ -2193,17 +2251,17 @@ (const_string "0"))) (set_attr "mode" "<MODE>")]) -(define_insn "<sse>_ucomi" +(define_insn "<sse>_ucomi<round_saeonly_name>" [(set (reg:CCFPU FLAGS_REG) (compare:CCFPU (vec_select:MODEF (match_operand:<ssevecmode> 0 "register_operand" "v") (parallel [(const_int 0)])) (vec_select:MODEF - (match_operand:<ssevecmode> 1 "nonimmediate_operand" "vm") + (match_operand:<ssevecmode> 1 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>") (parallel [(const_int 0)]))))] "SSE_FLOAT_MODE_P (<MODE>mode)" - "%vucomi<ssemodesuffix>\t{%1, %0|%0, %<iptr>1}" + "%vucomi<ssemodesuffix>\t{<round_saeonly_op2>%1, %0|%0, %<iptr>1<round_saeonly_op2>}" [(set_attr "type" "ssecomi") (set_attr "prefix" "maybe_vex") (set_attr "prefix_rep" "0") @@ -2305,7 +2363,7 @@ } /* There is no vandnp[sd]. Use vpandnq. */ - if (GET_MODE_SIZE (<MODE>mode) == 64) + if (<MODE_SIZE> == 64) { suffix = "q"; ops = "vpandn%s\t{%%2, %%1, %%0|%%0, %%1, %%2}"; @@ -2377,7 +2435,7 @@ } /* There is no v<logic>p[sd]. Use vp<logic>q. 
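Most of the surrounding sse.md changes thread the <mask_name>, <sd_maskz_name> and <round_*> substitutions through the existing compare, comi and FMA patterns, so each instruction gains write-masked, zero-masked and embedded-rounding variants (the ..._mask3 FMA patterns below keep the addend as the pass-through operand). A hedged C-level sketch of the kind of code these forms serve, assuming the AVX-512F intrinsics _mm512_cmp_ps_mask and _mm512_mask3_fmadd_ps from <immintrin.h>:

#include <immintrin.h>

/* For lanes where a != b, accumulate acc = a * b + acc; masked-off lanes
   keep the old acc (merge masking with the addend as pass-through, i.e.
   the mask3 form).  Predicate 4 (_CMP_NEQ_UQ) is the same not-equal
   immediate used by the masked compare in ix86_emit_swsqrtsf earlier.  */
__attribute__ ((target ("avx512f")))
__m512 masked_accumulate (__m512 acc, __m512 a, __m512 b)
{
  __mmask16 m = _mm512_cmp_ps_mask (a, b, _CMP_NEQ_UQ);
  return _mm512_mask3_fmadd_ps (a, b, acc, m);
}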
*/ - if (GET_MODE_SIZE (<MODE>mode) == 64) + if (<MODE_SIZE> == 64) { suffix = "q"; ops = "vp<logic>%s\t{%%2, %%1, %%0|%%0, %%1, %%2}"; @@ -2698,210 +2756,224 @@ (match_operand:FMAMODE 3 "nonimmediate_operand")))] "") -(define_insn "*fma_fmadd_<mode>" +(define_expand "avx512f_fmadd_<mode>_maskz<round_expand_name>" + [(match_operand:VF_512 0 "register_operand") + (match_operand:VF_512 1 "<round_expand_nimm_predicate>") + (match_operand:VF_512 2 "<round_expand_nimm_predicate>") + (match_operand:VF_512 3 "<round_expand_nimm_predicate>") + (match_operand:<avx512fmaskmode> 4 "register_operand")] + "TARGET_AVX512F" +{ + emit_insn (gen_fma_fmadd_<mode>_maskz_1<round_expand_name> ( + operands[0], operands[1], operands[2], operands[3], + CONST0_RTX (<MODE>mode), operands[4]<round_expand_operand>)); + DONE; +}) + +(define_insn "<sd_mask_codefor>fma_fmadd_<mode><sd_maskz_name><round_name>" [(set (match_operand:FMAMODE 0 "register_operand" "=v,v,v,x,x") (fma:FMAMODE - (match_operand:FMAMODE 1 "nonimmediate_operand" "%0, 0, v, x,x") - (match_operand:FMAMODE 2 "nonimmediate_operand" "vm, v,vm, x,m") - (match_operand:FMAMODE 3 "nonimmediate_operand" " v,vm, 0,xm,x")))] - "" - "@ - vfmadd132<ssemodesuffix>\t{%2, %3, %0|%0, %3, %2} - vfmadd213<ssemodesuffix>\t{%3, %2, %0|%0, %2, %3} - vfmadd231<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2} + (match_operand:FMAMODE 1 "<round_nimm_predicate>" "%0,0,v,x,x") + (match_operand:FMAMODE 2 "<round_nimm_predicate>" "<round_constraint>,v,<round_constraint>,x,m") + (match_operand:FMAMODE 3 "<round_nimm_predicate>" "v,<round_constraint>,0,xm,x")))] + "<sd_mask_mode512bit_condition> && <round_mode512bit_condition>" + "@ + vfmadd132<ssemodesuffix>\t{<round_sd_mask_op4>%2, %3, %0<sd_mask_op4>|%0<sd_mask_op4>, %3, %2<round_sd_mask_op4>} + vfmadd213<ssemodesuffix>\t{<round_sd_mask_op4>%3, %2, %0<sd_mask_op4>|%0<sd_mask_op4>, %2, %3<round_sd_mask_op4>} + vfmadd231<ssemodesuffix>\t{<round_sd_mask_op4>%2, %1, %0<sd_mask_op4>|%0<sd_mask_op4>, %1, %2<round_sd_mask_op4>} vfmadd<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3} vfmadd<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}" [(set_attr "isa" "fma_avx512f,fma_avx512f,fma_avx512f,fma4,fma4") (set_attr "type" "ssemuladd") (set_attr "mode" "<MODE>")]) -(define_insn "avx512f_fmadd_<mode>_mask" +(define_insn "avx512f_fmadd_<mode>_mask<round_name>" [(set (match_operand:VF_512 0 "register_operand" "=v,v") (vec_merge:VF_512 (fma:VF_512 (match_operand:VF_512 1 "register_operand" "0,0") - (match_operand:VF_512 2 "nonimmediate_operand" "vm,v") - (match_operand:VF_512 3 "nonimmediate_operand" "v,vm")) + (match_operand:VF_512 2 "<round_nimm_predicate>" "<round_constraint>,v") + (match_operand:VF_512 3 "<round_nimm_predicate>" "v,<round_constraint>")) (match_dup 1) (match_operand:<avx512fmaskmode> 4 "register_operand" "k,k")))] "TARGET_AVX512F" "@ - vfmadd132<ssemodesuffix>\t{%2, %3, %0%{%4%}|%0%{%4%}, %3, %2} - vfmadd213<ssemodesuffix>\t{%3, %2, %0%{%4%}|%0%{%4%}, %2, %3}" + vfmadd132<ssemodesuffix>\t{<round_op5>%2, %3, %0%{%4%}|%0%{%4%}, %3, %2<round_op5>} + vfmadd213<ssemodesuffix>\t{<round_op5>%3, %2, %0%{%4%}|%0%{%4%}, %2, %3<round_op5>}" [(set_attr "isa" "fma_avx512f,fma_avx512f") (set_attr "type" "ssemuladd") (set_attr "mode" "<MODE>")]) -(define_insn "avx512f_fmadd_<mode>_mask3" +(define_insn "avx512f_fmadd_<mode>_mask3<round_name>" [(set (match_operand:VF_512 0 "register_operand" "=x") (vec_merge:VF_512 (fma:VF_512 (match_operand:VF_512 1 "register_operand" "x") - (match_operand:VF_512 2 "nonimmediate_operand" "vm") + 
(match_operand:VF_512 2 "<round_nimm_predicate>" "<round_constraint>") (match_operand:VF_512 3 "register_operand" "0")) (match_dup 3) (match_operand:<avx512fmaskmode> 4 "register_operand" "k")))] "TARGET_AVX512F" - "vfmadd231<ssemodesuffix>\t{%2, %1, %0%{%4%}|%0%{%4%}, %1, %2}" + "vfmadd231<ssemodesuffix>\t{<round_op5>%2, %1, %0%{%4%}|%0%{%4%}, %1, %2<round_op5>}" [(set_attr "isa" "fma_avx512f") (set_attr "type" "ssemuladd") (set_attr "mode" "<MODE>")]) -(define_insn "*fma_fmsub_<mode>" +(define_insn "<sd_mask_codefor>fma_fmsub_<mode><sd_maskz_name><round_name>" [(set (match_operand:FMAMODE 0 "register_operand" "=v,v,v,x,x") (fma:FMAMODE - (match_operand:FMAMODE 1 "nonimmediate_operand" "%0, 0, v, x,x") - (match_operand:FMAMODE 2 "nonimmediate_operand" "vm, v,vm, x,m") + (match_operand:FMAMODE 1 "<round_nimm_predicate>" "%0, 0, v, x,x") + (match_operand:FMAMODE 2 "<round_nimm_predicate>" "<round_constraint>,v,<round_constraint>,x,m") (neg:FMAMODE - (match_operand:FMAMODE 3 "nonimmediate_operand" " v,vm, 0,xm,x"))))] - "" + (match_operand:FMAMODE 3 "<round_nimm_predicate>" "v,<round_constraint>,0,xm,x"))))] + "<sd_mask_mode512bit_condition> && <round_mode512bit_condition>" "@ - vfmsub132<ssemodesuffix>\t{%2, %3, %0|%0, %3, %2} - vfmsub213<ssemodesuffix>\t{%3, %2, %0|%0, %2, %3} - vfmsub231<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2} + vfmsub132<ssemodesuffix>\t{<round_sd_mask_op4>%2, %3, %0<sd_mask_op4>|%0<sd_mask_op4>, %3, %2<round_sd_mask_op4>} + vfmsub213<ssemodesuffix>\t{<round_sd_mask_op4>%3, %2, %0<sd_mask_op4>|%0<sd_mask_op4>, %2, %3<round_sd_mask_op4>} + vfmsub231<ssemodesuffix>\t{<round_sd_mask_op4>%2, %1, %0<sd_mask_op4>|%0<sd_mask_op4>, %1, %2<round_sd_mask_op4>} vfmsub<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3} vfmsub<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}" [(set_attr "isa" "fma_avx512f,fma_avx512f,fma_avx512f,fma4,fma4") (set_attr "type" "ssemuladd") (set_attr "mode" "<MODE>")]) -(define_insn "avx512f_fmsub_<mode>_mask" +(define_insn "avx512f_fmsub_<mode>_mask<round_name>" [(set (match_operand:VF_512 0 "register_operand" "=v,v") (vec_merge:VF_512 (fma:VF_512 (match_operand:VF_512 1 "register_operand" "0,0") - (match_operand:VF_512 2 "nonimmediate_operand" "vm,v") + (match_operand:VF_512 2 "<round_nimm_predicate>" "<round_constraint>,v") (neg:VF_512 - (match_operand:VF_512 3 "nonimmediate_operand" "v,vm"))) + (match_operand:VF_512 3 "<round_nimm_predicate>" "v,<round_constraint>"))) (match_dup 1) (match_operand:<avx512fmaskmode> 4 "register_operand" "k,k")))] "TARGET_AVX512F" "@ - vfmsub132<ssemodesuffix>\t{%2, %3, %0%{%4%}|%0%{%4%}, %3, %2} - vfmsub213<ssemodesuffix>\t{%3, %2, %0%{%4%}|%0%{%4%}, %2, %3}" + vfmsub132<ssemodesuffix>\t{<round_op5>%2, %3, %0%{%4%}|%0%{%4%}, %3, %2<round_op5>} + vfmsub213<ssemodesuffix>\t{<round_op5>%3, %2, %0%{%4%}|%0%{%4%}, %2, %3<round_op5>}" [(set_attr "isa" "fma_avx512f,fma_avx512f") (set_attr "type" "ssemuladd") (set_attr "mode" "<MODE>")]) -(define_insn "avx512f_fmsub_<mode>_mask3" +(define_insn "avx512f_fmsub_<mode>_mask3<round_name>" [(set (match_operand:VF_512 0 "register_operand" "=v") (vec_merge:VF_512 (fma:VF_512 (match_operand:VF_512 1 "register_operand" "v") - (match_operand:VF_512 2 "nonimmediate_operand" "vm") + (match_operand:VF_512 2 "<round_nimm_predicate>" "<round_constraint>") (neg:VF_512 (match_operand:VF_512 3 "register_operand" "0"))) (match_dup 3) (match_operand:<avx512fmaskmode> 4 "register_operand" "k")))] "TARGET_AVX512F" - "vfmsub231<ssemodesuffix>\t{%2, %1, %0%{%4%}|%0%{%4%}, %1, %2}" + 
"vfmsub231<ssemodesuffix>\t{<round_op5>%2, %1, %0%{%4%}|%0%{%4%}, %1, %2<round_op5>}" [(set_attr "isa" "fma_avx512f") (set_attr "type" "ssemuladd") (set_attr "mode" "<MODE>")]) -(define_insn "*fma_fnmadd_<mode>" +(define_insn "<sd_mask_codefor>fma_fnmadd_<mode><sd_maskz_name><round_name>" [(set (match_operand:FMAMODE 0 "register_operand" "=v,v,v,x,x") (fma:FMAMODE (neg:FMAMODE - (match_operand:FMAMODE 1 "nonimmediate_operand" "%0, 0, v, x,x")) - (match_operand:FMAMODE 2 "nonimmediate_operand" "vm, v,vm, x,m") - (match_operand:FMAMODE 3 "nonimmediate_operand" " v,vm, 0,xm,x")))] - "" - "@ - vfnmadd132<ssemodesuffix>\t{%2, %3, %0|%0, %3, %2} - vfnmadd213<ssemodesuffix>\t{%3, %2, %0|%0, %2, %3} - vfnmadd231<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2} + (match_operand:FMAMODE 1 "<round_nimm_predicate>" "%0,0,v,x,x")) + (match_operand:FMAMODE 2 "<round_nimm_predicate>" "<round_constraint>,v,<round_constraint>,x,m") + (match_operand:FMAMODE 3 "<round_nimm_predicate>" "v,<round_constraint>,0,xm,x")))] + "<sd_mask_mode512bit_condition> && <round_mode512bit_condition>" + "@ + vfnmadd132<ssemodesuffix>\t{<round_sd_mask_op4>%2, %3, %0<sd_mask_op4>|%0<sd_mask_op4>, %3, %2<round_sd_mask_op4>} + vfnmadd213<ssemodesuffix>\t{<round_sd_mask_op4>%3, %2, %0<sd_mask_op4>|%0<sd_mask_op4>, %2, %3<round_sd_mask_op4>} + vfnmadd231<ssemodesuffix>\t{<round_sd_mask_op4>%2, %1, %0<sd_mask_op4>|%0<sd_mask_op4>, %1, %2<round_sd_mask_op4>} vfnmadd<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3} vfnmadd<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}" [(set_attr "isa" "fma_avx512f,fma_avx512f,fma_avx512f,fma4,fma4") (set_attr "type" "ssemuladd") (set_attr "mode" "<MODE>")]) -(define_insn "avx512f_fnmadd_<mode>_mask" +(define_insn "avx512f_fnmadd_<mode>_mask<round_name>" [(set (match_operand:VF_512 0 "register_operand" "=v,v") (vec_merge:VF_512 (fma:VF_512 (neg:VF_512 (match_operand:VF_512 1 "register_operand" "0,0")) - (match_operand:VF_512 2 "nonimmediate_operand" "vm,v") - (match_operand:VF_512 3 "nonimmediate_operand" "v,vm")) + (match_operand:VF_512 2 "<round_nimm_predicate>" "<round_constraint>,v") + (match_operand:VF_512 3 "<round_nimm_predicate>" "v,<round_constraint>")) (match_dup 1) (match_operand:<avx512fmaskmode> 4 "register_operand" "k,k")))] "TARGET_AVX512F" "@ - vfnmadd132<ssemodesuffix>\t{%2, %3, %0%{%4%}|%0%{%4%}, %3, %2} - vfnmadd213<ssemodesuffix>\t{%3, %2, %0%{%4%}|%0%{%4%}, %2, %3}" + vfnmadd132<ssemodesuffix>\t{<round_op5>%2, %3, %0%{%4%}|%0%{%4%}, %3, %2<round_op5>} + vfnmadd213<ssemodesuffix>\t{<round_op5>%3, %2, %0%{%4%}|%0%{%4%}, %2, %3<round_op5>}" [(set_attr "isa" "fma_avx512f,fma_avx512f") (set_attr "type" "ssemuladd") (set_attr "mode" "<MODE>")]) -(define_insn "avx512f_fnmadd_<mode>_mask3" +(define_insn "avx512f_fnmadd_<mode>_mask3<round_name>" [(set (match_operand:VF_512 0 "register_operand" "=v") (vec_merge:VF_512 (fma:VF_512 (neg:VF_512 (match_operand:VF_512 1 "register_operand" "v")) - (match_operand:VF_512 2 "nonimmediate_operand" "vm") + (match_operand:VF_512 2 "<round_nimm_predicate>" "<round_constraint>") (match_operand:VF_512 3 "register_operand" "0")) (match_dup 3) (match_operand:<avx512fmaskmode> 4 "register_operand" "k")))] "TARGET_AVX512F" - "vfnmadd231<ssemodesuffix>\t{%2, %1, %0%{%4%}|%0%{%4%}, %1, %2}" + "vfnmadd231<ssemodesuffix>\t{<round_op5>%2, %1, %0%{%4%}|%0%{%4%}, %1, %2<round_op5>}" [(set_attr "isa" "fma_avx512f") (set_attr "type" "ssemuladd") (set_attr "mode" "<MODE>")]) -(define_insn "*fma_fnmsub_<mode>" +(define_insn 
"<sd_mask_codefor>fma_fnmsub_<mode><sd_maskz_name><round_name>" [(set (match_operand:FMAMODE 0 "register_operand" "=v,v,v,x,x") (fma:FMAMODE (neg:FMAMODE - (match_operand:FMAMODE 1 "nonimmediate_operand" "%0, 0, v, x,x")) - (match_operand:FMAMODE 2 "nonimmediate_operand" "vm, v,vm, x,m") + (match_operand:FMAMODE 1 "<round_nimm_predicate>" "%0,0,v,x,x")) + (match_operand:FMAMODE 2 "<round_nimm_predicate>" "<round_constraint>,v,<round_constraint>,x,m") (neg:FMAMODE - (match_operand:FMAMODE 3 "nonimmediate_operand" " v,vm, 0,xm,x"))))] - "" + (match_operand:FMAMODE 3 "<round_nimm_predicate>" "v,<round_constraint>,0,xm,x"))))] + "<sd_mask_mode512bit_condition> && <round_mode512bit_condition>" "@ - vfnmsub132<ssemodesuffix>\t{%2, %3, %0|%0, %3, %2} - vfnmsub213<ssemodesuffix>\t{%3, %2, %0|%0, %2, %3} - vfnmsub231<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2} + vfnmsub132<ssemodesuffix>\t{<round_sd_mask_op4>%2, %3, %0<sd_mask_op4>|%0<sd_mask_op4>, %3, %2<round_sd_mask_op4>} + vfnmsub213<ssemodesuffix>\t{<round_sd_mask_op4>%3, %2, %0<sd_mask_op4>|%0<sd_mask_op4>, %2, %3<round_sd_mask_op4>} + vfnmsub231<ssemodesuffix>\t{<round_sd_mask_op4>%2, %1, %0<sd_mask_op4>|%0<sd_mask_op4>, %1, %2<round_sd_mask_op4>} vfnmsub<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3} vfnmsub<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}" [(set_attr "isa" "fma_avx512f,fma_avx512f,fma_avx512f,fma4,fma4") (set_attr "type" "ssemuladd") (set_attr "mode" "<MODE>")]) -(define_insn "avx512f_fnmsub_<mode>_mask" +(define_insn "avx512f_fnmsub_<mode>_mask<round_name>" [(set (match_operand:VF_512 0 "register_operand" "=v,v") (vec_merge:VF_512 (fma:VF_512 (neg:VF_512 (match_operand:VF_512 1 "register_operand" "0,0")) - (match_operand:VF_512 2 "nonimmediate_operand" "vm,v") + (match_operand:VF_512 2 "<round_nimm_predicate>" "<round_constraint>,v") (neg:VF_512 - (match_operand:VF_512 3 "nonimmediate_operand" "v,vm"))) + (match_operand:VF_512 3 "<round_nimm_predicate>" "v,<round_constraint>"))) (match_dup 1) (match_operand:<avx512fmaskmode> 4 "register_operand" "k,k")))] "TARGET_AVX512F" "@ - vfnmsub132<ssemodesuffix>\t{%2, %3, %0%{%4%}|%0%{%4%}, %3, %2} - vfnmsub213<ssemodesuffix>\t{%3, %2, %0%{%4%}|%0%{%4%}, %2, %3}" + vfnmsub132<ssemodesuffix>\t{<round_op5>%2, %3, %0%{%4%}|%0%{%4%}, %3, %2<round_op5>} + vfnmsub213<ssemodesuffix>\t{<round_op5>%3, %2, %0%{%4%}|%0%{%4%}, %2, %3<round_op5>}" [(set_attr "isa" "fma_avx512f,fma_avx512f") (set_attr "type" "ssemuladd") (set_attr "mode" "<MODE>")]) -(define_insn "avx512f_fnmsub_<mode>_mask3" +(define_insn "avx512f_fnmsub_<mode>_mask3<round_name>" [(set (match_operand:VF_512 0 "register_operand" "=v") (vec_merge:VF_512 (fma:VF_512 (neg:VF_512 (match_operand:VF_512 1 "register_operand" "v")) - (match_operand:VF_512 2 "nonimmediate_operand" "vm") + (match_operand:VF_512 2 "<round_nimm_predicate>" "<round_constraint>") (neg:VF_512 (match_operand:VF_512 3 "register_operand" "0"))) (match_dup 3) (match_operand:<avx512fmaskmode> 4 "register_operand" "k")))] "TARGET_AVX512F" - "vfnmsub231<ssemodesuffix>\t{%2, %1, %0%{%4%}|%0%{%4%}, %1, %2}" + "vfnmsub231<ssemodesuffix>\t{<round_op5>%2, %1, %0%{%4%}|%0%{%4%}, %1, %2<round_op5>}" [(set_attr "isa" "fma_avx512f") (set_attr "type" "ssemuladd") (set_attr "mode" "<MODE>")]) @@ -2926,109 +2998,123 @@ UNSPEC_FMADDSUB))] "TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F") -(define_insn "*fma_fmaddsub_<mode>" +(define_expand "avx512f_fmaddsub_<mode>_maskz<round_expand_name>" + [(match_operand:VF_512 0 "register_operand") + (match_operand:VF_512 1 
"<round_expand_nimm_predicate>") + (match_operand:VF_512 2 "<round_expand_nimm_predicate>") + (match_operand:VF_512 3 "<round_expand_nimm_predicate>") + (match_operand:<avx512fmaskmode> 4 "register_operand")] + "TARGET_AVX512F" +{ + emit_insn (gen_fma_fmaddsub_<mode>_maskz_1<round_expand_name> ( + operands[0], operands[1], operands[2], operands[3], + CONST0_RTX (<MODE>mode), operands[4]<round_expand_operand>)); + DONE; +}) + +(define_insn "<sd_mask_codefor>fma_fmaddsub_<mode><sd_maskz_name><round_name>" [(set (match_operand:VF 0 "register_operand" "=v,v,v,x,x") (unspec:VF - [(match_operand:VF 1 "nonimmediate_operand" "%0, 0, v, x,x") - (match_operand:VF 2 "nonimmediate_operand" "vm, v,vm, x,m") - (match_operand:VF 3 "nonimmediate_operand" " v,vm, 0,xm,x")] + [(match_operand:VF 1 "<round_nimm_predicate>" "%0,0,v,x,x") + (match_operand:VF 2 "<round_nimm_predicate>" "<round_constraint>,v,<round_constraint>,x,m") + (match_operand:VF 3 "<round_nimm_predicate>" "v,<round_constraint>,0,xm,x")] UNSPEC_FMADDSUB))] - "(TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F)" + "(TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F) && <sd_mask_mode512bit_condition> && <round_mode512bit_condition>" "@ - vfmaddsub132<ssemodesuffix>\t{%2, %3, %0|%0, %3, %2} - vfmaddsub213<ssemodesuffix>\t{%3, %2, %0|%0, %2, %3} - vfmaddsub231<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2} + vfmaddsub132<ssemodesuffix>\t{<round_sd_mask_op4>%2, %3, %0<sd_mask_op4>|%0<sd_mask_op4>, %3, %2<round_sd_mask_op4>} + vfmaddsub213<ssemodesuffix>\t{<round_sd_mask_op4>%3, %2, %0<sd_mask_op4>|%0<sd_mask_op4>, %2, %3<round_sd_mask_op4>} + vfmaddsub231<ssemodesuffix>\t{<round_sd_mask_op4>%2, %1, %0<sd_mask_op4>|%0<sd_mask_op4>, %1, %2<round_sd_mask_op4>} vfmaddsub<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3} vfmaddsub<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}" [(set_attr "isa" "fma_avx512f,fma_avx512f,fma_avx512f,fma4,fma4") (set_attr "type" "ssemuladd") (set_attr "mode" "<MODE>")]) -(define_insn "avx512f_fmaddsub_<mode>_mask" +(define_insn "avx512f_fmaddsub_<mode>_mask<round_name>" [(set (match_operand:VF_512 0 "register_operand" "=v,v") (vec_merge:VF_512 (unspec:VF_512 [(match_operand:VF_512 1 "register_operand" "0,0") - (match_operand:VF_512 2 "nonimmediate_operand" "vm,v") - (match_operand:VF_512 3 "nonimmediate_operand" "v,vm")] + (match_operand:VF_512 2 "<round_nimm_predicate>" "<round_constraint>,v") + (match_operand:VF_512 3 "<round_nimm_predicate>" "v,<round_constraint>")] UNSPEC_FMADDSUB) (match_dup 1) (match_operand:<avx512fmaskmode> 4 "register_operand" "k,k")))] "TARGET_AVX512F" "@ - vfmaddsub132<ssemodesuffix>\t{%2, %3, %0%{%4%}|%0%{%4%}, %3, %2} - vfmaddsub213<ssemodesuffix>\t{%3, %2, %0%{%4%}|%0%{%4%}, %2, %3}" + vfmaddsub132<ssemodesuffix>\t{<round_op5>%2, %3, %0%{%4%}|%0%{%4%}, %3, %2<round_op5>} + vfmaddsub213<ssemodesuffix>\t{<round_op5>%3, %2, %0%{%4%}|%0%{%4%}, %2, %3<round_op5>}" [(set_attr "isa" "fma_avx512f,fma_avx512f") (set_attr "type" "ssemuladd") (set_attr "mode" "<MODE>")]) -(define_insn "avx512f_fmaddsub_<mode>_mask3" +(define_insn "avx512f_fmaddsub_<mode>_mask3<round_name>" [(set (match_operand:VF_512 0 "register_operand" "=v") (vec_merge:VF_512 (unspec:VF_512 [(match_operand:VF_512 1 "register_operand" "v") - (match_operand:VF_512 2 "nonimmediate_operand" "vm") + (match_operand:VF_512 2 "<round_nimm_predicate>" "<round_constraint>") (match_operand:VF_512 3 "register_operand" "0")] UNSPEC_FMADDSUB) (match_dup 3) (match_operand:<avx512fmaskmode> 4 "register_operand" "k")))] "TARGET_AVX512F" - 
"vfmaddsub231<ssemodesuffix>\t{%2, %1, %0%{%4%}|%0%{%4%}, %1, %2}" + "vfmaddsub231<ssemodesuffix>\t{<round_op5>%2, %1, %0%{%4%}|%0%{%4%}, %1, %2<round_op5>}" [(set_attr "isa" "fma_avx512f") (set_attr "type" "ssemuladd") (set_attr "mode" "<MODE>")]) -(define_insn "*fma_fmsubadd_<mode>" +(define_insn "<sd_mask_codefor>fma_fmsubadd_<mode><sd_maskz_name><round_name>" [(set (match_operand:VF 0 "register_operand" "=v,v,v,x,x") (unspec:VF - [(match_operand:VF 1 "nonimmediate_operand" "%0, 0, v, x,x") - (match_operand:VF 2 "nonimmediate_operand" "vm, v,vm, x,m") + [(match_operand:VF 1 "<round_nimm_predicate>" "%0,0,v,x,x") + (match_operand:VF 2 "<round_nimm_predicate>" "<round_constraint>,v,<round_constraint>,x,m") (neg:VF - (match_operand:VF 3 "nonimmediate_operand" " v,vm, 0,xm,x"))] + (match_operand:VF 3 "<round_nimm_predicate>" "v,<round_constraint>,0,xm,x"))] UNSPEC_FMADDSUB))] - "(TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F)" + "(TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F) && <sd_mask_mode512bit_condition> && <round_mode512bit_condition>" "@ - vfmsubadd132<ssemodesuffix>\t{%2, %3, %0|%0, %3, %2} - vfmsubadd213<ssemodesuffix>\t{%3, %2, %0|%0, %2, %3} - vfmsubadd231<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2} + vfmsubadd132<ssemodesuffix>\t{<round_sd_mask_op4>%2, %3, %0<sd_mask_op4>|%0<sd_mask_op4>, %3, %2<round_sd_mask_op4>} + vfmsubadd213<ssemodesuffix>\t{<round_sd_mask_op4>%3, %2, %0<sd_mask_op4>|%0<sd_mask_op4>, %2, %3<round_sd_mask_op4>} + vfmsubadd231<ssemodesuffix>\t{<round_sd_mask_op4>%2, %1, %0<sd_mask_op4>|%0<sd_mask_op4>, %1, %2<round_sd_mask_op4>} vfmsubadd<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3} vfmsubadd<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}" [(set_attr "isa" "fma_avx512f,fma_avx512f,fma_avx512f,fma4,fma4") (set_attr "type" "ssemuladd") (set_attr "mode" "<MODE>")]) -(define_insn "avx512f_fmsubadd_<mode>_mask" +(define_insn "avx512f_fmsubadd_<mode>_mask<round_name>" [(set (match_operand:VF_512 0 "register_operand" "=v,v") (vec_merge:VF_512 (unspec:VF_512 [(match_operand:VF_512 1 "register_operand" "0,0") - (match_operand:VF_512 2 "nonimmediate_operand" "vm,v") + (match_operand:VF_512 2 "<round_nimm_predicate>" "<round_constraint>,v") (neg:VF_512 - (match_operand:VF_512 3 "nonimmediate_operand" "v,vm"))] + (match_operand:VF_512 3 "<round_nimm_predicate>" "v,<round_constraint>"))] UNSPEC_FMADDSUB) (match_dup 1) (match_operand:<avx512fmaskmode> 4 "register_operand" "k,k")))] "TARGET_AVX512F" "@ - vfmsubadd132<ssemodesuffix>\t{%2, %3, %0%{%4%}|%0%{%4%}, %3, %2} - vfmsubadd213<ssemodesuffix>\t{%3, %2, %0%{%4%}|%0%{%4%}, %2, %3}" + vfmsubadd132<ssemodesuffix>\t{<round_op5>%2, %3, %0%{%4%}|%0%{%4%}, %3, %2<round_op5>} + vfmsubadd213<ssemodesuffix>\t{<round_op5>%3, %2, %0%{%4%}|%0%{%4%}, %2, %3<round_op5>}" [(set_attr "isa" "fma_avx512f,fma_avx512f") (set_attr "type" "ssemuladd") (set_attr "mode" "<MODE>")]) -(define_insn "avx512f_fmsubadd_<mode>_mask3" +(define_insn "avx512f_fmsubadd_<mode>_mask3<round_name>" [(set (match_operand:VF_512 0 "register_operand" "=v") (vec_merge:VF_512 (unspec:VF_512 [(match_operand:VF_512 1 "register_operand" "v") - (match_operand:VF_512 2 "nonimmediate_operand" "vm") + (match_operand:VF_512 2 "<round_nimm_predicate>" "<round_constraint>") (neg:VF_512 (match_operand:VF_512 3 "register_operand" "0"))] UNSPEC_FMADDSUB) (match_dup 3) (match_operand:<avx512fmaskmode> 4 "register_operand" "k")))] "TARGET_AVX512F" - "vfmsubadd231<ssemodesuffix>\t{%2, %1, %0%{%4%}|%0%{%4%}, %1, %2}" + "vfmsubadd231<ssemodesuffix>\t{<round_op5>%2, %1, 
%0%{%4%}|%0%{%4%}, %1, %2<round_op5>}" [(set_attr "isa" "fma_avx512f") (set_attr "type" "ssemuladd") (set_attr "mode" "<MODE>")]) @@ -3036,13 +3122,13 @@ ;; FMA3 floating point scalar intrinsics. These merge result with ;; high-order elements from the destination register. -(define_expand "fmai_vmfmadd_<mode>" +(define_expand "fmai_vmfmadd_<mode><round_name>" [(set (match_operand:VF_128 0 "register_operand") (vec_merge:VF_128 (fma:VF_128 - (match_operand:VF_128 1 "nonimmediate_operand") - (match_operand:VF_128 2 "nonimmediate_operand") - (match_operand:VF_128 3 "nonimmediate_operand")) + (match_operand:VF_128 1 "<round_nimm_predicate>") + (match_operand:VF_128 2 "<round_nimm_predicate>") + (match_operand:VF_128 3 "<round_nimm_predicate>")) (match_dup 1) (const_int 1)))] "TARGET_FMA") @@ -3051,15 +3137,15 @@ [(set (match_operand:VF_128 0 "register_operand" "=v,v") (vec_merge:VF_128 (fma:VF_128 - (match_operand:VF_128 1 "nonimmediate_operand" " 0, 0") - (match_operand:VF_128 2 "nonimmediate_operand" "vm, v") - (match_operand:VF_128 3 "nonimmediate_operand" " v,vm")) + (match_operand:VF_128 1 "<round_nimm_predicate>" " 0, 0") + (match_operand:VF_128 2 "<round_nimm_predicate>" "<round_constraint>, v") + (match_operand:VF_128 3 "<round_nimm_predicate>" " v,<round_constraint>")) (match_dup 1) (const_int 1)))] "TARGET_FMA || TARGET_AVX512F" "@ - vfmadd132<ssescalarmodesuffix>\t{%2, %3, %0|%0, %<iptr>3, %<iptr>2} - vfmadd213<ssescalarmodesuffix>\t{%3, %2, %0|%0, %<iptr>2, %<iptr>3}" + vfmadd132<ssescalarmodesuffix>\t{<round_op4>%2, %3, %0|%0, %<iptr>3, %<iptr>2<round_op4>} + vfmadd213<ssescalarmodesuffix>\t{<round_op4>%3, %2, %0|%0, %<iptr>2, %<iptr>3<round_op4>}" [(set_attr "type" "ssemuladd") (set_attr "mode" "<MODE>")]) @@ -3067,51 +3153,51 @@ [(set (match_operand:VF_128 0 "register_operand" "=v,v") (vec_merge:VF_128 (fma:VF_128 - (match_operand:VF_128 1 "nonimmediate_operand" " 0, 0") - (match_operand:VF_128 2 "nonimmediate_operand" "vm, v") + (match_operand:VF_128 1 "<round_nimm_predicate>" "0,0") + (match_operand:VF_128 2 "<round_nimm_predicate>" "<round_constraint>,v") (neg:VF_128 - (match_operand:VF_128 3 "nonimmediate_operand" " v,vm"))) + (match_operand:VF_128 3 "<round_nimm_predicate>" " v,<round_constraint>"))) (match_dup 1) (const_int 1)))] "TARGET_FMA || TARGET_AVX512F" "@ - vfmsub132<ssescalarmodesuffix>\t{%2, %3, %0|%0, %<iptr>3, %<iptr>2} - vfmsub213<ssescalarmodesuffix>\t{%3, %2, %0|%0, %<iptr>2, %<iptr>3}" + vfmsub132<ssescalarmodesuffix>\t{<round_op4>%2, %3, %0|%0, %<iptr>3, %<iptr>2<round_op4>} + vfmsub213<ssescalarmodesuffix>\t{<round_op4>%3, %2, %0|%0, %<iptr>2, %<iptr>3<round_op4>}" [(set_attr "type" "ssemuladd") (set_attr "mode" "<MODE>")]) -(define_insn "*fmai_fnmadd_<mode>" +(define_insn "*fmai_fnmadd_<mode><round_name>" [(set (match_operand:VF_128 0 "register_operand" "=v,v") (vec_merge:VF_128 (fma:VF_128 (neg:VF_128 - (match_operand:VF_128 2 "nonimmediate_operand" "vm, v")) - (match_operand:VF_128 1 "nonimmediate_operand" " 0, 0") - (match_operand:VF_128 3 "nonimmediate_operand" " v,vm")) + (match_operand:VF_128 2 "<round_nimm_predicate>" "<round_constraint>,v")) + (match_operand:VF_128 1 "<round_nimm_predicate>" "0,0") + (match_operand:VF_128 3 "<round_nimm_predicate>" "v,<round_constraint>")) (match_dup 1) (const_int 1)))] "TARGET_FMA || TARGET_AVX512F" "@ - vfnmadd132<ssescalarmodesuffix>\t{%2, %3, %0|%0, %<iptr>3, %<iptr>2} - vfnmadd213<ssescalarmodesuffix>\t{%3, %2, %0|%0, %<iptr>2, %<iptr>3}" + vfnmadd132<ssescalarmodesuffix>\t{<round_op4>%2, %3, %0|%0, %<iptr>3, 
%<iptr>2<round_op4>} + vfnmadd213<ssescalarmodesuffix>\t{<round_op4>%3, %2, %0|%0, %<iptr>2, %<iptr>3<round_op4>}" [(set_attr "type" "ssemuladd") (set_attr "mode" "<MODE>")]) -(define_insn "*fmai_fnmsub_<mode>" +(define_insn "*fmai_fnmsub_<mode><round_name>" [(set (match_operand:VF_128 0 "register_operand" "=v,v") (vec_merge:VF_128 (fma:VF_128 (neg:VF_128 - (match_operand:VF_128 2 "nonimmediate_operand" "vm, v")) - (match_operand:VF_128 1 "nonimmediate_operand" " 0, 0") + (match_operand:VF_128 2 "<round_nimm_predicate>" "<round_constraint>, v")) + (match_operand:VF_128 1 "<round_nimm_predicate>" " 0, 0") (neg:VF_128 - (match_operand:VF_128 3 "nonimmediate_operand" " v,vm"))) + (match_operand:VF_128 3 "<round_nimm_predicate>" " v,<round_constraint>"))) (match_dup 1) (const_int 1)))] "TARGET_FMA || TARGET_AVX512F" "@ - vfnmsub132<ssescalarmodesuffix>\t{%2, %3, %0|%0, %<iptr>3, %<iptr>2} - vfnmsub213<ssescalarmodesuffix>\t{%3, %2, %0|%0, %<iptr>2, %<iptr>3}" + vfnmsub132<ssescalarmodesuffix>\t{<round_op4>%2, %3, %0|%0, %<iptr>3, %<iptr>2<round_op4>} + vfnmsub213<ssescalarmodesuffix>\t{<round_op4>%3, %2, %0|%0, %<iptr>2, %<iptr>3<round_op4>}" [(set_attr "type" "ssemuladd") (set_attr "mode" "<MODE>")]) @@ -3232,18 +3318,18 @@ (set_attr "prefix_rep" "0") (set_attr "mode" "SF")]) -(define_insn "sse_cvtsi2ss" +(define_insn "sse_cvtsi2ss<round_name>" [(set (match_operand:V4SF 0 "register_operand" "=x,x,v") (vec_merge:V4SF (vec_duplicate:V4SF - (float:SF (match_operand:SI 2 "nonimmediate_operand" "r,m,rm"))) + (float:SF (match_operand:SI 2 "<round_nimm_predicate>" "r,m,<round_constraint3>"))) (match_operand:V4SF 1 "register_operand" "0,0,v") (const_int 1)))] "TARGET_SSE" "@ cvtsi2ss\t{%2, %0|%0, %2} cvtsi2ss\t{%2, %0|%0, %2} - vcvtsi2ss\t{%2, %1, %0|%0, %1, %2}" + vcvtsi2ss\t{<round_op3>%2, %1, %0|%0, %1, %2<round_op3>}" [(set_attr "isa" "noavx,noavx,avx") (set_attr "type" "sseicvt") (set_attr "athlon_decode" "vector,double,*") @@ -3253,18 +3339,18 @@ (set_attr "prefix" "orig,orig,maybe_evex") (set_attr "mode" "SF")]) -(define_insn "sse_cvtsi2ssq" +(define_insn "sse_cvtsi2ssq<round_name>" [(set (match_operand:V4SF 0 "register_operand" "=x,x,v") (vec_merge:V4SF (vec_duplicate:V4SF - (float:SF (match_operand:DI 2 "nonimmediate_operand" "r,m,rm"))) + (float:SF (match_operand:DI 2 "<round_nimm_predicate>" "r,m,<round_constraint3>"))) (match_operand:V4SF 1 "register_operand" "0,0,v") (const_int 1)))] "TARGET_SSE && TARGET_64BIT" "@ cvtsi2ssq\t{%2, %0|%0, %2} cvtsi2ssq\t{%2, %0|%0, %2} - vcvtsi2ssq\t{%2, %1, %0|%0, %1, %2}" + vcvtsi2ssq\t{<round_op3>%2, %1, %0|%0, %1, %2<round_op3>}" [(set_attr "isa" "noavx,noavx,avx") (set_attr "type" "sseicvt") (set_attr "athlon_decode" "vector,double,*") @@ -3276,15 +3362,15 @@ (set_attr "prefix" "orig,orig,maybe_evex") (set_attr "mode" "SF")]) -(define_insn "sse_cvtss2si" +(define_insn "sse_cvtss2si<round_name>" [(set (match_operand:SI 0 "register_operand" "=r,r") (unspec:SI [(vec_select:SF - (match_operand:V4SF 1 "nonimmediate_operand" "v,m") + (match_operand:V4SF 1 "<round_nimm_predicate>" "v,<round_constraint2>") (parallel [(const_int 0)]))] UNSPEC_FIX_NOTRUNC))] "TARGET_SSE" - "%vcvtss2si\t{%1, %0|%0, %k1}" + "%vcvtss2si\t{<round_op2>%1, %0|%0, %k1<round_op2>}" [(set_attr "type" "sseicvt") (set_attr "athlon_decode" "double,vector") (set_attr "bdver1_decode" "double,double") @@ -3306,15 +3392,15 @@ (set_attr "prefix" "maybe_vex") (set_attr "mode" "SI")]) -(define_insn "sse_cvtss2siq" +(define_insn "sse_cvtss2siq<round_name>" [(set (match_operand:DI 0 
"register_operand" "=r,r") (unspec:DI [(vec_select:SF - (match_operand:V4SF 1 "nonimmediate_operand" "v,m") + (match_operand:V4SF 1 "<round_nimm_predicate>" "v,<round_constraint2>") (parallel [(const_int 0)]))] UNSPEC_FIX_NOTRUNC))] "TARGET_SSE && TARGET_64BIT" - "%vcvtss2si{q}\t{%1, %0|%0, %k1}" + "%vcvtss2si{q}\t{<round_op2>%1, %0|%0, %k1<round_op2>}" [(set_attr "type" "sseicvt") (set_attr "athlon_decode" "double,vector") (set_attr "bdver1_decode" "double,double") @@ -3336,14 +3422,14 @@ (set_attr "prefix" "maybe_vex") (set_attr "mode" "DI")]) -(define_insn "sse_cvttss2si" +(define_insn "sse_cvttss2si<round_saeonly_name>" [(set (match_operand:SI 0 "register_operand" "=r,r") (fix:SI (vec_select:SF - (match_operand:V4SF 1 "nonimmediate_operand" "v,m") + (match_operand:V4SF 1 "<round_saeonly_nimm_predicate>" "v,<round_saeonly_constraint2>") (parallel [(const_int 0)]))))] "TARGET_SSE" - "%vcvttss2si\t{%1, %0|%0, %k1}" + "%vcvttss2si\t{<round_saeonly_op2>%1, %0|%0, %k1<round_saeonly_op2>}" [(set_attr "type" "sseicvt") (set_attr "athlon_decode" "double,vector") (set_attr "amdfam10_decode" "double,double") @@ -3352,14 +3438,14 @@ (set_attr "prefix" "maybe_vex") (set_attr "mode" "SI")]) -(define_insn "sse_cvttss2siq" +(define_insn "sse_cvttss2siq<round_saeonly_name>" [(set (match_operand:DI 0 "register_operand" "=r,r") (fix:DI (vec_select:SF - (match_operand:V4SF 1 "nonimmediate_operand" "v,vm") + (match_operand:V4SF 1 "<round_saeonly_nimm_predicate>" "v,<round_saeonly_constraint>") (parallel [(const_int 0)]))))] "TARGET_SSE && TARGET_64BIT" - "%vcvttss2si{q}\t{%1, %0|%0, %k1}" + "%vcvttss2si{q}\t{<round_saeonly_op2>%1, %0|%0, %k1<round_saeonly_op2>}" [(set_attr "type" "sseicvt") (set_attr "athlon_decode" "double,vector") (set_attr "amdfam10_decode" "double,double") @@ -3368,50 +3454,50 @@ (set_attr "prefix" "maybe_vex") (set_attr "mode" "DI")]) -(define_insn "cvtusi2<ssescalarmodesuffix>32" +(define_insn "cvtusi2<ssescalarmodesuffix>32<round_name>" [(set (match_operand:VF_128 0 "register_operand" "=v") (vec_merge:VF_128 (vec_duplicate:VF_128 (unsigned_float:<ssescalarmode> - (match_operand:SI 2 "nonimmediate_operand" "rm"))) + (match_operand:SI 2 "<round_nimm_predicate>" "<round_constraint3>"))) (match_operand:VF_128 1 "register_operand" "v") (const_int 1)))] - "TARGET_AVX512F" - "vcvtusi2<ssescalarmodesuffix>\t{%2, %1, %0|%0, %1, %2}" + "TARGET_AVX512F && <round_modev4sf_condition>" + "vcvtusi2<ssescalarmodesuffix>\t{<round_op3>%2, %1, %0|%0, %1, %2<round_op3>}" [(set_attr "type" "sseicvt") (set_attr "prefix" "evex") (set_attr "mode" "<ssescalarmode>")]) -(define_insn "cvtusi2<ssescalarmodesuffix>64" +(define_insn "cvtusi2<ssescalarmodesuffix>64<round_name>" [(set (match_operand:VF_128 0 "register_operand" "=v") (vec_merge:VF_128 (vec_duplicate:VF_128 (unsigned_float:<ssescalarmode> - (match_operand:DI 2 "nonimmediate_operand" "rm"))) + (match_operand:DI 2 "<round_nimm_predicate>" "<round_constraint3>"))) (match_operand:VF_128 1 "register_operand" "v") (const_int 1)))] "TARGET_AVX512F && TARGET_64BIT" - "vcvtusi2<ssescalarmodesuffix>\t{%2, %1, %0|%0, %1, %2}" + "vcvtusi2<ssescalarmodesuffix>\t{<round_op3>%2, %1, %0|%0, %1, %2<round_op3>}" [(set_attr "type" "sseicvt") (set_attr "prefix" "evex") (set_attr "mode" "<ssescalarmode>")]) -(define_insn "float<sseintvecmodelower><mode>2<mask_name>" +(define_insn "float<sseintvecmodelower><mode>2<mask_name><round_name>" [(set (match_operand:VF1 0 "register_operand" "=v") (float:VF1 - (match_operand:<sseintvecmode> 1 "nonimmediate_operand" "vm")))] - 
"TARGET_SSE2 && <mask_mode512bit_condition>" - "%vcvtdq2ps\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}" + (match_operand:<sseintvecmode> 1 "<round_nimm_predicate>" "<round_constraint>")))] + "TARGET_SSE2 && <mask_mode512bit_condition> && <round_mode512bit_condition>" + "%vcvtdq2ps\t{<round_mask_op2>%1, %0<mask_operand2>|%0<mask_operand2>, %1<round_mask_op2>}" [(set_attr "type" "ssecvt") (set_attr "prefix" "maybe_vex") (set_attr "mode" "<sseinsnmode>")]) -(define_insn "ufloatv16siv16sf2<mask_name>" +(define_insn "ufloatv16siv16sf2<mask_name><round_name>" [(set (match_operand:V16SF 0 "register_operand" "=v") (unsigned_float:V16SF - (match_operand:V16SI 1 "nonimmediate_operand" "vm")))] + (match_operand:V16SI 1 "<round_nimm_predicate>" "<round_constraint>")))] "TARGET_AVX512F" - "vcvtudq2ps\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}" + "vcvtudq2ps\t{<round_mask_op2>%1, %0<mask_operand2>|%0<mask_operand2>, %1<round_mask_op2>}" [(set_attr "type" "ssecvt") (set_attr "prefix" "evex") (set_attr "mode" "V16SF")]) @@ -3421,7 +3507,11 @@ (match_operand:<sseintvecmode> 1 "register_operand")] "TARGET_SSE2 && (<MODE>mode == V4SFmode || TARGET_AVX2)" { - ix86_expand_vector_convert_uns_vsivsf (operands[0], operands[1]); + if (<MODE>mode == V16SFmode) + emit_insn (gen_ufloatv16siv16sf2 (operands[0], operands[1])); + else + ix86_expand_vector_convert_uns_vsivsf (operands[0], operands[1]); + DONE; }) @@ -3446,34 +3536,34 @@ (set_attr "prefix" "maybe_vex") (set_attr "mode" "<sseinsnmode>")]) -(define_insn "<mask_codefor>avx512f_fix_notruncv16sfv16si<mask_name>" +(define_insn "<mask_codefor>avx512f_fix_notruncv16sfv16si<mask_name><round_name>" [(set (match_operand:V16SI 0 "register_operand" "=v") (unspec:V16SI - [(match_operand:V16SF 1 "nonimmediate_operand" "vm")] + [(match_operand:V16SF 1 "<round_nimm_predicate>" "<round_constraint>")] UNSPEC_FIX_NOTRUNC))] "TARGET_AVX512F" - "vcvtps2dq\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}" + "vcvtps2dq\t{<round_mask_op2>%1, %0<mask_operand2>|%0<mask_operand2>, %1<round_mask_op2>}" [(set_attr "type" "ssecvt") (set_attr "prefix" "evex") (set_attr "mode" "XI")]) -(define_insn "<mask_codefor>avx512f_ufix_notruncv16sfv16si<mask_name>" +(define_insn "<mask_codefor>avx512f_ufix_notruncv16sfv16si<mask_name><round_name>" [(set (match_operand:V16SI 0 "register_operand" "=v") (unspec:V16SI - [(match_operand:V16SF 1 "nonimmediate_operand" "vm")] + [(match_operand:V16SF 1 "<round_nimm_predicate>" "<round_constraint>")] UNSPEC_UNSIGNED_FIX_NOTRUNC))] "TARGET_AVX512F" - "vcvtps2udq\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}" + "vcvtps2udq\t{<round_mask_op2>%1, %0<mask_operand2>|%0<mask_operand2>, %1<round_mask_op2>}" [(set_attr "type" "ssecvt") (set_attr "prefix" "evex") (set_attr "mode" "XI")]) -(define_insn "<fixsuffix>fix_truncv16sfv16si2<mask_name>" +(define_insn "<fixsuffix>fix_truncv16sfv16si2<mask_name><round_saeonly_name>" [(set (match_operand:V16SI 0 "register_operand" "=v") (any_fix:V16SI - (match_operand:V16SF 1 "nonimmediate_operand" "vm")))] + (match_operand:V16SF 1 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>")))] "TARGET_AVX512F" - "vcvttps2<fixsuffix>dq\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}" + "vcvttps2<fixsuffix>dq\t{<round_saeonly_mask_op2>%1, %0<mask_operand2>|%0<mask_operand2>, %1<round_saeonly_mask_op2>}" [(set_attr "type" "ssecvt") (set_attr "prefix" "evex") (set_attr "mode" "XI")]) @@ -3512,11 +3602,17 @@ (match_operand:VF1 1 "register_operand")] "TARGET_SSE2" { - rtx tmp[3]; - tmp[0] = ix86_expand_adjust_ufix_to_sfix_si 
(operands[1], &tmp[2]); - tmp[1] = gen_reg_rtx (<sseintvecmode>mode); - emit_insn (gen_fix_trunc<mode><sseintvecmodelower>2 (tmp[1], tmp[0])); - emit_insn (gen_xor<sseintvecmodelower>3 (operands[0], tmp[1], tmp[2])); + if (<MODE>mode == V16SFmode) + emit_insn (gen_ufix_truncv16sfv16si2 (operands[0], + operands[1])); + else + { + rtx tmp[3]; + tmp[0] = ix86_expand_adjust_ufix_to_sfix_si (operands[1], &tmp[2]); + tmp[1] = gen_reg_rtx (<sseintvecmode>mode); + emit_insn (gen_fix_trunc<mode><sseintvecmodelower>2 (tmp[1], tmp[0])); + emit_insn (gen_xor<sseintvecmodelower>3 (operands[0], tmp[1], tmp[2])); + } DONE; }) @@ -3581,18 +3677,18 @@ (set_attr "prefix" "orig,orig,vex") (set_attr "mode" "DF")]) -(define_insn "sse2_cvtsi2sdq" +(define_insn "sse2_cvtsi2sdq<round_name>" [(set (match_operand:V2DF 0 "register_operand" "=x,x,v") (vec_merge:V2DF (vec_duplicate:V2DF - (float:DF (match_operand:DI 2 "nonimmediate_operand" "r,m,rm"))) + (float:DF (match_operand:DI 2 "<round_nimm_predicate>" "r,m,<round_constraint3>"))) (match_operand:V2DF 1 "register_operand" "0,0,v") (const_int 1)))] "TARGET_SSE2 && TARGET_64BIT" "@ cvtsi2sdq\t{%2, %0|%0, %2} cvtsi2sdq\t{%2, %0|%0, %2} - vcvtsi2sdq\t{%2, %1, %0|%0, %1, %2}" + vcvtsi2sdq\t{<round_op3>%2, %1, %0|%0, %1, %2<round_op3>}" [(set_attr "isa" "noavx,noavx,avx") (set_attr "type" "sseicvt") (set_attr "athlon_decode" "double,direct,*") @@ -3603,115 +3699,115 @@ (set_attr "prefix" "orig,orig,maybe_evex") (set_attr "mode" "DF")]) -(define_insn "avx512f_vcvtss2usi" +(define_insn "avx512f_vcvtss2usi<round_name>" [(set (match_operand:SI 0 "register_operand" "=r") (unspec:SI [(vec_select:SF - (match_operand:V4SF 1 "nonimmediate_operand" "vm") + (match_operand:V4SF 1 "<round_nimm_predicate>" "<round_constraint>") (parallel [(const_int 0)]))] UNSPEC_UNSIGNED_FIX_NOTRUNC))] "TARGET_AVX512F" - "vcvtss2usi\t{%1, %0|%0, %1}" + "vcvtss2usi\t{<round_op2>%1, %0|%0, %1<round_op2>}" [(set_attr "type" "sseicvt") (set_attr "prefix" "evex") (set_attr "mode" "SI")]) -(define_insn "avx512f_vcvtss2usiq" +(define_insn "avx512f_vcvtss2usiq<round_name>" [(set (match_operand:DI 0 "register_operand" "=r") (unspec:DI [(vec_select:SF - (match_operand:V4SF 1 "nonimmediate_operand" "vm") + (match_operand:V4SF 1 "<round_nimm_predicate>" "<round_constraint>") (parallel [(const_int 0)]))] UNSPEC_UNSIGNED_FIX_NOTRUNC))] "TARGET_AVX512F && TARGET_64BIT" - "vcvtss2usi\t{%1, %0|%0, %1}" + "vcvtss2usi\t{<round_op2>%1, %0|%0, %1<round_op2>}" [(set_attr "type" "sseicvt") (set_attr "prefix" "evex") (set_attr "mode" "DI")]) -(define_insn "avx512f_vcvttss2usi" +(define_insn "avx512f_vcvttss2usi<round_saeonly_name>" [(set (match_operand:SI 0 "register_operand" "=r") (unsigned_fix:SI (vec_select:SF - (match_operand:V4SF 1 "nonimmediate_operand" "vm") + (match_operand:V4SF 1 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>") (parallel [(const_int 0)]))))] "TARGET_AVX512F" - "vcvttss2usi\t{%1, %0|%0, %1}" + "vcvttss2usi\t{<round_saeonly_op2>%1, %0|%0, %1<round_saeonly_op2>}" [(set_attr "type" "sseicvt") (set_attr "prefix" "evex") (set_attr "mode" "SI")]) -(define_insn "avx512f_vcvttss2usiq" +(define_insn "avx512f_vcvttss2usiq<round_saeonly_name>" [(set (match_operand:DI 0 "register_operand" "=r") (unsigned_fix:DI (vec_select:SF - (match_operand:V4SF 1 "nonimmediate_operand" "vm") + (match_operand:V4SF 1 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>") (parallel [(const_int 0)]))))] "TARGET_AVX512F && TARGET_64BIT" - "vcvttss2usi\t{%1, %0|%0, %1}" + 
"vcvttss2usi\t{<round_saeonly_op2>%1, %0|%0, %1<round_saeonly_op2>}" [(set_attr "type" "sseicvt") (set_attr "prefix" "evex") (set_attr "mode" "DI")]) -(define_insn "avx512f_vcvtsd2usi" +(define_insn "avx512f_vcvtsd2usi<round_name>" [(set (match_operand:SI 0 "register_operand" "=r") (unspec:SI [(vec_select:DF - (match_operand:V2DF 1 "nonimmediate_operand" "vm") + (match_operand:V2DF 1 "<round_nimm_predicate>" "<round_constraint>") (parallel [(const_int 0)]))] UNSPEC_UNSIGNED_FIX_NOTRUNC))] "TARGET_AVX512F" - "vcvtsd2usi\t{%1, %0|%0, %1}" + "vcvtsd2usi\t{<round_op2>%1, %0|%0, %1<round_op2>}" [(set_attr "type" "sseicvt") (set_attr "prefix" "evex") (set_attr "mode" "SI")]) -(define_insn "avx512f_vcvtsd2usiq" +(define_insn "avx512f_vcvtsd2usiq<round_name>" [(set (match_operand:DI 0 "register_operand" "=r") (unspec:DI [(vec_select:DF - (match_operand:V2DF 1 "nonimmediate_operand" "vm") + (match_operand:V2DF 1 "<round_nimm_predicate>" "<round_constraint>") (parallel [(const_int 0)]))] UNSPEC_UNSIGNED_FIX_NOTRUNC))] "TARGET_AVX512F && TARGET_64BIT" - "vcvtsd2usi\t{%1, %0|%0, %1}" + "vcvtsd2usi\t{<round_op2>%1, %0|%0, %1<round_op2>}" [(set_attr "type" "sseicvt") (set_attr "prefix" "evex") (set_attr "mode" "DI")]) -(define_insn "avx512f_vcvttsd2usi" +(define_insn "avx512f_vcvttsd2usi<round_saeonly_name>" [(set (match_operand:SI 0 "register_operand" "=r") (unsigned_fix:SI (vec_select:DF - (match_operand:V2DF 1 "nonimmediate_operand" "vm") + (match_operand:V2DF 1 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>") (parallel [(const_int 0)]))))] "TARGET_AVX512F" - "vcvttsd2usi\t{%1, %0|%0, %1}" + "vcvttsd2usi\t{<round_saeonly_op2>%1, %0|%0, %1<round_saeonly_op2>}" [(set_attr "type" "sseicvt") (set_attr "prefix" "evex") (set_attr "mode" "SI")]) -(define_insn "avx512f_vcvttsd2usiq" +(define_insn "avx512f_vcvttsd2usiq<round_saeonly_name>" [(set (match_operand:DI 0 "register_operand" "=r") (unsigned_fix:DI (vec_select:DF - (match_operand:V2DF 1 "nonimmediate_operand" "vm") + (match_operand:V2DF 1 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>") (parallel [(const_int 0)]))))] "TARGET_AVX512F && TARGET_64BIT" - "vcvttsd2usi\t{%1, %0|%0, %1}" + "vcvttsd2usi\t{<round_saeonly_op2>%1, %0|%0, %1<round_saeonly_op2>}" [(set_attr "type" "sseicvt") (set_attr "prefix" "evex") (set_attr "mode" "DI")]) -(define_insn "sse2_cvtsd2si" +(define_insn "sse2_cvtsd2si<round_name>" [(set (match_operand:SI 0 "register_operand" "=r,r") (unspec:SI [(vec_select:DF - (match_operand:V2DF 1 "nonimmediate_operand" "v,m") + (match_operand:V2DF 1 "<round_nimm_predicate>" "v,<round_constraint2>") (parallel [(const_int 0)]))] UNSPEC_FIX_NOTRUNC))] "TARGET_SSE2" - "%vcvtsd2si\t{%1, %0|%0, %q1}" + "%vcvtsd2si\t{<round_op2>%1, %0|%0, %q1<round_op2>}" [(set_attr "type" "sseicvt") (set_attr "athlon_decode" "double,vector") (set_attr "bdver1_decode" "double,double") @@ -3734,15 +3830,15 @@ (set_attr "prefix" "maybe_vex") (set_attr "mode" "SI")]) -(define_insn "sse2_cvtsd2siq" +(define_insn "sse2_cvtsd2siq<round_name>" [(set (match_operand:DI 0 "register_operand" "=r,r") (unspec:DI [(vec_select:DF - (match_operand:V2DF 1 "nonimmediate_operand" "v,m") + (match_operand:V2DF 1 "<round_nimm_predicate>" "v,<round_constraint2>") (parallel [(const_int 0)]))] UNSPEC_FIX_NOTRUNC))] "TARGET_SSE2 && TARGET_64BIT" - "%vcvtsd2si{q}\t{%1, %0|%0, %q1}" + "%vcvtsd2si{q}\t{<round_op2>%1, %0|%0, %q1<round_op2>}" [(set_attr "type" "sseicvt") (set_attr "athlon_decode" "double,vector") (set_attr "bdver1_decode" "double,double") @@ -3764,14 
+3860,14 @@ (set_attr "prefix" "maybe_vex") (set_attr "mode" "DI")]) -(define_insn "sse2_cvttsd2si" +(define_insn "sse2_cvttsd2si<round_saeonly_name>" [(set (match_operand:SI 0 "register_operand" "=r,r") (fix:SI (vec_select:DF - (match_operand:V2DF 1 "nonimmediate_operand" "v,m") + (match_operand:V2DF 1 "<round_saeonly_nimm_predicate>" "v,<round_saeonly_constraint2>") (parallel [(const_int 0)]))))] "TARGET_SSE2" - "%vcvttsd2si\t{%1, %0|%0, %q1}" + "%vcvttsd2si\t{<round_saeonly_op2>%1, %0|%0, %q1<round_saeonly_op2>}" [(set_attr "type" "sseicvt") (set_attr "athlon_decode" "double,vector") (set_attr "amdfam10_decode" "double,double") @@ -3781,14 +3877,14 @@ (set_attr "prefix" "maybe_vex") (set_attr "mode" "SI")]) -(define_insn "sse2_cvttsd2siq" +(define_insn "sse2_cvttsd2siq<round_saeonly_name>" [(set (match_operand:DI 0 "register_operand" "=r,r") (fix:DI (vec_select:DF - (match_operand:V2DF 1 "nonimmediate_operand" "v,m") + (match_operand:V2DF 1 "<round_saeonly_nimm_predicate>" "v,<round_saeonly_constraint2>") (parallel [(const_int 0)]))))] "TARGET_SSE2 && TARGET_64BIT" - "%vcvttsd2si{q}\t{%1, %0|%0, %q1}" + "%vcvttsd2si{q}\t{<round_saeonly_op2>%1, %0|%0, %q1<round_saeonly_op2>}" [(set_attr "type" "sseicvt") (set_attr "athlon_decode" "double,vector") (set_attr "amdfam10_decode" "double,double") @@ -3863,13 +3959,13 @@ (set_attr "ssememalign" "64") (set_attr "mode" "V2DF")]) -(define_insn "<mask_codefor>avx512f_cvtpd2dq512<mask_name>" +(define_insn "<mask_codefor>avx512f_cvtpd2dq512<mask_name><round_name>" [(set (match_operand:V8SI 0 "register_operand" "=v") (unspec:V8SI - [(match_operand:V8DF 1 "nonimmediate_operand" "vm")] + [(match_operand:V8DF 1 "<round_nimm_predicate>" "<round_constraint>")] UNSPEC_FIX_NOTRUNC))] "TARGET_AVX512F" - "vcvtpd2dq\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}" + "vcvtpd2dq\t{<round_mask_op2>%1, %0<mask_operand2>|%0<mask_operand2>, %1<round_mask_op2>}" [(set_attr "type" "ssecvt") (set_attr "prefix" "evex") (set_attr "mode" "OI")]) @@ -3937,23 +4033,23 @@ (set_attr "athlon_decode" "vector") (set_attr "bdver1_decode" "double")]) -(define_insn "avx512f_ufix_notruncv8dfv8si<mask_name>" +(define_insn "avx512f_ufix_notruncv8dfv8si<mask_name><round_name>" [(set (match_operand:V8SI 0 "register_operand" "=v") (unspec:V8SI - [(match_operand:V8DF 1 "nonimmediate_operand" "vm")] + [(match_operand:V8DF 1 "<round_nimm_predicate>" "<round_constraint>")] UNSPEC_UNSIGNED_FIX_NOTRUNC))] "TARGET_AVX512F" - "vcvtpd2udq\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}" + "vcvtpd2udq\t{<round_mask_op2>%1, %0<mask_operand2>|%0<mask_operand2>, %1<round_mask_op2>}" [(set_attr "type" "ssecvt") (set_attr "prefix" "evex") (set_attr "mode" "OI")]) -(define_insn "<fixsuffix>fix_truncv8dfv8si2<mask_name>" +(define_insn "<fixsuffix>fix_truncv8dfv8si2<mask_name><round_saeonly_name>" [(set (match_operand:V8SI 0 "register_operand" "=v") (any_fix:V8SI - (match_operand:V8DF 1 "nonimmediate_operand" "vm")))] + (match_operand:V8DF 1 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>")))] "TARGET_AVX512F" - "vcvttpd2<fixsuffix>dq\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}" + "vcvttpd2<fixsuffix>dq\t{<round_saeonly_mask_op2>%1, %0<mask_operand2>|%0<mask_operand2>, %1<round_saeonly_mask_op2>}" [(set_attr "type" "ssecvt") (set_attr "prefix" "evex") (set_attr "mode" "OI")]) @@ -4014,34 +4110,34 @@ (set_attr "prefix" "maybe_vex") (set_attr "mode" "TI")]) -(define_insn "sse2_cvtsd2ss" +(define_insn "sse2_cvtsd2ss<round_name>" [(set (match_operand:V4SF 0 "register_operand" "=x,x,v") 
(vec_merge:V4SF (vec_duplicate:V4SF (float_truncate:V2SF - (match_operand:V2DF 2 "nonimmediate_operand" "x,m,vm"))) + (match_operand:V2DF 2 "nonimmediate_operand" "x,m,<round_constraint>"))) (match_operand:V4SF 1 "register_operand" "0,0,v") (const_int 1)))] "TARGET_SSE2" "@ cvtsd2ss\t{%2, %0|%0, %2} cvtsd2ss\t{%2, %0|%0, %q2} - vcvtsd2ss\t{%2, %1, %0|%0, %1, %q2}" + vcvtsd2ss\t{<round_op3>%2, %1, %0|%0, %1, %q2<round_op3>}" [(set_attr "isa" "noavx,noavx,avx") (set_attr "type" "ssecvt") (set_attr "athlon_decode" "vector,double,*") (set_attr "amdfam10_decode" "vector,double,*") (set_attr "bdver1_decode" "direct,direct,*") (set_attr "btver2_decode" "double,double,double") - (set_attr "prefix" "orig,orig,vex") + (set_attr "prefix" "orig,orig,<round_prefix>") (set_attr "mode" "SF")]) -(define_insn "sse2_cvtss2sd" +(define_insn "sse2_cvtss2sd<round_saeonly_name>" [(set (match_operand:V2DF 0 "register_operand" "=x,x,v") (vec_merge:V2DF (float_extend:V2DF (vec_select:V2SF - (match_operand:V4SF 2 "nonimmediate_operand" "x,m,vm") + (match_operand:V4SF 2 "nonimmediate_operand" "x,m,<round_saeonly_constraint>") (parallel [(const_int 0) (const_int 1)]))) (match_operand:V2DF 1 "register_operand" "0,0,v") (const_int 1)))] @@ -4049,22 +4145,22 @@ "@ cvtss2sd\t{%2, %0|%0, %2} cvtss2sd\t{%2, %0|%0, %k2} - vcvtss2sd\t{%2, %1, %0|%0, %1, %k2}" + vcvtss2sd\t{<round_saeonly_op3>%2, %1, %0|%0, %1, %k2<round_saeonly_op3>}" [(set_attr "isa" "noavx,noavx,avx") (set_attr "type" "ssecvt") (set_attr "amdfam10_decode" "vector,double,*") (set_attr "athlon_decode" "direct,direct,*") (set_attr "bdver1_decode" "direct,direct,*") (set_attr "btver2_decode" "double,double,double") - (set_attr "prefix" "orig,orig,vex") + (set_attr "prefix" "orig,orig,<round_saeonly_prefix>") (set_attr "mode" "DF")]) -(define_insn "<mask_codefor>avx512f_cvtpd2ps512<mask_name>" +(define_insn "<mask_codefor>avx512f_cvtpd2ps512<mask_name><round_name>" [(set (match_operand:V8SF 0 "register_operand" "=v") (float_truncate:V8SF - (match_operand:V8DF 1 "nonimmediate_operand" "vm")))] + (match_operand:V8DF 1 "<round_nimm_predicate>" "<round_constraint>")))] "TARGET_AVX512F" - "vcvtpd2ps\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}" + "vcvtpd2ps\t{<round_mask_op2>%1, %0<mask_operand2>|%0<mask_operand2>, %1<round_mask_op2>}" [(set_attr "type" "ssecvt") (set_attr "prefix" "evex") (set_attr "mode" "V8SF")]) @@ -4114,12 +4210,12 @@ (define_mode_attr sf2dfmode [(V8DF "V8SF") (V4DF "V4SF")]) -(define_insn "<sse2_avx_avx512f>_cvtps2pd<avxsizesuffix><mask_name>" +(define_insn "<sse2_avx_avx512f>_cvtps2pd<avxsizesuffix><mask_name><round_saeonly_name>" [(set (match_operand:VF2_512_256 0 "register_operand" "=v") (float_extend:VF2_512_256 - (match_operand:<sf2dfmode> 1 "nonimmediate_operand" "vm")))] - "TARGET_AVX && <mask_mode512bit_condition>" - "vcvtps2pd\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}" + (match_operand:<sf2dfmode> 1 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>")))] + "TARGET_AVX && <mask_mode512bit_condition> && <round_saeonly_mode512bit_condition>" + "vcvtps2pd\t{<round_saeonly_mask_op2>%1, %0<mask_operand2>|%0<mask_operand2>, %1<round_saeonly_mask_op2>}" [(set_attr "type" "ssecvt") (set_attr "prefix" "maybe_vex") (set_attr "mode" "<MODE>")]) @@ -4443,6 +4539,32 @@ DONE; }) +(define_expand "vec_unpacku_float_hi_v16si" + [(match_operand:V8DF 0 "register_operand") + (match_operand:V16SI 1 "register_operand")] + "TARGET_AVX512F" +{ + REAL_VALUE_TYPE TWO32r; + rtx k, x, tmp[4]; + + real_ldexp (&TWO32r, &dconst1, 32); + x = 
const_double_from_real_value (TWO32r, DFmode); + + tmp[0] = force_reg (V8DFmode, CONST0_RTX (V8DFmode)); + tmp[1] = force_reg (V8DFmode, ix86_build_const_vector (V8DFmode, 1, x)); + tmp[2] = gen_reg_rtx (V8DFmode); + tmp[3] = gen_reg_rtx (V8SImode); + k = gen_reg_rtx (QImode); + + emit_insn (gen_vec_extract_hi_v16si (tmp[3], operands[1])); + emit_insn (gen_floatv8siv8df2 (tmp[2], tmp[3])); + emit_insn (gen_rtx_SET (VOIDmode, k, + gen_rtx_LT (QImode, tmp[2], tmp[0]))); + emit_insn (gen_addv8df3_mask (tmp[2], tmp[2], tmp[1], tmp[2], k)); + emit_move_insn (operands[0], tmp[2]); + DONE; +}) + (define_expand "vec_unpacku_float_lo_v8si" [(match_operand:V4DF 0 "register_operand") (match_operand:V8SI 1 "nonimmediate_operand")] @@ -4608,31 +4730,46 @@ (define_expand "vec_pack_ufix_trunc_<mode>" [(match_operand:<ssepackfltmode> 0 "register_operand") - (match_operand:VF2_128_256 1 "register_operand") - (match_operand:VF2_128_256 2 "register_operand")] + (match_operand:VF2 1 "register_operand") + (match_operand:VF2 2 "register_operand")] "TARGET_SSE2" { - rtx tmp[7]; - tmp[0] = ix86_expand_adjust_ufix_to_sfix_si (operands[1], &tmp[2]); - tmp[1] = ix86_expand_adjust_ufix_to_sfix_si (operands[2], &tmp[3]); - tmp[4] = gen_reg_rtx (<ssepackfltmode>mode); - emit_insn (gen_vec_pack_sfix_trunc_<mode> (tmp[4], tmp[0], tmp[1])); - if (<ssepackfltmode>mode == V4SImode || TARGET_AVX2) + if (<MODE>mode == V8DFmode) { - tmp[5] = gen_reg_rtx (<ssepackfltmode>mode); - ix86_expand_vec_extract_even_odd (tmp[5], tmp[2], tmp[3], 0); + rtx r1, r2; + + r1 = gen_reg_rtx (V8SImode); + r2 = gen_reg_rtx (V8SImode); + + emit_insn (gen_ufix_truncv8dfv8si2 (r1, operands[1])); + emit_insn (gen_ufix_truncv8dfv8si2 (r2, operands[2])); + emit_insn (gen_avx_vec_concatv16si (operands[0], r1, r2)); } else { - tmp[5] = gen_reg_rtx (V8SFmode); - ix86_expand_vec_extract_even_odd (tmp[5], gen_lowpart (V8SFmode, tmp[2]), - gen_lowpart (V8SFmode, tmp[3]), 0); - tmp[5] = gen_lowpart (V8SImode, tmp[5]); + rtx tmp[7]; + tmp[0] = ix86_expand_adjust_ufix_to_sfix_si (operands[1], &tmp[2]); + tmp[1] = ix86_expand_adjust_ufix_to_sfix_si (operands[2], &tmp[3]); + tmp[4] = gen_reg_rtx (<ssepackfltmode>mode); + emit_insn (gen_vec_pack_sfix_trunc_<mode> (tmp[4], tmp[0], tmp[1])); + if (<ssepackfltmode>mode == V4SImode || TARGET_AVX2) + { + tmp[5] = gen_reg_rtx (<ssepackfltmode>mode); + ix86_expand_vec_extract_even_odd (tmp[5], tmp[2], tmp[3], 0); + } + else + { + tmp[5] = gen_reg_rtx (V8SFmode); + ix86_expand_vec_extract_even_odd (tmp[5], gen_lowpart (V8SFmode, tmp[2]), + gen_lowpart (V8SFmode, tmp[3]), 0); + tmp[5] = gen_lowpart (V8SImode, tmp[5]); + } + tmp[6] = expand_simple_binop (<ssepackfltmode>mode, XOR, tmp[4], tmp[5], + operands[0], 0, OPTAB_DIRECT); + if (tmp[6] != operands[0]) + emit_move_insn (operands[0], tmp[6]); } - tmp[6] = expand_simple_binop (<ssepackfltmode>mode, XOR, tmp[4], tmp[5], - operands[0], 0, OPTAB_DIRECT); - if (tmp[6] != operands[0]) - emit_move_insn (operands[0], tmp[6]); + DONE; }) @@ -6418,32 +6555,47 @@ operands[1] = adjust_address (operands[1], DFmode, INTVAL (operands[2]) * 8); }) -(define_insn "*avx512f_vmscalef<mode>" +(define_insn "avx512f_vmscalef<mode><round_name>" [(set (match_operand:VF_128 0 "register_operand" "=v") (vec_merge:VF_128 (unspec:VF_128 [(match_operand:VF_128 1 "register_operand" "v") - (match_operand:VF_128 2 "nonimmediate_operand" "vm")] + (match_operand:VF_128 2 "nonimmediate_operand" "<round_constraint>")] UNSPEC_SCALEF) (match_dup 1) (const_int 1)))] "TARGET_AVX512F" - 
"%vscalef<ssescalarmodesuffix>\t{%2, %1, %0|%0, %1, %2}" + "%vscalef<ssescalarmodesuffix>\t{<round_op3>%2, %1, %0|%0, %1, %2<round_op3>}" [(set_attr "prefix" "evex") (set_attr "mode" "<ssescalarmode>")]) -(define_insn "avx512f_scalef<mode><mask_name>" +(define_insn "avx512f_scalef<mode><mask_name><round_name>" [(set (match_operand:VF_512 0 "register_operand" "=v") (unspec:VF_512 [(match_operand:VF_512 1 "register_operand" "v") - (match_operand:VF_512 2 "nonimmediate_operand" "vm")] + (match_operand:VF_512 2 "<round_nimm_predicate>" "<round_constraint>")] UNSPEC_SCALEF))] "TARGET_AVX512F" - "%vscalef<ssemodesuffix>\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}" + "%vscalef<ssemodesuffix>\t{<round_mask_op3>%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2<round_mask_op3>}" [(set_attr "prefix" "evex") (set_attr "mode" "<MODE>")]) -(define_insn "avx512f_vternlog<mode>" +(define_expand "avx512f_vternlog<mode>_maskz" + [(match_operand:VI48_512 0 "register_operand") + (match_operand:VI48_512 1 "register_operand") + (match_operand:VI48_512 2 "register_operand") + (match_operand:VI48_512 3 "nonimmediate_operand") + (match_operand:SI 4 "const_0_to_255_operand") + (match_operand:<avx512fmaskmode> 5 "register_operand")] + "TARGET_AVX512F" +{ + emit_insn (gen_avx512f_vternlog<mode>_maskz_1 ( + operands[0], operands[1], operands[2], operands[3], + operands[4], CONST0_RTX (<MODE>mode), operands[5])); + DONE; +}) + +(define_insn "avx512f_vternlog<mode><sd_maskz_name>" [(set (match_operand:VI48_512 0 "register_operand" "=v") (unspec:VI48_512 [(match_operand:VI48_512 1 "register_operand" "0") @@ -6452,7 +6604,7 @@ (match_operand:SI 4 "const_0_to_255_operand")] UNSPEC_VTERNLOG))] "TARGET_AVX512F" - "vpternlog<ssemodesuffix>\t{%4, %3, %2, %0|%0, %2, %3, %4}" + "vpternlog<ssemodesuffix>\t{%4, %3, %2, %0<sd_mask_op5>|%0<sd_mask_op5>, %2, %3, %4}" [(set_attr "type" "sselog") (set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) @@ -6474,26 +6626,26 @@ (set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) -(define_insn "avx512f_getexp<mode><mask_name>" +(define_insn "avx512f_getexp<mode><mask_name><round_saeonly_name>" [(set (match_operand:VF_512 0 "register_operand" "=v") - (unspec:VF_512 [(match_operand:VF_512 1 "nonimmediate_operand" "vm")] + (unspec:VF_512 [(match_operand:VF_512 1 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>")] UNSPEC_GETEXP))] "TARGET_AVX512F" - "vgetexp<ssemodesuffix>\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}"; + "vgetexp<ssemodesuffix>\t{<round_saeonly_mask_op2>%1, %0<mask_operand2>|%0<mask_operand2>, %1<round_saeonly_mask_op2>}"; [(set_attr "prefix" "evex") (set_attr "mode" "<MODE>")]) -(define_insn "avx512f_sgetexp<mode>" +(define_insn "avx512f_sgetexp<mode><round_saeonly_name>" [(set (match_operand:VF_128 0 "register_operand" "=v") (vec_merge:VF_128 (unspec:VF_128 [(match_operand:VF_128 1 "register_operand" "v") - (match_operand:VF_128 2 "nonimmediate_operand" "vm")] + (match_operand:VF_128 2 "nonimmediate_operand" "<round_saeonly_constraint>")] UNSPEC_GETEXP) (match_dup 1) (const_int 1)))] "TARGET_AVX512F" - "vgetexp<ssescalarmodesuffix>\t{%2, %1, %0|%0, %1, %2}"; + "vgetexp<ssescalarmodesuffix>\t{<round_saeonly_op3>%2, %1, %0|%0, %1, %2<round_saeonly_op3>}"; [(set_attr "prefix" "evex") (set_attr "mode" "<ssescalarmode>")]) @@ -6539,59 +6691,92 @@ DONE; }) -(define_insn "avx512f_fixupimm<mode>" + +(define_expand "avx512f_fixupimm<mode>_maskz<round_saeonly_expand_name>" + [(match_operand:VF_512 0 "register_operand") + 
(match_operand:VF_512 1 "register_operand") + (match_operand:VF_512 2 "register_operand") + (match_operand:<sseintvecmode> 3 "<round_saeonly_expand_nimm_predicate>") + (match_operand:SI 4 "const_0_to_255_operand") + (match_operand:<avx512fmaskmode> 5 "register_operand")] + "TARGET_AVX512F" +{ + emit_insn (gen_avx512f_fixupimm<mode>_maskz_1<round_saeonly_expand_name> ( + operands[0], operands[1], operands[2], operands[3], + operands[4], CONST0_RTX (<MODE>mode), operands[5] + <round_saeonly_expand_operand6>)); + DONE; +}) + +(define_insn "avx512f_fixupimm<mode><sd_maskz_name><round_saeonly_name>" [(set (match_operand:VF_512 0 "register_operand" "=v") (unspec:VF_512 [(match_operand:VF_512 1 "register_operand" "0") (match_operand:VF_512 2 "register_operand" "v") - (match_operand:<sseintvecmode> 3 "nonimmediate_operand" "vm") + (match_operand:<sseintvecmode> 3 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>") (match_operand:SI 4 "const_0_to_255_operand")] UNSPEC_FIXUPIMM))] "TARGET_AVX512F" - "vfixupimm<ssemodesuffix>\t{%4, %3, %2, %0|%0, %2, %3, %4}"; + "vfixupimm<ssemodesuffix>\t{%4, <round_saeonly_sd_mask_op5>%3, %2, %0<sd_mask_op5>|%0<sd_mask_op5>, %2, %3<round_saeonly_sd_mask_op5>, %4}"; [(set_attr "prefix" "evex") (set_attr "mode" "<MODE>")]) -(define_insn "avx512f_fixupimm<mode>_mask" +(define_insn "avx512f_fixupimm<mode>_mask<round_saeonly_name>" [(set (match_operand:VF_512 0 "register_operand" "=v") (vec_merge:VF_512 (unspec:VF_512 [(match_operand:VF_512 1 "register_operand" "0") (match_operand:VF_512 2 "register_operand" "v") - (match_operand:<sseintvecmode> 3 "nonimmediate_operand" "vm") + (match_operand:<sseintvecmode> 3 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>") (match_operand:SI 4 "const_0_to_255_operand")] UNSPEC_FIXUPIMM) (match_dup 1) (match_operand:<avx512fmaskmode> 5 "register_operand" "k")))] "TARGET_AVX512F" - "vfixupimm<ssemodesuffix>\t{%4, %3, %2, %0%{%5%}|%0%{%5%}, %2, %3, %4}"; + "vfixupimm<ssemodesuffix>\t{%4, <round_saeonly_op6>%3, %2, %0%{%5%}|%0%{%5%}, %2, %3<round_saeonly_op6>, %4}"; [(set_attr "prefix" "evex") (set_attr "mode" "<MODE>")]) -(define_insn "avx512f_sfixupimm<mode>" +(define_expand "avx512f_sfixupimm<mode>_maskz<round_saeonly_expand_name>" + [(match_operand:VF_128 0 "register_operand") + (match_operand:VF_128 1 "register_operand") + (match_operand:VF_128 2 "register_operand") + (match_operand:<sseintvecmode> 3 "<round_saeonly_expand_nimm_predicate>") + (match_operand:SI 4 "const_0_to_255_operand") + (match_operand:<avx512fmaskmode> 5 "register_operand")] + "TARGET_AVX512F" +{ + emit_insn (gen_avx512f_sfixupimm<mode>_maskz_1<round_saeonly_expand_name> ( + operands[0], operands[1], operands[2], operands[3], + operands[4], CONST0_RTX (<MODE>mode), operands[5] + <round_saeonly_expand_operand6>)); + DONE; +}) + +(define_insn "avx512f_sfixupimm<mode><sd_maskz_name><round_saeonly_name>" [(set (match_operand:VF_128 0 "register_operand" "=v") (vec_merge:VF_128 (unspec:VF_128 [(match_operand:VF_128 1 "register_operand" "0") (match_operand:VF_128 2 "register_operand" "v") - (match_operand:<sseintvecmode> 3 "nonimmediate_operand" "vm") + (match_operand:<sseintvecmode> 3 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>") (match_operand:SI 4 "const_0_to_255_operand")] UNSPEC_FIXUPIMM) (match_dup 1) (const_int 1)))] "TARGET_AVX512F" - "vfixupimm<ssescalarmodesuffix>\t{%4, %3, %2, %0|%0, %2, %3, %4}"; + "vfixupimm<ssescalarmodesuffix>\t{%4, <round_saeonly_sd_mask_op5>%3, %2, %0<sd_mask_op5>|%0<sd_mask_op5>, %2, 
%3<round_saeonly_sd_mask_op5>, %4}"; [(set_attr "prefix" "evex") (set_attr "mode" "<ssescalarmode>")]) -(define_insn "avx512f_sfixupimm<mode>_mask" +(define_insn "avx512f_sfixupimm<mode>_mask<round_saeonly_name>" [(set (match_operand:VF_128 0 "register_operand" "=v") (vec_merge:VF_128 (vec_merge:VF_128 (unspec:VF_128 [(match_operand:VF_128 1 "register_operand" "0") (match_operand:VF_128 2 "register_operand" "v") - (match_operand:<sseintvecmode> 3 "nonimmediate_operand" "vm") + (match_operand:<sseintvecmode> 3 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>") (match_operand:SI 4 "const_0_to_255_operand")] UNSPEC_FIXUPIMM) (match_dup 1) @@ -6599,34 +6784,34 @@ (match_dup 1) (match_operand:<avx512fmaskmode> 5 "register_operand" "k")))] "TARGET_AVX512F" - "vfixupimm<ssescalarmodesuffix>\t{%4, %3, %2, %0%{%5%}|%0%{%5%}, %2, %3, %4}"; + "vfixupimm<ssescalarmodesuffix>\t{%4, <round_saeonly_op6>%3, %2, %0%{%5%}|%0%{%5%}, %2, %3<round_saeonly_op6>, %4}"; [(set_attr "prefix" "evex") (set_attr "mode" "<ssescalarmode>")]) -(define_insn "avx512f_rndscale<mode><mask_name>" +(define_insn "avx512f_rndscale<mode><mask_name><round_saeonly_name>" [(set (match_operand:VF_512 0 "register_operand" "=v") (unspec:VF_512 - [(match_operand:VF_512 1 "nonimmediate_operand" "vm") + [(match_operand:VF_512 1 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>") (match_operand:SI 2 "const_0_to_255_operand")] UNSPEC_ROUND))] "TARGET_AVX512F" - "vrndscale<ssemodesuffix>\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}" + "vrndscale<ssemodesuffix>\t{%2, <round_saeonly_mask_op3>%1, %0<mask_operand3>|%0<mask_operand3>, %1<round_saeonly_mask_op3>, %2}" [(set_attr "length_immediate" "1") (set_attr "prefix" "evex") (set_attr "mode" "<MODE>")]) -(define_insn "*avx512f_rndscale<mode>" +(define_insn "avx512f_rndscale<mode><round_saeonly_name>" [(set (match_operand:VF_128 0 "register_operand" "=v") (vec_merge:VF_128 (unspec:VF_128 [(match_operand:VF_128 1 "register_operand" "v") - (match_operand:VF_128 2 "nonimmediate_operand" "vm") + (match_operand:VF_128 2 "nonimmediate_operand" "<round_saeonly_constraint>") (match_operand:SI 3 "const_0_to_255_operand")] UNSPEC_ROUND) (match_dup 1) (const_int 1)))] "TARGET_AVX512F" - "vrndscale<ssescalarmodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}" + "vrndscale<ssescalarmodesuffix>\t{%3, <round_saeonly_op4>%2, %1, %0|%0, %1, %2<round_saeonly_op4>, %3}" [(set_attr "length_immediate" "1") (set_attr "prefix" "evex") (set_attr "mode" "<MODE>")]) @@ -8127,22 +8312,22 @@ [(set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) -(define_expand "<code><mode>3<mask_name>" +(define_expand "<code><mode>3<mask_name><round_name>" [(set (match_operand:VI124_256_48_512 0 "register_operand") (maxmin:VI124_256_48_512 - (match_operand:VI124_256_48_512 1 "nonimmediate_operand") - (match_operand:VI124_256_48_512 2 "nonimmediate_operand")))] - "TARGET_AVX2 && <mask_mode512bit_condition>" + (match_operand:VI124_256_48_512 1 "<round_nimm_predicate>") + (match_operand:VI124_256_48_512 2 "<round_nimm_predicate>")))] + "TARGET_AVX2 && <mask_mode512bit_condition> && <round_mode512bit_condition>" "ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);") -(define_insn "*avx2_<code><mode>3<mask_name>" +(define_insn "*avx2_<code><mode>3<mask_name><round_name>" [(set (match_operand:VI124_256_48_512 0 "register_operand" "=v") (maxmin:VI124_256_48_512 - (match_operand:VI124_256_48_512 1 "nonimmediate_operand" "%v") - (match_operand:VI124_256_48_512 2 "nonimmediate_operand" "vm")))] + 
(match_operand:VI124_256_48_512 1 "<round_nimm_predicate>" "%v") + (match_operand:VI124_256_48_512 2 "<round_nimm_predicate>" "<round_constraint>")))] "TARGET_AVX2 && ix86_binary_operator_ok (<CODE>, <MODE>mode, operands) - && <mask_mode512bit_condition>" - "vp<maxmin_int><ssemodesuffix>\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}" + && <mask_mode512bit_condition> && <round_mode512bit_condition>" + "vp<maxmin_int><ssemodesuffix>\t{<round_mask_op3>%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2<round_mask_op3>}" [(set_attr "type" "sseiadd") (set_attr "prefix_extra" "1") (set_attr "prefix" "maybe_evex") @@ -8360,7 +8545,7 @@ (set_attr "prefix" "vex") (set_attr "mode" "OI")]) -(define_expand "avx512f_eq<mode>3" +(define_expand "avx512f_eq<mode>3<mask_scalar_merge_name>" [(set (match_operand:<avx512fmaskmode> 0 "register_operand") (unspec:<avx512fmaskmode> [(match_operand:VI48_512 1 "register_operand") @@ -8369,14 +8554,14 @@ "TARGET_AVX512F" "ix86_fixup_binary_operands_no_copy (EQ, <MODE>mode, operands);") -(define_insn "avx512f_eq<mode>3_1" +(define_insn "avx512f_eq<mode>3<mask_scalar_merge_name>_1" [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k") (unspec:<avx512fmaskmode> [(match_operand:VI48_512 1 "register_operand" "%v") (match_operand:VI48_512 2 "nonimmediate_operand" "vm")] UNSPEC_MASKED_EQ))] "TARGET_AVX512F && ix86_binary_operator_ok (EQ, <MODE>mode, operands)" - "vpcmpeq<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}" + "vpcmpeq<ssemodesuffix>\t{%2, %1, %0<mask_scalar_merge_operand3>|%0<mask_scalar_merge_operand3>, %1, %2}" [(set_attr "type" "ssecmp") (set_attr "prefix_extra" "1") (set_attr "prefix" "evex") @@ -8456,13 +8641,13 @@ (set_attr "prefix" "vex") (set_attr "mode" "OI")]) -(define_insn "avx512f_gt<mode>3" +(define_insn "avx512f_gt<mode>3<mask_scalar_merge_name>" [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k") (unspec:<avx512fmaskmode> [(match_operand:VI48_512 1 "register_operand" "v") (match_operand:VI48_512 2 "nonimmediate_operand" "vm")] UNSPEC_MASKED_GT))] "TARGET_AVX512F" - "vpcmpgt<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}" + "vpcmpgt<ssemodesuffix>\t{%2, %1, %0<mask_scalar_merge_operand3>|%0<mask_scalar_merge_operand3>, %1, %2}" [(set_attr "type" "ssecmp") (set_attr "prefix_extra" "1") (set_attr "prefix" "evex") @@ -8755,7 +8940,7 @@ (const_string "<sseinsnmode>") (match_test "TARGET_AVX") (if_then_else - (match_test "GET_MODE_SIZE (<MODE>mode) > 16") + (match_test "<MODE_SIZE> > 16") (const_string "V8SF") (const_string "<sseinsnmode>")) (ior (not (match_test "TARGET_SSE2")) @@ -8847,7 +9032,7 @@ (const_string "<sseinsnmode>") (match_test "TARGET_AVX") (if_then_else - (match_test "GET_MODE_SIZE (<MODE>mode) > 16") + (match_test "<MODE_SIZE> > 16") (const_string "V8SF") (const_string "<sseinsnmode>")) (ior (not (match_test "TARGET_SSE2")) @@ -8856,25 +9041,25 @@ ] (const_string "<sseinsnmode>")))]) -(define_insn "avx512f_testm<mode>3" +(define_insn "avx512f_testm<mode>3<mask_scalar_merge_name>" [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k") (unspec:<avx512fmaskmode> [(match_operand:VI48_512 1 "register_operand" "v") (match_operand:VI48_512 2 "nonimmediate_operand" "vm")] UNSPEC_TESTM))] "TARGET_AVX512F" - "vptestm<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}" + "vptestm<ssemodesuffix>\t{%2, %1, %0<mask_scalar_merge_operand3>|%0<mask_scalar_merge_operand3>, %1, %2}" [(set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) -(define_insn "avx512f_testnm<mode>3" +(define_insn 
"avx512f_testnm<mode>3<mask_scalar_merge_name>" [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k") (unspec:<avx512fmaskmode> [(match_operand:VI48_512 1 "register_operand" "v") (match_operand:VI48_512 2 "nonimmediate_operand" "vm")] UNSPEC_TESTNM))] "TARGET_AVX512CD" - "%vptestnm<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}" + "%vptestnm<ssemodesuffix>\t{%2, %1, %0<mask_scalar_merge_operand3>|%0<mask_scalar_merge_operand3>, %1, %2}" [(set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) @@ -12440,33 +12625,33 @@ (set_attr "prefix" "evex") (set_attr "mode" "XI")]) -(define_insn "avx512er_exp2<mode><mask_name>" +(define_insn "avx512er_exp2<mode><mask_name><round_name>" [(set (match_operand:VF_512 0 "register_operand" "=v") (unspec:VF_512 - [(match_operand:VF_512 1 "nonimmediate_operand" "vm")] + [(match_operand:VF_512 1 "<round_nimm_predicate>" "<round_constraint>")] UNSPEC_EXP2))] "TARGET_AVX512ER" - "vexp2<ssemodesuffix>\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}" + "vexp2<ssemodesuffix>\t{<round_mask_op2>%1, %0<mask_operand2>|%0<mask_operand2>, %1<round_mask_op2>}" [(set_attr "prefix" "evex") (set_attr "mode" "<MODE>")]) -(define_insn "<mask_codefor>avx512er_rcp28<mode><mask_name>" +(define_insn "<mask_codefor>avx512er_rcp28<mode><mask_name><round_name>" [(set (match_operand:VF_512 0 "register_operand" "=v") (unspec:VF_512 - [(match_operand:VF_512 1 "nonimmediate_operand" "vm")] + [(match_operand:VF_512 1 "<round_nimm_predicate>" "<round_constraint>")] UNSPEC_RCP28))] "TARGET_AVX512ER" - "vrcp28<ssemodesuffix>\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}" + "vrcp28<ssemodesuffix>\t{<round_mask_op2>%1, %0<mask_operand2>|%0<mask_operand2>, %1<round_mask_op2>}" [(set_attr "prefix" "evex") (set_attr "mode" "<MODE>")]) -(define_insn "<mask_codefor>avx512er_rsqrt28<mode><mask_name>" +(define_insn "<mask_codefor>avx512er_rsqrt28<mode><mask_name><round_name>" [(set (match_operand:VF_512 0 "register_operand" "=v") (unspec:VF_512 - [(match_operand:VF_512 1 "nonimmediate_operand" "vm")] + [(match_operand:VF_512 1 "<round_nimm_predicate>" "<round_constraint>")] UNSPEC_RSQRT28))] "TARGET_AVX512ER" - "vrsqrt28<ssemodesuffix>\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}" + "vrsqrt28<ssemodesuffix>\t{<round_mask_op2>%1, %0<mask_operand2>|%0<mask_operand2>, %1<round_mask_op2>}" [(set_attr "prefix" "evex") (set_attr "mode" "<MODE>")]) @@ -13892,7 +14077,21 @@ (set_attr "prefix" "<mask_prefix>") (set_attr "mode" "<sseinsnmode>")]) -(define_insn "avx512f_vpermi2var<mode>3" +(define_expand "avx512f_vpermi2var<mode>3_maskz" + [(match_operand:VI48F_512 0 "register_operand" "=v") + (match_operand:VI48F_512 1 "register_operand" "v") + (match_operand:<sseintvecmode> 2 "register_operand" "0") + (match_operand:VI48F_512 3 "nonimmediate_operand" "vm") + (match_operand:<avx512fmaskmode> 4 "register_operand" "k")] + "TARGET_AVX512F" +{ + emit_insn (gen_avx512f_vpermi2var<mode>3_maskz_1 ( + operands[0], operands[1], operands[2], operands[3], + CONST0_RTX (<MODE>mode), operands[4])); + DONE; +}) + +(define_insn "avx512f_vpermi2var<mode>3<sd_maskz_name>" [(set (match_operand:VI48F_512 0 "register_operand" "=v") (unspec:VI48F_512 [(match_operand:VI48F_512 1 "register_operand" "v") @@ -13900,7 +14099,7 @@ (match_operand:VI48F_512 3 "nonimmediate_operand" "vm")] UNSPEC_VPERMI2))] "TARGET_AVX512F" - "vpermi2<ssemodesuffix>\t{%3, %1, %0|%0, %1, %3}" + "vpermi2<ssemodesuffix>\t{%3, %1, %0<sd_mask_op4>|%0<sd_mask_op4>, %1, %3}" [(set_attr "type" "sselog") (set_attr "prefix" "evex") (set_attr "mode" 
"<sseinsnmode>")]) @@ -13921,7 +14120,21 @@ (set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) -(define_insn "avx512f_vpermt2var<mode>3" +(define_expand "avx512f_vpermt2var<mode>3_maskz" + [(match_operand:VI48F_512 0 "register_operand" "=v") + (match_operand:<sseintvecmode> 1 "register_operand" "v") + (match_operand:VI48F_512 2 "register_operand" "0") + (match_operand:VI48F_512 3 "nonimmediate_operand" "vm") + (match_operand:<avx512fmaskmode> 4 "register_operand" "k")] + "TARGET_AVX512F" +{ + emit_insn (gen_avx512f_vpermt2var<mode>3_maskz_1 ( + operands[0], operands[1], operands[2], operands[3], + CONST0_RTX (<MODE>mode), operands[4])); + DONE; +}) + +(define_insn "avx512f_vpermt2var<mode>3<sd_maskz_name>" [(set (match_operand:VI48F_512 0 "register_operand" "=v") (unspec:VI48F_512 [(match_operand:<sseintvecmode> 1 "register_operand" "v") @@ -13929,7 +14142,7 @@ (match_operand:VI48F_512 3 "nonimmediate_operand" "vm")] UNSPEC_VPERMT2))] "TARGET_AVX512F" - "vpermt2<ssemodesuffix>\t{%3, %1, %0|%0, %1, %3}" + "vpermt2<ssemodesuffix>\t{%3, %1, %0<sd_mask_op4>|%0<sd_mask_op4>, %1, %3}" [(set_attr "type" "sselog") (set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) @@ -14457,13 +14670,13 @@ (set_attr "btver2_decode" "double") (set_attr "mode" "V8SF")]) -(define_insn "<mask_codefor>avx512f_vcvtph2ps512<mask_name>" +(define_insn "<mask_codefor>avx512f_vcvtph2ps512<mask_name><round_saeonly_name>" [(set (match_operand:V16SF 0 "register_operand" "=v") (unspec:V16SF - [(match_operand:V16HI 1 "nonimmediate_operand" "vm")] + [(match_operand:V16HI 1 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>")] UNSPEC_VCVTPH2PS))] "TARGET_AVX512F" - "vcvtph2ps\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}" + "vcvtph2ps\t{<round_saeonly_mask_op2>%1, %0<mask_operand2>|%0<mask_operand2>, %1<round_saeonly_mask_op2>}" [(set_attr "type" "ssecvt") (set_attr "prefix" "evex") (set_attr "mode" "V16SF")]) @@ -14938,6 +15151,16 @@ (set_attr "memory" "store") (set_attr "mode" "<sseinsnmode>")]) +(define_expand "avx512f_expand<mode>_maskz" + [(set (match_operand:VI48F_512 0 "register_operand") + (unspec:VI48F_512 + [(match_operand:VI48F_512 1 "nonimmediate_operand") + (match_operand:VI48F_512 2 "vector_move_operand") + (match_operand:<avx512fmaskmode> 3 "register_operand")] + UNSPEC_EXPAND))] + "TARGET_AVX512F" + "operands[2] = CONST0_RTX (<MODE>mode);") + (define_insn "avx512f_expand<mode>_mask" [(set (match_operand:VI48F_512 0 "register_operand" "=v,v") (unspec:VI48F_512 @@ -14952,29 +15175,29 @@ (set_attr "memory" "none,load") (set_attr "mode" "<sseinsnmode>")]) -(define_insn "avx512f_getmant<mode><mask_name>" +(define_insn "avx512f_getmant<mode><mask_name><round_saeonly_name>" [(set (match_operand:VF_512 0 "register_operand" "=v") (unspec:VF_512 - [(match_operand:VF_512 1 "nonimmediate_operand" "vm") + [(match_operand:VF_512 1 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>") (match_operand:SI 2 "const_0_to_15_operand")] UNSPEC_GETMANT))] "TARGET_AVX512F" - "vgetmant<ssemodesuffix>\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}"; + "vgetmant<ssemodesuffix>\t{%2, <round_saeonly_mask_op3>%1, %0<mask_operand3>|%0<mask_operand3>, %1<round_saeonly_mask_op3>, %2}"; [(set_attr "prefix" "evex") (set_attr "mode" "<MODE>")]) -(define_insn "avx512f_getmant<mode>" +(define_insn "avx512f_getmant<mode><round_saeonly_name>" [(set (match_operand:VF_128 0 "register_operand" "=v") (vec_merge:VF_128 (unspec:VF_128 [(match_operand:VF_128 1 "register_operand" "v") - (match_operand:VF_128 2 
"nonimmediate_operand" "vm") + (match_operand:VF_128 2 "nonimmediate_operand" "<round_saeonly_constraint>") (match_operand:SI 3 "const_0_to_15_operand")] UNSPEC_GETMANT) (match_dup 1) (const_int 1)))] "TARGET_AVX512F" - "vgetmant<ssescalarmodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"; + "vgetmant<ssescalarmodesuffix>\t{%3, <round_saeonly_op4>%2, %1, %0|%0, %1, %2<round_saeonly_op4>, %3}"; [(set_attr "prefix" "evex") (set_attr "mode" "<ssescalarmode>")]) @@ -14998,3 +15221,84 @@ [(set_attr "type" "sse") (set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) + +(define_insn "sha1msg1" + [(set (match_operand:V4SI 0 "register_operand" "=x") + (unspec:V4SI + [(match_operand:V4SI 1 "register_operand" "0") + (match_operand:V4SI 2 "nonimmediate_operand" "xm")] + UNSPEC_SHA1MSG1))] + "TARGET_SHA" + "sha1msg1\t{%2, %0|%0, %2}" + [(set_attr "type" "sselog1") + (set_attr "mode" "TI")]) + +(define_insn "sha1msg2" + [(set (match_operand:V4SI 0 "register_operand" "=x") + (unspec:V4SI + [(match_operand:V4SI 1 "register_operand" "0") + (match_operand:V4SI 2 "nonimmediate_operand" "xm")] + UNSPEC_SHA1MSG2))] + "TARGET_SHA" + "sha1msg2\t{%2, %0|%0, %2}" + [(set_attr "type" "sselog1") + (set_attr "mode" "TI")]) + +(define_insn "sha1nexte" + [(set (match_operand:V4SI 0 "register_operand" "=x") + (unspec:V4SI + [(match_operand:V4SI 1 "register_operand" "0") + (match_operand:V4SI 2 "nonimmediate_operand" "xm")] + UNSPEC_SHA1NEXTE))] + "TARGET_SHA" + "sha1nexte\t{%2, %0|%0, %2}" + [(set_attr "type" "sselog1") + (set_attr "mode" "TI")]) + +(define_insn "sha1rnds4" + [(set (match_operand:V4SI 0 "register_operand" "=x") + (unspec:V4SI + [(match_operand:V4SI 1 "register_operand" "0") + (match_operand:V4SI 2 "nonimmediate_operand" "xm") + (match_operand:SI 3 "const_0_to_3_operand" "n")] + UNSPEC_SHA1RNDS4))] + "TARGET_SHA" + "sha1rnds4\t{%3, %2, %0|%0, %2, %3}" + [(set_attr "type" "sselog1") + (set_attr "length_immediate" "1") + (set_attr "mode" "TI")]) + +(define_insn "sha256msg1" + [(set (match_operand:V4SI 0 "register_operand" "=x") + (unspec:V4SI + [(match_operand:V4SI 1 "register_operand" "0") + (match_operand:V4SI 2 "nonimmediate_operand" "xm")] + UNSPEC_SHA256MSG1))] + "TARGET_SHA" + "sha256msg1\t{%2, %0|%0, %2}" + [(set_attr "type" "sselog1") + (set_attr "mode" "TI")]) + +(define_insn "sha256msg2" + [(set (match_operand:V4SI 0 "register_operand" "=x") + (unspec:V4SI + [(match_operand:V4SI 1 "register_operand" "0") + (match_operand:V4SI 2 "nonimmediate_operand" "xm")] + UNSPEC_SHA256MSG2))] + "TARGET_SHA" + "sha256msg2\t{%2, %0|%0, %2}" + [(set_attr "type" "sselog1") + (set_attr "mode" "TI")]) + +(define_insn "sha256rnds2" + [(set (match_operand:V4SI 0 "register_operand" "=x") + (unspec:V4SI + [(match_operand:V4SI 1 "register_operand" "0") + (match_operand:V4SI 2 "nonimmediate_operand" "xm") + (match_operand:V4SI 3 "register_operand" "Yz")] + UNSPEC_SHA256RNDS2))] + "TARGET_SHA" + "sha256rnds2\t{%3, %2, %0|%0, %2, %3}" + [(set_attr "type" "sselog1") + (set_attr "length_immediate" "1") + (set_attr "mode" "TI")]) diff --git a/gcc/config/i386/ssemath.h b/gcc/config/i386/ssemath.h index 83abfddfeb7..ec8d74a6277 100644 --- a/gcc/config/i386/ssemath.h +++ b/gcc/config/i386/ssemath.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2010-2013 Free Software Foundation, Inc. +/* Copyright (C) 2010-2014 Free Software Foundation, Inc. This file is part of GCC. 
diff --git a/gcc/config/i386/stringop.def b/gcc/config/i386/stringop.def index 1a7d1e88f65..279aa1961d9 100644 --- a/gcc/config/i386/stringop.def +++ b/gcc/config/i386/stringop.def @@ -1,5 +1,5 @@ /* Definitions for stringop strategy for IA-32. - Copyright (C) 2013 Free Software Foundation, Inc. + Copyright (C) 2013-2014 Free Software Foundation, Inc. This file is part of GCC. diff --git a/gcc/config/i386/stringop.opt b/gcc/config/i386/stringop.opt index 5c5fc906a33..bb8d2d2b782 100644 --- a/gcc/config/i386/stringop.opt +++ b/gcc/config/i386/stringop.opt @@ -1,5 +1,5 @@ /* Definitions for stringop option handling for IA-32. - Copyright (C) 2013 Free Software Foundation, Inc. + Copyright (C) 2013-2014 Free Software Foundation, Inc. This file is part of GCC. diff --git a/gcc/config/i386/subst.md b/gcc/config/i386/subst.md index 6b45d058f22..7fd39487f96 100644 --- a/gcc/config/i386/subst.md +++ b/gcc/config/i386/subst.md @@ -1,5 +1,5 @@ ;; GCC machine description for AVX512F instructions -;; Copyright (C) 2013 Free Software Foundation, Inc. +;; Copyright (C) 2013-2014 Free Software Foundation, Inc. ;; ;; This file is part of GCC. ;; @@ -27,6 +27,19 @@ V16SF V8SF V4SF V8DF V4DF V2DF]) +(define_mode_iterator SUBST_S + [QI HI SI DI]) + +(define_mode_iterator SUBST_A + [V16QI + V16HI V8HI + V16SI V8SI V4SI + V8DI V4DI V2DI + V16SF V8SF V4SF + V8DF V4DF V2DF + QI HI SI DI SF DF + CCFP CCFPU]) + (define_subst_attr "mask_name" "mask" "" "_mask") (define_subst_attr "mask_applied" "mask" "false" "true") (define_subst_attr "mask_operand2" "mask" "" "%{%3%}%N2") @@ -38,7 +51,7 @@ (define_subst_attr "mask_operand18" "mask" "" "%{%19%}%N18") (define_subst_attr "mask_operand19" "mask" "" "%{%20%}%N19") (define_subst_attr "mask_codefor" "mask" "*" "") -(define_subst_attr "mask_mode512bit_condition" "mask" "1" "(GET_MODE_SIZE (GET_MODE (operands[0])) == 64)") +(define_subst_attr "mask_mode512bit_condition" "mask" "1" "(<MODE_SIZE> == 64)") (define_subst_attr "store_mask_constraint" "mask" "vm" "v") (define_subst_attr "store_mask_predicate" "mask" "nonimmediate_operand" "register_operand") (define_subst_attr "mask_prefix" "mask" "vex" "evex") @@ -54,3 +67,136 @@ (match_dup 1) (match_operand:SUBST_V 2 "vector_move_operand" "0C") (match_operand:<avx512fmaskmode> 3 "register_operand" "k")))]) + +(define_subst_attr "mask_scalar_merge_name" "mask_scalar_merge" "" "_mask") +(define_subst_attr "mask_scalar_merge_operand3" "mask_scalar_merge" "" "%{%3%}") +(define_subst_attr "mask_scalar_merge_operand4" "mask_scalar_merge" "" "%{%4%}") + +(define_subst "mask_scalar_merge" + [(set (match_operand:SUBST_S 0) + (match_operand:SUBST_S 1))] + "TARGET_AVX512F" + [(set (match_dup 0) + (and:SUBST_S + (match_dup 1) + (match_operand:SUBST_S 3 "register_operand" "k")))]) + +(define_subst_attr "sd_maskz_name" "sd" "" "_maskz_1") +(define_subst_attr "sd_mask_op4" "sd" "" "%{%5%}%N4") +(define_subst_attr "sd_mask_op5" "sd" "" "%{%6%}%N5") +(define_subst_attr "sd_mask_codefor" "sd" "*" "") +(define_subst_attr "sd_mask_mode512bit_condition" "sd" "1" "(<MODE_SIZE> == 64)") + +(define_subst "sd" + [(set (match_operand:SUBST_V 0) + (match_operand:SUBST_V 1))] + "" + [(set (match_dup 0) + (vec_merge:SUBST_V + (match_dup 1) + (match_operand:SUBST_V 2 "const0_operand" "C") + (match_operand:<avx512fmaskmode> 3 "register_operand" "k"))) +]) + +(define_subst_attr "round_name" "round" "" "_round") +(define_subst_attr "round_mask_operand2" "mask" "%R2" "%R4") +(define_subst_attr "round_mask_operand3" "mask" "%R3" "%R5") +(define_subst_attr 
"round_sd_mask_operand4" "sd" "%R4" "%R6") +(define_subst_attr "round_op2" "round" "" "%R2") +(define_subst_attr "round_op3" "round" "" "%R3") +(define_subst_attr "round_op4" "round" "" "%R4") +(define_subst_attr "round_op5" "round" "" "%R5") +(define_subst_attr "round_op6" "round" "" "%R6") +(define_subst_attr "round_mask_op2" "round" "" "<round_mask_operand2>") +(define_subst_attr "round_mask_op3" "round" "" "<round_mask_operand3>") +(define_subst_attr "round_mask_scalar_op3" "round" "" "<round_mask_scalar_operand3>") +(define_subst_attr "round_sd_mask_op4" "round" "" "<round_sd_mask_operand4>") +(define_subst_attr "round_constraint" "round" "vm" "v") +(define_subst_attr "round_constraint2" "round" "m" "v") +(define_subst_attr "round_constraint3" "round" "rm" "r") +(define_subst_attr "round_nimm_predicate" "round" "nonimmediate_operand" "register_operand") +(define_subst_attr "round_prefix" "round" "vex" "evex") +(define_subst_attr "round_mode512bit_condition" "round" "1" "(<MODE>mode == V16SFmode || <MODE>mode == V8DFmode)") +(define_subst_attr "round_modev4sf_condition" "round" "1" "(<MODE>mode == V4SFmode)") +(define_subst_attr "round_codefor" "round" "*" "") +(define_subst_attr "round_opnum" "round" "5" "6") + +(define_subst "round" + [(set (match_operand:SUBST_A 0) + (match_operand:SUBST_A 1))] + "TARGET_AVX512F" + [(parallel[ + (set (match_dup 0) + (match_dup 1)) + (unspec [(match_operand:SI 2 "const_0_to_4_operand")] UNSPEC_EMBEDDED_ROUNDING)])]) + +(define_subst_attr "round_saeonly_name" "round_saeonly" "" "_round") +(define_subst_attr "round_saeonly_mask_operand2" "mask" "%R2" "%R4") +(define_subst_attr "round_saeonly_mask_operand3" "mask" "%R3" "%R5") +(define_subst_attr "round_saeonly_mask_scalar_operand3" "mask_scalar" "%R3" "%R5") +(define_subst_attr "round_saeonly_mask_scalar_operand4" "mask_scalar" "%R4" "%R6") +(define_subst_attr "round_saeonly_mask_scalar_merge_operand4" "mask_scalar_merge" "%R4" "%R5") +(define_subst_attr "round_saeonly_sd_mask_operand5" "sd" "%R5" "%R7") +(define_subst_attr "round_saeonly_op2" "round_saeonly" "" "%R2") +(define_subst_attr "round_saeonly_op3" "round_saeonly" "" "%R3") +(define_subst_attr "round_saeonly_op4" "round_saeonly" "" "%R4") +(define_subst_attr "round_saeonly_op5" "round_saeonly" "" "%R5") +(define_subst_attr "round_saeonly_op6" "round_saeonly" "" "%R6") +(define_subst_attr "round_saeonly_prefix" "round_saeonly" "vex" "evex") +(define_subst_attr "round_saeonly_mask_op2" "round_saeonly" "" "<round_saeonly_mask_operand2>") +(define_subst_attr "round_saeonly_mask_op3" "round_saeonly" "" "<round_saeonly_mask_operand3>") +(define_subst_attr "round_saeonly_mask_scalar_op3" "round_saeonly" "" "<round_saeonly_mask_scalar_operand3>") +(define_subst_attr "round_saeonly_mask_scalar_op4" "round_saeonly" "" "<round_saeonly_mask_scalar_operand4>") +(define_subst_attr "round_saeonly_mask_scalar_merge_op4" "round_saeonly" "" "<round_saeonly_mask_scalar_merge_operand4>") +(define_subst_attr "round_saeonly_sd_mask_op5" "round_saeonly" "" "<round_saeonly_sd_mask_operand5>") +(define_subst_attr "round_saeonly_constraint" "round_saeonly" "vm" "v") +(define_subst_attr "round_saeonly_constraint2" "round_saeonly" "m" "v") +(define_subst_attr "round_saeonly_nimm_predicate" "round_saeonly" "nonimmediate_operand" "register_operand") +(define_subst_attr "round_saeonly_mode512bit_condition" "round_saeonly" "1" "(<MODE>mode == V16SFmode || <MODE>mode == V8DFmode)") + +(define_subst "round_saeonly" + [(set (match_operand:SUBST_A 0) + (match_operand:SUBST_A 
1))] + "TARGET_AVX512F" + [(parallel[ + (set (match_dup 0) + (match_dup 1)) + (unspec [(match_operand:SI 2 "const_4_to_5_operand")] UNSPEC_EMBEDDED_ROUNDING)])]) + +(define_subst_attr "round_expand_name" "round_expand" "" "_round") +(define_subst_attr "round_expand_nimm_predicate" "round_expand" "nonimmediate_operand" "register_operand") +(define_subst_attr "round_expand_operand" "round_expand" "" ", operands[5]") + +(define_subst "round_expand" + [(match_operand:SUBST_V 0) + (match_operand:SUBST_V 1) + (match_operand:SUBST_V 2) + (match_operand:SUBST_V 3) + (match_operand:SUBST_S 4)] + "TARGET_AVX512F" + [(match_dup 0) + (match_dup 1) + (match_dup 2) + (match_dup 3) + (match_dup 4) + (unspec [(match_operand:SI 5 "const_0_to_4_operand")] UNSPEC_EMBEDDED_ROUNDING)]) + +(define_subst_attr "round_saeonly_expand_name" "round_saeonly_expand" "" "_round") +(define_subst_attr "round_saeonly_expand_nimm_predicate" "round_saeonly_expand" "nonimmediate_operand" "register_operand") +(define_subst_attr "round_saeonly_expand_operand6" "round_saeonly_expand" "" ", operands[6]") + +(define_subst "round_saeonly_expand" + [(match_operand:SUBST_V 0) + (match_operand:SUBST_V 1) + (match_operand:SUBST_V 2) + (match_operand:SUBST_A 3) + (match_operand:SI 4) + (match_operand:SUBST_S 5)] + "TARGET_AVX512F" + [(match_dup 0) + (match_dup 1) + (match_dup 2) + (match_dup 3) + (match_dup 4) + (match_dup 5) + (unspec [(match_operand:SI 6 "const_4_to_5_operand")] UNSPEC_EMBEDDED_ROUNDING)]) diff --git a/gcc/config/i386/sync.md b/gcc/config/i386/sync.md index 8408a2bfe43..4cd449ebf06 100644 --- a/gcc/config/i386/sync.md +++ b/gcc/config/i386/sync.md @@ -1,5 +1,5 @@ ;; GCC machine description for i386 synchronization instructions. -;; Copyright (C) 2005-2013 Free Software Foundation, Inc. +;; Copyright (C) 2005-2014 Free Software Foundation, Inc. ;; ;; This file is part of GCC. ;; diff --git a/gcc/config/i386/sysv4.h b/gcc/config/i386/sysv4.h index ff24575d32c..011b228cad9 100644 --- a/gcc/config/i386/sysv4.h +++ b/gcc/config/i386/sysv4.h @@ -1,5 +1,5 @@ /* Target definitions for GCC for Intel 80386 running System V.4 - Copyright (C) 1991-2013 Free Software Foundation, Inc. + Copyright (C) 1991-2014 Free Software Foundation, Inc. Written by Ron Guilmette (rfg@netcom.com). diff --git a/gcc/config/i386/t-cygming b/gcc/config/i386/t-cygming index ba076a7f49a..9544e49144d 100644 --- a/gcc/config/i386/t-cygming +++ b/gcc/config/i386/t-cygming @@ -1,4 +1,4 @@ -# Copyright (C) 2003-2013 Free Software Foundation, Inc. +# Copyright (C) 2003-2014 Free Software Foundation, Inc. # # This file is part of GCC. # diff --git a/gcc/config/i386/t-i386 b/gcc/config/i386/t-i386 index 1a76c4152f6..5168e6b401e 100644 --- a/gcc/config/i386/t-i386 +++ b/gcc/config/i386/t-i386 @@ -1,4 +1,4 @@ -# Copyright (C) 2008-2013 Free Software Foundation, Inc. +# Copyright (C) 2008-2014 Free Software Foundation, Inc. # # This file is part of GCC. # diff --git a/gcc/config/i386/t-interix b/gcc/config/i386/t-interix index 4d7b5987037..24f5243f583 100644 --- a/gcc/config/i386/t-interix +++ b/gcc/config/i386/t-interix @@ -1,4 +1,4 @@ -# Copyright (C) 2011-2013 Free Software Foundation, Inc. +# Copyright (C) 2011-2014 Free Software Foundation, Inc. # # This file is part of GCC. # diff --git a/gcc/config/i386/t-linux64 b/gcc/config/i386/t-linux64 index bcea0c68ad0..5ec8907a934 100644 --- a/gcc/config/i386/t-linux64 +++ b/gcc/config/i386/t-linux64 @@ -1,4 +1,4 @@ -# Copyright (C) 2002-2013 Free Software Foundation, Inc. 
+# Copyright (C) 2002-2014 Free Software Foundation, Inc. # # This file is part of GCC. # diff --git a/gcc/config/i386/t-rtems b/gcc/config/i386/t-rtems index fef4c22e9c1..e3934179ec4 100644 --- a/gcc/config/i386/t-rtems +++ b/gcc/config/i386/t-rtems @@ -1,4 +1,4 @@ -# Copyright (C) 1999-2013 Free Software Foundation, Inc. +# Copyright (C) 1999-2014 Free Software Foundation, Inc. # # This file is part of GCC. # diff --git a/gcc/config/i386/t-sol2-64 b/gcc/config/i386/t-sol2-64 index c456da777b5..4e70f0bed27 100644 --- a/gcc/config/i386/t-sol2-64 +++ b/gcc/config/i386/t-sol2-64 @@ -1,4 +1,4 @@ -# Copyright (C) 2004-2013 Free Software Foundation, Inc. +# Copyright (C) 2004-2014 Free Software Foundation, Inc. # # This file is part of GCC. # diff --git a/gcc/config/i386/tbmintrin.h b/gcc/config/i386/tbmintrin.h index 9235d6c713d..871f532803c 100644 --- a/gcc/config/i386/tbmintrin.h +++ b/gcc/config/i386/tbmintrin.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2010-2013 Free Software Foundation, Inc. +/* Copyright (C) 2010-2014 Free Software Foundation, Inc. This file is part of GCC. diff --git a/gcc/config/i386/tmmintrin.h b/gcc/config/i386/tmmintrin.h index 3f63b4f8934..89556d24b21 100644 --- a/gcc/config/i386/tmmintrin.h +++ b/gcc/config/i386/tmmintrin.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2006-2013 Free Software Foundation, Inc. +/* Copyright (C) 2006-2014 Free Software Foundation, Inc. This file is part of GCC. diff --git a/gcc/config/i386/unix.h b/gcc/config/i386/unix.h index 0eeee4d6a01..d4fdf9b4baa 100644 --- a/gcc/config/i386/unix.h +++ b/gcc/config/i386/unix.h @@ -1,5 +1,5 @@ /* Definitions for Unix assembler syntax for the Intel 80386. - Copyright (C) 1988-2013 Free Software Foundation, Inc. + Copyright (C) 1988-2014 Free Software Foundation, Inc. This file is part of GCC. diff --git a/gcc/config/i386/vx-common.h b/gcc/config/i386/vx-common.h index 9bf7b7ce619..136c2d9af37 100644 --- a/gcc/config/i386/vx-common.h +++ b/gcc/config/i386/vx-common.h @@ -1,5 +1,5 @@ /* IA32 VxWorks and VxWorks AE target definitions. - Copyright (C) 2007-2013 Free Software Foundation, Inc. + Copyright (C) 2007-2014 Free Software Foundation, Inc. This file is part of GCC. diff --git a/gcc/config/i386/vxworks.h b/gcc/config/i386/vxworks.h index 6fcd5efc841..49206e01550 100644 --- a/gcc/config/i386/vxworks.h +++ b/gcc/config/i386/vxworks.h @@ -1,5 +1,5 @@ /* IA32 VxWorks target definitions for GNU compiler. - Copyright (C) 2003-2013 Free Software Foundation, Inc. + Copyright (C) 2003-2014 Free Software Foundation, Inc. Updated by CodeSourcery, LLC. This file is part of GCC. diff --git a/gcc/config/i386/vxworksae.h b/gcc/config/i386/vxworksae.h index 820ca93247f..bb63c079c25 100644 --- a/gcc/config/i386/vxworksae.h +++ b/gcc/config/i386/vxworksae.h @@ -1,5 +1,5 @@ /* IA32 VxWorks AE target definitions for GNU compiler. - Copyright (C) 2005-2013 Free Software Foundation, Inc. + Copyright (C) 2005-2014 Free Software Foundation, Inc. Contributed by CodeSourcery, LLC. This file is part of GCC. diff --git a/gcc/config/i386/winnt-cxx.c b/gcc/config/i386/winnt-cxx.c index d466299abed..aa75f9157ed 100644 --- a/gcc/config/i386/winnt-cxx.c +++ b/gcc/config/i386/winnt-cxx.c @@ -1,6 +1,6 @@ /* Target support for C++ classes on Windows. Contributed by Danny Smith (dannysmith@users.sourceforge.net) - Copyright (C) 2005-2013 Free Software Foundation, Inc. + Copyright (C) 2005-2014 Free Software Foundation, Inc. This file is part of GCC. 
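
The round and round_saeonly define_substs added to subst.md above generate the *_round pattern variants used throughout the sse.md changes (vrndscale, vgetmant, vexp2, vrcp28, vrsqrt28, ...): each one tacks an UNSPEC_EMBEDDED_ROUNDING operand onto the pattern so a static rounding mode, or just suppress-all-exceptions, can be encoded per instruction in the EVEX prefix instead of going through MXCSR. A minimal consumer-side sketch; the _mm512_add_round_ps intrinsic and _MM_FROUND_* macros are assumed to be the standard AVX512F ones, not something defined in this diff. Compile with -mavx512f.

/* Sketch: per-instruction (embedded) rounding vs. the MXCSR default.  */
#include <immintrin.h>

__m512
add_toward_zero (__m512 a, __m512 b)
{
  /* Round toward zero for this one instruction only and suppress
     exceptions; MXCSR is not read or modified.  */
  return _mm512_add_round_ps (a, b,
			      _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
}

__m512
add_default (__m512 a, __m512 b)
{
  /* _MM_FROUND_CUR_DIRECTION keeps the MXCSR rounding mode, i.e. the
     plain (non-_round) form of the same operation.  */
  return _mm512_add_round_ps (a, b, _MM_FROUND_CUR_DIRECTION);
}
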
diff --git a/gcc/config/i386/winnt-stubs.c b/gcc/config/i386/winnt-stubs.c index 78322654fd5..30321d0f73e 100644 --- a/gcc/config/i386/winnt-stubs.c +++ b/gcc/config/i386/winnt-stubs.c @@ -1,6 +1,6 @@ /* Dummy subroutines for language-specific support on Windows. Contributed by Danny Smith (dannysmith@users.sourceforge.net) - Copyright (C) 2005-2013 Free Software Foundation, Inc. + Copyright (C) 2005-2014 Free Software Foundation, Inc. This file is part of GCC. diff --git a/gcc/config/i386/winnt.c b/gcc/config/i386/winnt.c index 55b38d7b4af..bcfd48a03dc 100644 --- a/gcc/config/i386/winnt.c +++ b/gcc/config/i386/winnt.c @@ -1,6 +1,6 @@ /* Subroutines for insn-output.c for Windows NT. Contributed by Douglas Rupp (drupp@cs.washington.edu) - Copyright (C) 1995-2013 Free Software Foundation, Inc. + Copyright (C) 1995-2014 Free Software Foundation, Inc. This file is part of GCC. diff --git a/gcc/config/i386/wmmintrin.h b/gcc/config/i386/wmmintrin.h index defcfd82acc..2002375c6cd 100644 --- a/gcc/config/i386/wmmintrin.h +++ b/gcc/config/i386/wmmintrin.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2008-2013 Free Software Foundation, Inc. +/* Copyright (C) 2008-2014 Free Software Foundation, Inc. This file is part of GCC. diff --git a/gcc/config/i386/x-mingw32 b/gcc/config/i386/x-mingw32 index 4d3ec48289d..333346018ce 100644 --- a/gcc/config/i386/x-mingw32 +++ b/gcc/config/i386/x-mingw32 @@ -1,4 +1,4 @@ -# Copyright (C) 2003-2013 Free Software Foundation, Inc. +# Copyright (C) 2003-2014 Free Software Foundation, Inc. # # This file is part of GCC. # diff --git a/gcc/config/i386/x86-64.h b/gcc/config/i386/x86-64.h index 0c62723ae22..16fc6858164 100644 --- a/gcc/config/i386/x86-64.h +++ b/gcc/config/i386/x86-64.h @@ -1,5 +1,5 @@ /* OS independent definitions for AMD x86-64. - Copyright (C) 2001-2013 Free Software Foundation, Inc. + Copyright (C) 2001-2014 Free Software Foundation, Inc. Contributed by Bo Thorsen <bo@suse.de>. This file is part of GCC. diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def index 4c13c3a0ec6..ec96a4b2617 100644 --- a/gcc/config/i386/x86-tune.def +++ b/gcc/config/i386/x86-tune.def @@ -1,5 +1,5 @@ /* Definitions of x86 tunable features. - Copyright (C) 2013 Free Software Foundation, Inc. + Copyright (C) 2013-2014 Free Software Foundation, Inc. This file is part of GCC. @@ -40,7 +40,7 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see /* X86_TUNE_SCHEDULE: Enable scheduling. */ DEF_TUNE (X86_TUNE_SCHEDULE, "schedule", - m_PENT | m_PPRO | m_CORE_ALL | m_ATOM | m_SLM | m_K6_GEODE + m_PENT | m_PPRO | m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC) /* X86_TUNE_PARTIAL_REG_DEPENDENCY: Enable more register renaming @@ -48,7 +48,7 @@ DEF_TUNE (X86_TUNE_SCHEDULE, "schedule", over partial stores. For example preffer MOVZBL or MOVQ to load 8bit value over movb. */ DEF_TUNE (X86_TUNE_PARTIAL_REG_DEPENDENCY, "partial_reg_dependency", - m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_AMD_MULTIPLE + m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_AMD_MULTIPLE | m_GENERIC) /* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: This knob promotes all store @@ -58,7 +58,7 @@ DEF_TUNE (X86_TUNE_PARTIAL_REG_DEPENDENCY, "partial_reg_dependency", SPECfp regression, while enabling it on K8 brings roughly 2.4% regression that can be partly masked by careful scheduling of moves. 
*/ DEF_TUNE (X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY, "sse_partial_reg_dependency", - m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_AMDFAM10 + m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_AMDFAM10 | m_BDVER | m_GENERIC) /* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies @@ -84,13 +84,13 @@ DEF_TUNE (X86_TUNE_PARTIAL_FLAG_REG_STALL, "partial_flag_reg_stall", /* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid partial dependencies. */ DEF_TUNE (X86_TUNE_MOVX, "movx", - m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_GEODE + m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_GEODE | m_AMD_MULTIPLE | m_GENERIC) /* X86_TUNE_MEMORY_MISMATCH_STALL: Avoid partial stores that are followed by full sized loads. */ DEF_TUNE (X86_TUNE_MEMORY_MISMATCH_STALL, "memory_mismatch_stall", - m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_AMD_MULTIPLE | m_GENERIC) + m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_AMD_MULTIPLE | m_GENERIC) /* X86_TUNE_FUSE_CMP_AND_BRANCH_32: Fuse compare with a subsequent conditional jump instruction for 32 bit TARGET. @@ -102,29 +102,29 @@ DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_32, "fuse_cmp_and_branch_32", conditional jump instruction for TARGET_64BIT. FIXME: revisit for generic. */ DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_64, "fuse_cmp_and_branch_64", - m_COREI7 | m_COREI7_AVX | m_HASWELL | m_BDVER) + m_NEHALEM | m_SANDYBRIDGE | m_HASWELL | m_BDVER) /* X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS: Fuse compare with a subsequent conditional jump instruction when the condition jump check sign flag (SF) or overflow flag (OF). */ DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS, "fuse_cmp_and_branch_soflags", - m_COREI7 | m_COREI7_AVX | m_HASWELL | m_BDVER) + m_NEHALEM | m_SANDYBRIDGE | m_HASWELL | m_BDVER) /* X86_TUNE_FUSE_ALU_AND_BRANCH: Fuse alu with a subsequent conditional jump instruction when the alu instruction produces the CCFLAG consumed by the conditional jump instruction. */ DEF_TUNE (X86_TUNE_FUSE_ALU_AND_BRANCH, "fuse_alu_and_branch", - m_COREI7_AVX | m_HASWELL) + m_SANDYBRIDGE | m_HASWELL) /* X86_TUNE_REASSOC_INT_TO_PARALLEL: Try to produce parallel computations during reassociation of integer computation. */ DEF_TUNE (X86_TUNE_REASSOC_INT_TO_PARALLEL, "reassoc_int_to_parallel", - m_ATOM) + m_BONNELL) /* X86_TUNE_REASSOC_FP_TO_PARALLEL: Try to produce parallel computations during reassociation of fp computation. */ DEF_TUNE (X86_TUNE_REASSOC_FP_TO_PARALLEL, "reassoc_fp_to_parallel", - m_ATOM | m_SLM | m_HASWELL | m_BDVER1 | m_BDVER2 | m_GENERIC) + m_BONNELL | m_SILVERMONT | m_HASWELL | m_BDVER1 | m_BDVER2 | m_GENERIC) /*****************************************************************************/ /* Function prologue, epilogue and function calling sequences. */ @@ -142,33 +142,33 @@ DEF_TUNE (X86_TUNE_REASSOC_FP_TO_PARALLEL, "reassoc_fp_to_parallel", Bobcat and Generic. This is because disabling it causes large regression on mgrid due to IRA limitation leading to unecessary use of the frame pointer in 32bit mode. */ -DEF_TUNE (X86_TUNE_ACCUMULATE_OUTGOING_ARGS, "accumulate_outgoing_args", - m_PPRO | m_P4_NOCONA | m_ATOM | m_SLM | m_AMD_MULTIPLE | m_GENERIC) +DEF_TUNE (X86_TUNE_ACCUMULATE_OUTGOING_ARGS, "accumulate_outgoing_args", + m_PPRO | m_P4_NOCONA | m_BONNELL | m_SILVERMONT | m_AMD_MULTIPLE | m_GENERIC) /* X86_TUNE_PROLOGUE_USING_MOVE: Do not use push/pop in prologues that are considered on critical path. 
*/ -DEF_TUNE (X86_TUNE_PROLOGUE_USING_MOVE, "prologue_using_move", +DEF_TUNE (X86_TUNE_PROLOGUE_USING_MOVE, "prologue_using_move", m_PPRO | m_ATHLON_K8) /* X86_TUNE_PROLOGUE_USING_MOVE: Do not use push/pop in epilogues that are considered on critical path. */ DEF_TUNE (X86_TUNE_EPILOGUE_USING_MOVE, "epilogue_using_move", - m_PPRO | m_ATHLON_K8) + m_PPRO | m_ATHLON_K8) /* X86_TUNE_USE_LEAVE: Use "leave" instruction in epilogues where it fits. */ -DEF_TUNE (X86_TUNE_USE_LEAVE, "use_leave", +DEF_TUNE (X86_TUNE_USE_LEAVE, "use_leave", m_386 | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC) /* X86_TUNE_PUSH_MEMORY: Enable generation of "push mem" instructions. Some chips, like 486 and Pentium works faster with separate load and push instructions. */ -DEF_TUNE (X86_TUNE_PUSH_MEMORY, "push_memory", - m_386 | m_P4_NOCONA | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE +DEF_TUNE (X86_TUNE_PUSH_MEMORY, "push_memory", + m_386 | m_P4_NOCONA | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC) /* X86_TUNE_SINGLE_PUSH: Enable if single push insn is preferred over esp subtraction. */ -DEF_TUNE (X86_TUNE_SINGLE_PUSH, "single_push", m_386 | m_486 | m_PENT +DEF_TUNE (X86_TUNE_SINGLE_PUSH, "single_push", m_386 | m_486 | m_PENT | m_K6_GEODE) /* X86_TUNE_DOUBLE_PUSH. Enable if double push insn is preferred @@ -189,7 +189,7 @@ DEF_TUNE (X86_TUNE_DOUBLE_POP, "double_pop", m_PENT) /* X86_TUNE_PAD_SHORT_FUNCTION: Make every function to be at least 4 instructions long. */ -DEF_TUNE (X86_TUNE_PAD_SHORT_FUNCTION, "pad_short_function", m_ATOM) +DEF_TUNE (X86_TUNE_PAD_SHORT_FUNCTION, "pad_short_function", m_BONNELL) /* X86_TUNE_PAD_RETURNS: Place NOP before every RET that is a destination of conditional jump or directly preceded by other jump instruction. @@ -202,7 +202,7 @@ DEF_TUNE (X86_TUNE_PAD_RETURNS, "pad_returns", /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more than 4 branch instructions in the 16 byte window. */ DEF_TUNE (X86_TUNE_FOUR_JUMP_LIMIT, "four_jump_limit", - m_PPRO | m_P4_NOCONA | m_ATOM | m_SLM | m_ATHLON_K8 | m_AMDFAM10) + m_PPRO | m_P4_NOCONA | m_BONNELL | m_SILVERMONT | m_ATHLON_K8 | m_AMDFAM10) /*****************************************************************************/ /* Integer instruction selection tuning */ @@ -224,26 +224,26 @@ DEF_TUNE (X86_TUNE_READ_MODIFY, "read_modify", ~(m_PENT | m_PPRO)) /* X86_TUNE_USE_INCDEC: Enable use of inc/dec instructions. */ DEF_TUNE (X86_TUNE_USE_INCDEC, "use_incdec", - ~(m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_GENERIC)) + ~(m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_GENERIC)) /* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred for DFmode copies */ DEF_TUNE (X86_TUNE_INTEGER_DFMODE_MOVES, "integer_dfmode_moves", - ~(m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM + ~(m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_GEODE | m_AMD_MULTIPLE | m_GENERIC)) /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag will impact LEA instruction selection. */ -DEF_TUNE (X86_TUNE_OPT_AGU, "opt_agu", m_ATOM | m_SLM) +DEF_TUNE (X86_TUNE_OPT_AGU, "opt_agu", m_BONNELL | m_SILVERMONT) /* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is - vector path on AMD machines. + vector path on AMD machines. FIXME: Do we need to enable this for core? */ DEF_TUNE (X86_TUNE_SLOW_IMUL_IMM32_MEM, "slow_imul_imm32_mem", m_K8 | m_AMDFAM10) /* X86_TUNE_SLOW_IMUL_IMM8: Imul of 8-bit constant is vector path on AMD - machines. + machines. 
FIXME: Do we need to enable this for core? */ DEF_TUNE (X86_TUNE_SLOW_IMUL_IMM8, "slow_imul_imm8", m_K8 | m_AMDFAM10) @@ -251,7 +251,7 @@ DEF_TUNE (X86_TUNE_SLOW_IMUL_IMM8, "slow_imul_imm8", /* X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE: Try to avoid memory operands for a conditional move. */ DEF_TUNE (X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE, "avoid_mem_opnd_for_cmove", - m_ATOM | m_SLM) + m_BONNELL | m_SILVERMONT) /* X86_TUNE_SINGLE_STRINGOP: Enable use of single string operations, such as MOVS and STOS (without a REP prefix) to move/set sequences of bytes. */ @@ -268,15 +268,15 @@ DEF_TUNE (X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES, /* X86_TUNE_USE_SAHF: Controls use of SAHF. */ DEF_TUNE (X86_TUNE_USE_SAHF, "use_sahf", - m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_K6_GEODE + m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC) /* X86_TUNE_USE_CLTD: Controls use of CLTD and CTQO instructions. */ -DEF_TUNE (X86_TUNE_USE_CLTD, "use_cltd", ~(m_PENT | m_ATOM | m_SLM | m_K6)) +DEF_TUNE (X86_TUNE_USE_CLTD, "use_cltd", ~(m_PENT | m_BONNELL | m_SILVERMONT | m_K6)) /* X86_TUNE_USE_BT: Enable use of BT (bit test) instructions. */ DEF_TUNE (X86_TUNE_USE_BT, "use_bt", - m_CORE_ALL | m_ATOM | m_SLM | m_AMD_MULTIPLE | m_GENERIC) + m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_AMD_MULTIPLE | m_GENERIC) /*****************************************************************************/ /* 387 instruction selection tuning */ @@ -285,21 +285,21 @@ DEF_TUNE (X86_TUNE_USE_BT, "use_bt", /* X86_TUNE_USE_HIMODE_FIOP: Enables use of x87 instructions with 16bit integer operand. FIXME: Why this is disabled for modern chips? */ -DEF_TUNE (X86_TUNE_USE_HIMODE_FIOP, "use_himode_fiop", +DEF_TUNE (X86_TUNE_USE_HIMODE_FIOP, "use_himode_fiop", m_386 | m_486 | m_K6_GEODE) /* X86_TUNE_USE_SIMODE_FIOP: Enables use of x87 instructions with 32bit integer operand. */ DEF_TUNE (X86_TUNE_USE_SIMODE_FIOP, "use_simode_fiop", - ~(m_PENT | m_PPRO | m_CORE_ALL | m_ATOM - | m_SLM | m_AMD_MULTIPLE | m_GENERIC)) + ~(m_PENT | m_PPRO | m_CORE_ALL | m_BONNELL + | m_SILVERMONT | m_AMD_MULTIPLE | m_GENERIC)) /* X86_TUNE_USE_FFREEP: Use freep instruction instead of fstp. */ DEF_TUNE (X86_TUNE_USE_FFREEP, "use_ffreep", m_AMD_MULTIPLE) /* X86_TUNE_EXT_80387_CONSTANTS: Use fancy 80387 constants, such as PI. */ DEF_TUNE (X86_TUNE_EXT_80387_CONSTANTS, "ext_80387_constants", - m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_K6_GEODE + m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_K6_GEODE | m_ATHLON_K8 | m_GENERIC) /*****************************************************************************/ @@ -308,7 +308,7 @@ DEF_TUNE (X86_TUNE_EXT_80387_CONSTANTS, "ext_80387_constants", /* X86_TUNE_VECTORIZE_DOUBLE: Enable double precision vector instructions. */ -DEF_TUNE (X86_TUNE_VECTORIZE_DOUBLE, "vectorize_double", ~m_ATOM) +DEF_TUNE (X86_TUNE_VECTORIZE_DOUBLE, "vectorize_double", ~m_BONNELL) /* X86_TUNE_GENERAL_REGS_SSE_SPILL: Try to spill general regs to SSE regs instead of memory. */ @@ -318,12 +318,12 @@ DEF_TUNE (X86_TUNE_GENERAL_REGS_SSE_SPILL, "general_regs_sse_spill", /* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL: Use movups for misaligned loads instead of a sequence loading registers by parts. 
*/ DEF_TUNE (X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL, "sse_unaligned_load_optimal", - m_COREI7 | m_COREI7_AVX | m_HASWELL | m_AMDFAM10 | m_BDVER | m_BTVER | m_SLM | m_GENERIC) + m_NEHALEM | m_SANDYBRIDGE | m_HASWELL | m_AMDFAM10 | m_BDVER | m_BTVER | m_SILVERMONT | m_GENERIC) /* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL: Use movups for misaligned stores instead of a sequence loading registers by parts. */ DEF_TUNE (X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL, "sse_unaligned_store_optimal", - m_COREI7 | m_COREI7_AVX | m_HASWELL | m_BDVER | m_SLM | m_GENERIC) + m_NEHALEM | m_SANDYBRIDGE | m_HASWELL | m_BDVER | m_SILVERMONT | m_GENERIC) /* Use packed single precision instructions where posisble. I.e. movups instead of movupd. */ @@ -360,7 +360,7 @@ DEF_TUNE (X86_TUNE_INTER_UNIT_CONVERSIONS, "inter_unit_conversions", /* X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS: Try to split memory operand for fp converts to destination register. */ DEF_TUNE (X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS, "split_mem_opnd_for_fp_converts", - m_SLM) + m_SILVERMONT) /* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion from FP to FP. This form of instructions avoids partial write to the @@ -378,13 +378,13 @@ DEF_TUNE (X86_TUNE_USE_VECTOR_CONVERTS, "use_vector_converts", m_AMDFAM10) /* X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL: if false, unaligned loads are split. */ -DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL, "256_unaligned_load_optimal", - ~(m_COREI7 | m_COREI7_AVX | m_GENERIC)) +DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL, "256_unaligned_load_optimal", + ~(m_NEHALEM | m_SANDYBRIDGE | m_GENERIC)) /* X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL: if false, unaligned stores are split. */ -DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL, "256_unaligned_store_optimal", - ~(m_COREI7 | m_COREI7_AVX | m_BDVER | m_GENERIC)) +DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL, "256_unaligned_store_optimal", + ~(m_NEHALEM | m_SANDYBRIDGE | m_BDVER | m_GENERIC)) /* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for the auto-vectorizer. */ @@ -401,7 +401,7 @@ DEF_TUNE (X86_TUNE_DOUBLE_WITH_ADD, "double_with_add", ~m_386) /* X86_TUNE_ALWAYS_FANCY_MATH_387: controls use of fancy 387 operations, such as fsqrt, fprem, fsin, fcos, fsincos etc. Should be enabled for all targets that always has coprocesor. */ -DEF_TUNE (X86_TUNE_ALWAYS_FANCY_MATH_387, "always_fancy_math_387", +DEF_TUNE (X86_TUNE_ALWAYS_FANCY_MATH_387, "always_fancy_math_387", ~(m_386 | m_486)) /* X86_TUNE_UNROLL_STRLEN: Produce (quite lame) unrolled sequence for @@ -503,3 +503,9 @@ DEF_TUNE (X86_TUNE_QIMODE_MATH, "qimode_math", ~0) arithmetic to 32bit via PROMOTE_MODE macro. This code generation scheme is usually used for RISC targets. */ DEF_TUNE (X86_TUNE_PROMOTE_QI_REGS, "promote_qi_regs", 0) + +/* X86_TUNE_ADJUST_UNROLL: This enables adjusting the unroll factor based + on hardware capabilities. Bdver3 hardware has a loop buffer which makes + unrolling small loop less important. For, such architectures we adjust + the unroll factor so that the unrolled loop fits the loop buffer. */ +DEF_TUNE (X86_TUNE_ADJUST_UNROLL, "adjust_unroll_factor", m_BDVER3 | m_BDVER4) diff --git a/gcc/config/i386/x86intrin.h b/gcc/config/i386/x86intrin.h index 46ced969a9f..80e9e6f33de 100644 --- a/gcc/config/i386/x86intrin.h +++ b/gcc/config/i386/x86intrin.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2008-2013 Free Software Foundation, Inc. +/* Copyright (C) 2008-2014 Free Software Foundation, Inc. This file is part of GCC. 
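
The x86-tune.def hunks above rename the tuning masks from product names to microarchitecture names (m_ATOM, m_SLM, m_COREI7 and m_COREI7_AVX become m_BONNELL, m_SILVERMONT, m_NEHALEM and m_SANDYBRIDGE) and add X86_TUNE_ADJUST_UNROLL so the unroll factor can be capped to the bdver3/bdver4 loop buffer. These masks are chosen by -mtune, or per function through the target attribute; the sketch below assumes this GCC already accepts the corresponding cpu names (bonnell, silvermont, nehalem, sandybridge) there, which is not shown in this diff.

/* Sketch: selecting the renamed tuning masks from user code.
   Build the file with e.g. -mtune=nehalem, or override per function.  */
__attribute__ ((target ("tune=silvermont")))
int
sum_silvermont (const int *p, int n)
{
  int i, s = 0;

  /* Code generation for this function follows the m_SILVERMONT rows
     above (X86_TUNE_OPT_AGU, the unaligned load/store knobs, ...).  */
  for (i = 0; i < n; i++)
    s += p[i];
  return s;
}
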
diff --git a/gcc/config/i386/xm-cygwin.h b/gcc/config/i386/xm-cygwin.h index 259fe18e89e..d66a46df51c 100644 --- a/gcc/config/i386/xm-cygwin.h +++ b/gcc/config/i386/xm-cygwin.h @@ -1,6 +1,6 @@ /* Configuration for GCC for hosting on Windows NT. using a unix style C library. - Copyright (C) 1995-2013 Free Software Foundation, Inc. + Copyright (C) 1995-2014 Free Software Foundation, Inc. This file is part of GCC. diff --git a/gcc/config/i386/xm-djgpp.h b/gcc/config/i386/xm-djgpp.h index 84470510b0c..2f7989c164f 100644 --- a/gcc/config/i386/xm-djgpp.h +++ b/gcc/config/i386/xm-djgpp.h @@ -1,5 +1,5 @@ /* Configuration for GCC for Intel 80386 running DJGPP. - Copyright (C) 1988-2013 Free Software Foundation, Inc. + Copyright (C) 1988-2014 Free Software Foundation, Inc. This file is part of GCC. diff --git a/gcc/config/i386/xm-mingw32.h b/gcc/config/i386/xm-mingw32.h index 3d90dec7749..b6d87a42ae8 100644 --- a/gcc/config/i386/xm-mingw32.h +++ b/gcc/config/i386/xm-mingw32.h @@ -1,6 +1,6 @@ /* Configuration for GCC for hosting on Windows32. using GNU tools and the Windows32 API Library. - Copyright (C) 1997-2013 Free Software Foundation, Inc. + Copyright (C) 1997-2014 Free Software Foundation, Inc. This file is part of GCC. diff --git a/gcc/config/i386/xmmintrin.h b/gcc/config/i386/xmmintrin.h index 14d1e7fe2b0..0511dcfc532 100644 --- a/gcc/config/i386/xmmintrin.h +++ b/gcc/config/i386/xmmintrin.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2002-2013 Free Software Foundation, Inc. +/* Copyright (C) 2002-2014 Free Software Foundation, Inc. This file is part of GCC. diff --git a/gcc/config/i386/xopintrin.h b/gcc/config/i386/xopintrin.h index 49cea8ec645..cc82bc5fa24 100644 --- a/gcc/config/i386/xopintrin.h +++ b/gcc/config/i386/xopintrin.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2013 Free Software Foundation, Inc. +/* Copyright (C) 2007-2014 Free Software Foundation, Inc. This file is part of GCC. diff --git a/gcc/config/i386/xsaveintrin.h b/gcc/config/i386/xsaveintrin.h index 31c17b1d2c5..47be25f0c91 100644 --- a/gcc/config/i386/xsaveintrin.h +++ b/gcc/config/i386/xsaveintrin.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2012-2013 Free Software Foundation, Inc. +/* Copyright (C) 2012-2014 Free Software Foundation, Inc. This file is part of GCC. diff --git a/gcc/config/i386/xsaveoptintrin.h b/gcc/config/i386/xsaveoptintrin.h index aa9538da33e..d7534b41c15 100644 --- a/gcc/config/i386/xsaveoptintrin.h +++ b/gcc/config/i386/xsaveoptintrin.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2012-2013 Free Software Foundation, Inc. +/* Copyright (C) 2012-2014 Free Software Foundation, Inc. This file is part of GCC. diff --git a/gcc/config/i386/xtestintrin.h b/gcc/config/i386/xtestintrin.h index a6afa896b4f..ba79e5c5ee6 100644 --- a/gcc/config/i386/xtestintrin.h +++ b/gcc/config/i386/xtestintrin.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2012-2013 Free Software Foundation, Inc. +/* Copyright (C) 2012-2014 Free Software Foundation, Inc. This file is part of GCC. |
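
Further up in the sse.md hunk, the vpcmpeq/vpcmpgt/vptestm/vptestnm patterns gained a <mask_scalar_merge_name> form: the mask_scalar_merge subst ANDs the compare result with an incoming mask register, so a predicated compare needs no separate kand. At the C level this corresponds to the _mm512_mask_*_mask intrinsics; the names below are assumed to be the standard AVX512F ones rather than anything introduced by this diff. Compile with -mavx512f.

/* Sketch: masked compare-into-mask, the intrinsic-level counterpart
   of the <mask_scalar_merge_name> patterns.  */
#include <immintrin.h>

__mmask16
eq_under_mask (__mmask16 allowed, __m512i a, __m512i b)
{
  /* Lanes cleared in 'allowed' come back as 0: the vpcmpeqd result is
     ANDed with the incoming mask, exactly what mask_scalar_merge adds
     to the avx512f_eq<mode>3 pattern.  */
  return _mm512_mask_cmpeq_epi32_mask (allowed, a, b);
}

__mmask16
nonzero_under_mask (__mmask16 allowed, __m512i v)
{
  /* vptestmd sets bit i when v & v is nonzero in lane i, again merged
     with 'allowed'.  */
  return _mm512_mask_test_epi32_mask (allowed, v, v);
}
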