Diffstat (limited to 'gcc/config/i386')
-rw-r--r--  gcc/config/i386/i386-c.c      |    5
-rw-r--r--  gcc/config/i386/i386-protos.h |    4
-rw-r--r--  gcc/config/i386/i386.c        | 1024
-rw-r--r--  gcc/config/i386/i386.h        |   71
-rw-r--r--  gcc/config/i386/i386.md       |   64
-rw-r--r--  gcc/config/i386/i386.opt      |    6
-rw-r--r--  gcc/config/i386/predicates.md |   16
-rw-r--r--  gcc/config/i386/sse.md        | 2233
-rw-r--r--  gcc/config/i386/t-i386        |   19
-rw-r--r--  gcc/config/i386/winnt.c       |    4
-rw-r--r--  gcc/config/i386/x-darwin      |    7
-rw-r--r--  gcc/config/i386/x-i386        |    7
-rw-r--r--  gcc/config/i386/x86-tune.def  |  279

13 files changed, 2863 insertions(+), 876 deletions(-)
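The hunks below are dominated by one mechanical refactoring: the option-override and target-attribute helpers (ix86_option_override_internal, ix86_function_specific_save/restore, ix86_valid_target_attribute_tree and friends) stop reading and writing global_options implicitly and instead take explicit struct gcc_options *opts / *opts_set parameters, with the command-line entry point forwarding the globals via ix86_option_override_internal (true, &global_options, &global_options_set). That lets __attribute__((target(...))) and #pragma GCC target validate options into a scratch gcc_options (see the rewritten ix86_valid_target_attribute_p, which now builds a local func_options instead of saving and restoring the globals). The sketch below illustrates the pattern only; it is not GCC code, the *_sketch names are invented here, and the two fields stand in for the hundreds of generated x_* fields in the real struct.

#include <stdbool.h>

/* Trimmed-down stand-in for GCC's generated "struct gcc_options":
   one x_* field per option.  A parallel instance ("opts_set") records
   which fields the user set explicitly on the command line.  */
struct gcc_options_sketch
{
  bool x_flag_pic;
  unsigned long long x_ix86_isa_flags;
};

/* Pre-patch shape (conceptual): the override logic was hard-wired to
   these globals.  */
static struct gcc_options_sketch global_options_sketch;
static struct gcc_options_sketch global_options_set_sketch;

/* Post-patch shape: the caller chooses which option set is rewritten,
   so attribute processing can work on a per-function copy.  */
static void
option_override_sketch (struct gcc_options_sketch *opts,
                        const struct gcc_options_sketch *opts_set)
{
  /* Mirrors the diff's rewrites of "flag" to "opts->x_flag" and of
     "global_options_set.x_..." to "opts_set->x_...": apply a default
     only when the user did not set the option explicitly.  */
  if (!opts_set->x_ix86_isa_flags && opts->x_flag_pic)
    opts->x_ix86_isa_flags |= 1ULL << 0;        /* some ISA bit */
}

int
main (void)
{
  /* Command-line path: forward the globals, as the real patch does.  */
  option_override_sketch (&global_options_sketch,
                          &global_options_set_sketch);

  /* attribute((target)) path: validate into a scratch copy without
     clobbering the globals.  */
  struct gcc_options_sketch func_opts = global_options_sketch;
  struct gcc_options_sketch func_opts_set = { false, 0 };
  func_opts.x_flag_pic = true;                  /* pretend -fpic */
  option_override_sketch (&func_opts, &func_opts_set);
  return (int) (func_opts.x_ix86_isa_flags & 1);
}

Beyond that threading, the functional changes visible in these hunks are: the ad-hoc x86_accumulate_outgoing_args, x86_arch_always_fancy_math_387, and AVX-256 split-load/store tuning masks are deleted in favor of the ix86_tune_features[] table generated from the new x86-tune.def; REG_BR_PROB notes switch from a GEN_INT operand to a plain integer stored with add_int_reg_note and read back with XINT; a new ix86_copy_addr_to_reg truncates DImode addresses to SImode registers for x32 string operations; ix86_issue_rate reclassifies Core 2/Core i7/Haswell as 4-wide and Pentium 4/Nocona as 2-wide; and ix86_expand_vector_move_misalign learns 64-byte AVX-512 unaligned loads and stores.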
diff --git a/gcc/config/i386/i386-c.c b/gcc/config/i386/i386-c.c index 2e764e79987..8a41fb0ddf7 100644 --- a/gcc/config/i386/i386-c.c +++ b/gcc/config/i386/i386-c.c @@ -368,7 +368,7 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag, static bool ix86_pragma_target_parse (tree args, tree pop_target) { - tree prev_tree = build_target_option_node (); + tree prev_tree = build_target_option_node (&global_options); tree cur_tree; struct cl_target_option *prev_opt; struct cl_target_option *cur_opt; @@ -388,7 +388,8 @@ ix86_pragma_target_parse (tree args, tree pop_target) } else { - cur_tree = ix86_valid_target_attribute_tree (args); + cur_tree = ix86_valid_target_attribute_tree (args, &global_options, + &global_options_set); if (!cur_tree || cur_tree == error_mark_node) { cl_target_option_restore (&global_options, diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h index 3ab2f3a2ac8..df388765497 100644 --- a/gcc/config/i386/i386-protos.h +++ b/gcc/config/i386/i386-protos.h @@ -220,7 +220,9 @@ extern int ix86_constant_alignment (tree, int); extern tree ix86_handle_shared_attribute (tree *, tree, tree, int, bool *); extern tree ix86_handle_selectany_attribute (tree *, tree, tree, int, bool *); extern int x86_field_alignment (tree, int); -extern tree ix86_valid_target_attribute_tree (tree); +extern tree ix86_valid_target_attribute_tree (tree, + struct gcc_options *, + struct gcc_options *); extern unsigned int ix86_get_callcvt (const_tree); #endif diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 8e6feb939a5..4b69d4bf026 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -61,7 +61,6 @@ along with GCC; see the file COPYING3. If not see #include "diagnostic.h" #include "dumpfile.h" #include "tree-pass.h" -#include "tree-flow.h" #include "wide-int.h" #include "context.h" #include "pass_manager.h" @@ -1899,18 +1898,6 @@ static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = { ~m_386, }; -static const unsigned int x86_accumulate_outgoing_args - = m_PPRO | m_P4_NOCONA | m_ATOM | m_SLM | m_CORE_ALL | m_AMD_MULTIPLE | m_GENERIC; - -static const unsigned int x86_arch_always_fancy_math_387 - = m_PENT | m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_AMD_MULTIPLE | m_GENERIC; - -static const unsigned int x86_avx256_split_unaligned_load - = m_COREI7 | m_GENERIC; - -static const unsigned int x86_avx256_split_unaligned_store - = m_COREI7 | m_BDVER | m_GENERIC; - /* In case the average insn count for single function invocation is lower than this constant, emit fast (but longer) prologue and epilogue code. 
*/ @@ -2249,12 +2236,16 @@ enum ix86_function_specific_strings static char *ix86_target_string (HOST_WIDE_INT, int, const char *, const char *, enum fpmath_unit, bool); -static void ix86_function_specific_save (struct cl_target_option *); -static void ix86_function_specific_restore (struct cl_target_option *); +static void ix86_function_specific_save (struct cl_target_option *, + struct gcc_options *opts); +static void ix86_function_specific_restore (struct gcc_options *opts, + struct cl_target_option *); static void ix86_function_specific_print (FILE *, int, struct cl_target_option *); static bool ix86_valid_target_attribute_p (tree, tree, tree, int); static bool ix86_valid_target_attribute_inner_p (tree, char *[], + struct gcc_options *, + struct gcc_options *, struct gcc_options *); static bool ix86_can_inline_p (tree, tree); static void ix86_set_current_function (tree); @@ -2918,11 +2909,13 @@ set_ix86_tune_features (enum processor_type ix86_tune, bool dump) attributes. */ static void -ix86_option_override_internal (bool main_args_p) +ix86_option_override_internal (bool main_args_p, + struct gcc_options *opts, + struct gcc_options *opts_set) { int i; - unsigned int ix86_arch_mask, ix86_tune_mask; - const bool ix86_tune_specified = (ix86_tune_string != NULL); + unsigned int ix86_arch_mask; + const bool ix86_tune_specified = (opts->x_ix86_tune_string != NULL); const char *prefix; const char *suffix; const char *sw; @@ -3099,7 +3092,7 @@ ix86_option_override_internal (bool main_args_p) {"bdver3", PROCESSOR_BDVER3, CPU_BDVER3, PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1 - | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX + | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE}, @@ -3154,8 +3147,8 @@ ix86_option_override_internal (bool main_args_p) /* Turn off both OPTION_MASK_ABI_64 and OPTION_MASK_ABI_X32 if TARGET_64BIT_DEFAULT is true and TARGET_64BIT is false. */ - if (TARGET_64BIT_DEFAULT && !TARGET_64BIT) - ix86_isa_flags &= ~(OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32); + if (TARGET_64BIT_DEFAULT && !TARGET_64BIT_P (opts->x_ix86_isa_flags)) + opts->x_ix86_isa_flags &= ~(OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32); #ifdef TARGET_BI_ARCH else { @@ -3164,32 +3157,32 @@ ix86_option_override_internal (bool main_args_p) is on and OPTION_MASK_ABI_X32 is off. We turn off OPTION_MASK_ABI_64 if OPTION_MASK_ABI_X32 is turned on by -mx32. */ - if (TARGET_X32) - ix86_isa_flags &= ~OPTION_MASK_ABI_64; + if (TARGET_X32_P (opts->x_ix86_isa_flags)) + opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64; #else /* When TARGET_BI_ARCH == 2, by default, OPTION_MASK_ABI_X32 is on and OPTION_MASK_ABI_64 is off. We turn off OPTION_MASK_ABI_X32 if OPTION_MASK_ABI_64 is turned on by -m64. */ - if (TARGET_LP64) - ix86_isa_flags &= ~OPTION_MASK_ABI_X32; + if (TARGET_LP64_P (opts->x_ix86_isa_flags)) + opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32; #endif } #endif - if (TARGET_X32) + if (TARGET_X32_P (opts->x_ix86_isa_flags)) { /* Always turn on OPTION_MASK_ISA_64BIT and turn off OPTION_MASK_ABI_64 for TARGET_X32. 
*/ - ix86_isa_flags |= OPTION_MASK_ISA_64BIT; - ix86_isa_flags &= ~OPTION_MASK_ABI_64; + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT; + opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64; } - else if (TARGET_LP64) + else if (TARGET_LP64_P (opts->x_ix86_isa_flags)) { /* Always turn on OPTION_MASK_ISA_64BIT and turn off OPTION_MASK_ABI_X32 for TARGET_LP64. */ - ix86_isa_flags |= OPTION_MASK_ISA_64BIT; - ix86_isa_flags &= ~OPTION_MASK_ABI_X32; + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT; + opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32; } #ifdef SUBTARGET_OVERRIDE_OPTIONS @@ -3201,138 +3194,144 @@ ix86_option_override_internal (bool main_args_p) #endif /* -fPIC is the default for x86_64. */ - if (TARGET_MACHO && TARGET_64BIT) - flag_pic = 2; + if (TARGET_MACHO && TARGET_64BIT_P (opts->x_ix86_isa_flags)) + opts->x_flag_pic = 2; /* Need to check -mtune=generic first. */ - if (ix86_tune_string) + if (opts->x_ix86_tune_string) { - if (!strcmp (ix86_tune_string, "generic") - || !strcmp (ix86_tune_string, "i686") + if (!strcmp (opts->x_ix86_tune_string, "generic") + || !strcmp (opts->x_ix86_tune_string, "i686") /* As special support for cross compilers we read -mtune=native as -mtune=generic. With native compilers we won't see the -mtune=native, as it was changed by the driver. */ - || !strcmp (ix86_tune_string, "native")) + || !strcmp (opts->x_ix86_tune_string, "native")) { - ix86_tune_string = "generic"; + opts->x_ix86_tune_string = "generic"; } /* If this call is for setting the option attribute, allow the generic that was previously set. */ else if (!main_args_p - && !strcmp (ix86_tune_string, "generic")) + && !strcmp (opts->x_ix86_tune_string, "generic")) ; - else if (!strncmp (ix86_tune_string, "generic", 7)) + else if (!strncmp (opts->x_ix86_tune_string, "generic", 7)) error ("bad value (%s) for %stune=%s %s", - ix86_tune_string, prefix, suffix, sw); - else if (!strcmp (ix86_tune_string, "x86-64")) + opts->x_ix86_tune_string, prefix, suffix, sw); + else if (!strcmp (opts->x_ix86_tune_string, "x86-64")) warning (OPT_Wdeprecated, "%stune=x86-64%s is deprecated; use " "%stune=k8%s or %stune=generic%s instead as appropriate", prefix, suffix, prefix, suffix, prefix, suffix); } else { - if (ix86_arch_string) - ix86_tune_string = ix86_arch_string; - if (!ix86_tune_string) + if (opts->x_ix86_arch_string) + opts->x_ix86_tune_string = opts->x_ix86_arch_string; + if (!opts->x_ix86_tune_string) { - ix86_tune_string = cpu_names[TARGET_CPU_DEFAULT]; + opts->x_ix86_tune_string = cpu_names[TARGET_CPU_DEFAULT]; ix86_tune_defaulted = 1; } - /* ix86_tune_string is set to ix86_arch_string or defaulted. We - need to use a sensible tune option. */ - if (!strcmp (ix86_tune_string, "generic") - || !strcmp (ix86_tune_string, "x86-64") - || !strcmp (ix86_tune_string, "i686")) + /* opts->x_ix86_tune_string is set to opts->x_ix86_arch_string + or defaulted. We need to use a sensible tune option. */ + if (!strcmp (opts->x_ix86_tune_string, "generic") + || !strcmp (opts->x_ix86_tune_string, "x86-64") + || !strcmp (opts->x_ix86_tune_string, "i686")) { - ix86_tune_string = "generic"; + opts->x_ix86_tune_string = "generic"; } } - if (ix86_stringop_alg == rep_prefix_8_byte && !TARGET_64BIT) + if (opts->x_ix86_stringop_alg == rep_prefix_8_byte + && !TARGET_64BIT_P (opts->x_ix86_isa_flags)) { /* rep; movq isn't available in 32-bit code. 
*/ error ("-mstringop-strategy=rep_8byte not supported for 32-bit code"); - ix86_stringop_alg = no_stringop; + opts->x_ix86_stringop_alg = no_stringop; } - if (!ix86_arch_string) - ix86_arch_string = TARGET_64BIT ? "x86-64" : SUBTARGET32_DEFAULT_CPU; + if (!opts->x_ix86_arch_string) + opts->x_ix86_arch_string + = TARGET_64BIT_P (opts->x_ix86_isa_flags) + ? "x86-64" : SUBTARGET32_DEFAULT_CPU; else ix86_arch_specified = 1; - if (global_options_set.x_ix86_pmode) + if (opts_set->x_ix86_pmode) { - if ((TARGET_LP64 && ix86_pmode == PMODE_SI) - || (!TARGET_64BIT && ix86_pmode == PMODE_DI)) + if ((TARGET_LP64_P (opts->x_ix86_isa_flags) + && opts->x_ix86_pmode == PMODE_SI) + || (!TARGET_64BIT_P (opts->x_ix86_isa_flags) + && opts->x_ix86_pmode == PMODE_DI)) error ("address mode %qs not supported in the %s bit mode", - TARGET_64BIT ? "short" : "long", - TARGET_64BIT ? "64" : "32"); + TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "short" : "long", + TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "64" : "32"); } else - ix86_pmode = TARGET_LP64 ? PMODE_DI : PMODE_SI; + opts->x_ix86_pmode = TARGET_LP64_P (opts->x_ix86_isa_flags) + ? PMODE_DI : PMODE_SI; - if (!global_options_set.x_ix86_abi) - ix86_abi = DEFAULT_ABI; + if (!opts_set->x_ix86_abi) + opts->x_ix86_abi = DEFAULT_ABI; /* For targets using ms ABI enable ms-extensions, if not explicit turned off. For non-ms ABI we turn off this option. */ - if (!global_options_set.x_flag_ms_extensions) - flag_ms_extensions = (MS_ABI == DEFAULT_ABI); + if (!opts_set->x_flag_ms_extensions) + opts->x_flag_ms_extensions = (MS_ABI == DEFAULT_ABI); - if (global_options_set.x_ix86_cmodel) + if (opts_set->x_ix86_cmodel) { - switch (ix86_cmodel) + switch (opts->x_ix86_cmodel) { case CM_SMALL: case CM_SMALL_PIC: - if (flag_pic) - ix86_cmodel = CM_SMALL_PIC; - if (!TARGET_64BIT) + if (opts->x_flag_pic) + opts->x_ix86_cmodel = CM_SMALL_PIC; + if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)) error ("code model %qs not supported in the %s bit mode", "small", "32"); break; case CM_MEDIUM: case CM_MEDIUM_PIC: - if (flag_pic) - ix86_cmodel = CM_MEDIUM_PIC; - if (!TARGET_64BIT) + if (opts->x_flag_pic) + opts->x_ix86_cmodel = CM_MEDIUM_PIC; + if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)) error ("code model %qs not supported in the %s bit mode", "medium", "32"); - else if (TARGET_X32) + else if (TARGET_X32_P (opts->x_ix86_isa_flags)) error ("code model %qs not supported in x32 mode", "medium"); break; case CM_LARGE: case CM_LARGE_PIC: - if (flag_pic) - ix86_cmodel = CM_LARGE_PIC; - if (!TARGET_64BIT) + if (opts->x_flag_pic) + opts->x_ix86_cmodel = CM_LARGE_PIC; + if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)) error ("code model %qs not supported in the %s bit mode", "large", "32"); - else if (TARGET_X32) + else if (TARGET_X32_P (opts->x_ix86_isa_flags)) error ("code model %qs not supported in x32 mode", "large"); break; case CM_32: - if (flag_pic) + if (opts->x_flag_pic) error ("code model %s does not support PIC mode", "32"); - if (TARGET_64BIT) + if (TARGET_64BIT_P (opts->x_ix86_isa_flags)) error ("code model %qs not supported in the %s bit mode", "32", "64"); break; case CM_KERNEL: - if (flag_pic) + if (opts->x_flag_pic) { error ("code model %s does not support PIC mode", "kernel"); - ix86_cmodel = CM_32; + opts->x_ix86_cmodel = CM_32; } - if (!TARGET_64BIT) + if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)) error ("code model %qs not supported in the %s bit mode", "kernel", "32"); break; @@ -3347,191 +3346,195 @@ ix86_option_override_internal (bool main_args_p) use of rip-relative addressing. 
This eliminates fixups that would otherwise be needed if this object is to be placed in a DLL, and is essentially just as efficient as direct addressing. */ - if (TARGET_64BIT && (TARGET_RDOS || TARGET_PECOFF)) - ix86_cmodel = CM_MEDIUM_PIC, flag_pic = 1; - else if (TARGET_64BIT) - ix86_cmodel = flag_pic ? CM_SMALL_PIC : CM_SMALL; + if (TARGET_64BIT_P (opts->x_ix86_isa_flags) + && (TARGET_RDOS || TARGET_PECOFF)) + opts->x_ix86_cmodel = CM_MEDIUM_PIC, opts->x_flag_pic = 1; + else if (TARGET_64BIT_P (opts->x_ix86_isa_flags)) + opts->x_ix86_cmodel = opts->x_flag_pic ? CM_SMALL_PIC : CM_SMALL; else - ix86_cmodel = CM_32; + opts->x_ix86_cmodel = CM_32; } - if (TARGET_MACHO && ix86_asm_dialect == ASM_INTEL) + if (TARGET_MACHO && opts->x_ix86_asm_dialect == ASM_INTEL) { error ("-masm=intel not supported in this configuration"); - ix86_asm_dialect = ASM_ATT; + opts->x_ix86_asm_dialect = ASM_ATT; } - if ((TARGET_64BIT != 0) != ((ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0)) + if ((TARGET_64BIT_P (opts->x_ix86_isa_flags) != 0) + != ((opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0)) sorry ("%i-bit mode not compiled in", - (ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32); + (opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32); for (i = 0; i < pta_size; i++) - if (! strcmp (ix86_arch_string, processor_alias_table[i].name)) + if (! strcmp (opts->x_ix86_arch_string, processor_alias_table[i].name)) { ix86_schedule = processor_alias_table[i].schedule; ix86_arch = processor_alias_table[i].processor; /* Default cpu tuning to the architecture. */ ix86_tune = ix86_arch; - if (TARGET_64BIT && !(processor_alias_table[i].flags & PTA_64BIT)) + if (TARGET_64BIT_P (opts->x_ix86_isa_flags) + && !(processor_alias_table[i].flags & PTA_64BIT)) error ("CPU you selected does not support x86-64 " "instruction set"); if (processor_alias_table[i].flags & PTA_MMX - && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX)) - ix86_isa_flags |= OPTION_MASK_ISA_MMX; + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MMX; if (processor_alias_table[i].flags & PTA_3DNOW - && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW)) - ix86_isa_flags |= OPTION_MASK_ISA_3DNOW; + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW; if (processor_alias_table[i].flags & PTA_3DNOW_A - && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A)) - ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A; + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A; if (processor_alias_table[i].flags & PTA_SSE - && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE)) - ix86_isa_flags |= OPTION_MASK_ISA_SSE; + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE; if (processor_alias_table[i].flags & PTA_SSE2 - && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2)) - ix86_isa_flags |= OPTION_MASK_ISA_SSE2; + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE2; if (processor_alias_table[i].flags & PTA_SSE3 - && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3)) - ix86_isa_flags |= OPTION_MASK_ISA_SSE3; + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE3; if (processor_alias_table[i].flags & PTA_SSSE3 - && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3)) - ix86_isa_flags |= OPTION_MASK_ISA_SSSE3; + && 
!(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSSE3; if (processor_alias_table[i].flags & PTA_SSE4_1 - && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1)) - ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1; + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1; if (processor_alias_table[i].flags & PTA_SSE4_2 - && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2)) - ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2; + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2; if (processor_alias_table[i].flags & PTA_AVX - && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX)) - ix86_isa_flags |= OPTION_MASK_ISA_AVX; + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX; if (processor_alias_table[i].flags & PTA_AVX2 - && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2)) - ix86_isa_flags |= OPTION_MASK_ISA_AVX2; + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX2; if (processor_alias_table[i].flags & PTA_FMA - && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA)) - ix86_isa_flags |= OPTION_MASK_ISA_FMA; + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA; if (processor_alias_table[i].flags & PTA_SSE4A - && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A)) - ix86_isa_flags |= OPTION_MASK_ISA_SSE4A; + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4A; if (processor_alias_table[i].flags & PTA_FMA4 - && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4)) - ix86_isa_flags |= OPTION_MASK_ISA_FMA4; + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA4; if (processor_alias_table[i].flags & PTA_XOP - && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP)) - ix86_isa_flags |= OPTION_MASK_ISA_XOP; + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XOP; if (processor_alias_table[i].flags & PTA_LWP - && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP)) - ix86_isa_flags |= OPTION_MASK_ISA_LWP; + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LWP; if (processor_alias_table[i].flags & PTA_ABM - && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM)) - ix86_isa_flags |= OPTION_MASK_ISA_ABM; + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ABM; if (processor_alias_table[i].flags & PTA_BMI - && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI)) - ix86_isa_flags |= OPTION_MASK_ISA_BMI; + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI; if (processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM) - && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT)) - ix86_isa_flags |= OPTION_MASK_ISA_LZCNT; + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LZCNT; if (processor_alias_table[i].flags & PTA_TBM - && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM)) - ix86_isa_flags |= OPTION_MASK_ISA_TBM; + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_TBM; if (processor_alias_table[i].flags & PTA_BMI2 - && !(ix86_isa_flags_explicit & 
OPTION_MASK_ISA_BMI2)) - ix86_isa_flags |= OPTION_MASK_ISA_BMI2; + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI2; if (processor_alias_table[i].flags & PTA_CX16 - && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16)) - ix86_isa_flags |= OPTION_MASK_ISA_CX16; + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CX16; if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM) - && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT)) - ix86_isa_flags |= OPTION_MASK_ISA_POPCNT; - if (!(TARGET_64BIT && (processor_alias_table[i].flags & PTA_NO_SAHF)) - && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF)) - ix86_isa_flags |= OPTION_MASK_ISA_SAHF; + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_POPCNT; + if (!(TARGET_64BIT_P (opts->x_ix86_isa_flags) + && (processor_alias_table[i].flags & PTA_NO_SAHF)) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SAHF; if (processor_alias_table[i].flags & PTA_MOVBE - && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE)) - ix86_isa_flags |= OPTION_MASK_ISA_MOVBE; + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MOVBE; if (processor_alias_table[i].flags & PTA_AES - && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AES)) - ix86_isa_flags |= OPTION_MASK_ISA_AES; + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AES)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AES; if (processor_alias_table[i].flags & PTA_PCLMUL - && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL)) - ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL; + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL; if (processor_alias_table[i].flags & PTA_FSGSBASE - && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE)) - ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE; + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE; if (processor_alias_table[i].flags & PTA_RDRND - && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND)) - ix86_isa_flags |= OPTION_MASK_ISA_RDRND; + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDRND; if (processor_alias_table[i].flags & PTA_F16C - && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C)) - ix86_isa_flags |= OPTION_MASK_ISA_F16C; + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_F16C; if (processor_alias_table[i].flags & PTA_RTM - && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM)) - ix86_isa_flags |= OPTION_MASK_ISA_RTM; + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RTM; if (processor_alias_table[i].flags & PTA_HLE - && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_HLE)) - ix86_isa_flags |= OPTION_MASK_ISA_HLE; + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_HLE)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_HLE; if (processor_alias_table[i].flags & PTA_PRFCHW - && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_PRFCHW)) - ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW; + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PRFCHW)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW; if (processor_alias_table[i].flags & PTA_RDSEED - && !(ix86_isa_flags_explicit & 
OPTION_MASK_ISA_RDSEED)) - ix86_isa_flags |= OPTION_MASK_ISA_RDSEED; + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDSEED)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDSEED; if (processor_alias_table[i].flags & PTA_ADX - && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_ADX)) - ix86_isa_flags |= OPTION_MASK_ISA_ADX; + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ADX)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ADX; if (processor_alias_table[i].flags & PTA_FXSR - && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FXSR)) - ix86_isa_flags |= OPTION_MASK_ISA_FXSR; + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FXSR)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FXSR; if (processor_alias_table[i].flags & PTA_XSAVE - && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVE)) - ix86_isa_flags |= OPTION_MASK_ISA_XSAVE; + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVE)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVE; if (processor_alias_table[i].flags & PTA_XSAVEOPT - && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEOPT)) - ix86_isa_flags |= OPTION_MASK_ISA_XSAVEOPT; + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEOPT)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEOPT; if (processor_alias_table[i].flags & PTA_AVX512F - && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512F)) - ix86_isa_flags |= OPTION_MASK_ISA_AVX512F; + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512F)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512F; if (processor_alias_table[i].flags & PTA_AVX512ER - && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512ER)) - ix86_isa_flags |= OPTION_MASK_ISA_AVX512ER; + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512ER)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512ER; if (processor_alias_table[i].flags & PTA_AVX512PF - && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512PF)) - ix86_isa_flags |= OPTION_MASK_ISA_AVX512PF; + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512PF)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512PF; if (processor_alias_table[i].flags & PTA_AVX512CD - && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512CD)) - ix86_isa_flags |= OPTION_MASK_ISA_AVX512CD; + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512CD)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512CD; if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)) x86_prefetch_sse = true; break; } - if (!strcmp (ix86_arch_string, "generic")) + if (!strcmp (opts->x_ix86_arch_string, "generic")) error ("generic CPU can be used only for %stune=%s %s", prefix, suffix, sw); - else if (!strncmp (ix86_arch_string, "generic", 7) || i == pta_size) + else if (!strncmp (opts->x_ix86_arch_string, "generic", 7) || i == pta_size) error ("bad value (%s) for %sarch=%s %s", - ix86_arch_string, prefix, suffix, sw); + opts->x_ix86_arch_string, prefix, suffix, sw); ix86_arch_mask = 1u << ix86_arch; for (i = 0; i < X86_ARCH_LAST; ++i) ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask); for (i = 0; i < pta_size; i++) - if (! strcmp (ix86_tune_string, processor_alias_table[i].name)) + if (! 
strcmp (opts->x_ix86_tune_string, processor_alias_table[i].name)) { ix86_schedule = processor_alias_table[i].schedule; ix86_tune = processor_alias_table[i].processor; - if (TARGET_64BIT) + if (TARGET_64BIT_P (opts->x_ix86_isa_flags)) { if (!(processor_alias_table[i].flags & PTA_64BIT)) { if (ix86_tune_defaulted) { - ix86_tune_string = "x86-64"; + opts->x_ix86_tune_string = "x86-64"; for (i = 0; i < pta_size; i++) - if (! strcmp (ix86_tune_string, + if (! strcmp (opts->x_ix86_tune_string, processor_alias_table[i].name)) break; ix86_schedule = processor_alias_table[i].schedule; @@ -3555,9 +3558,9 @@ ix86_option_override_internal (bool main_args_p) if (ix86_tune_specified && i == pta_size) error ("bad value (%s) for %stune=%s %s", - ix86_tune_string, prefix, suffix, sw); + opts->x_ix86_tune_string, prefix, suffix, sw); - set_ix86_tune_features (ix86_tune, ix86_dump_tunes); + set_ix86_tune_features (ix86_tune, opts->x_ix86_dump_tunes); #ifndef USE_IX86_FRAME_POINTER #define USE_IX86_FRAME_POINTER 0 @@ -3569,27 +3572,29 @@ ix86_option_override_internal (bool main_args_p) /* Set the default values for switches whose default depends on TARGET_64BIT in case they weren't overwritten by command line options. */ - if (TARGET_64BIT) + if (TARGET_64BIT_P (opts->x_ix86_isa_flags)) { - if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer) - flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER; - if (flag_asynchronous_unwind_tables == 2) - flag_unwind_tables = flag_asynchronous_unwind_tables = 1; - if (flag_pcc_struct_return == 2) - flag_pcc_struct_return = 0; + if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer) + opts->x_flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER; + if (opts->x_flag_asynchronous_unwind_tables == 2) + opts->x_flag_unwind_tables + = opts->x_flag_asynchronous_unwind_tables = 1; + if (opts->x_flag_pcc_struct_return == 2) + opts->x_flag_pcc_struct_return = 0; } else { - if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer) - flag_omit_frame_pointer = !(USE_IX86_FRAME_POINTER || optimize_size); - if (flag_asynchronous_unwind_tables == 2) - flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER; - if (flag_pcc_struct_return == 2) - flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN; + if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer) + opts->x_flag_omit_frame_pointer + = !(USE_IX86_FRAME_POINTER || opts->x_optimize_size); + if (opts->x_flag_asynchronous_unwind_tables == 2) + opts->x_flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER; + if (opts->x_flag_pcc_struct_return == 2) + opts->x_flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN; } ix86_tune_cost = processor_target_table[ix86_tune].cost; - if (optimize_size) + if (opts->x_optimize_size) ix86_cost = &ix86_size_cost; else ix86_cost = ix86_tune_cost; @@ -3598,148 +3603,160 @@ ix86_option_override_internal (bool main_args_p) init_machine_status = ix86_init_machine_status; /* Validate -mregparm= value. 
*/ - if (global_options_set.x_ix86_regparm) + if (opts_set->x_ix86_regparm) { - if (TARGET_64BIT) + if (TARGET_64BIT_P (opts->x_ix86_isa_flags)) warning (0, "-mregparm is ignored in 64-bit mode"); - if (ix86_regparm > REGPARM_MAX) + if (opts->x_ix86_regparm > REGPARM_MAX) { error ("-mregparm=%d is not between 0 and %d", - ix86_regparm, REGPARM_MAX); - ix86_regparm = 0; + opts->x_ix86_regparm, REGPARM_MAX); + opts->x_ix86_regparm = 0; } } - if (TARGET_64BIT) - ix86_regparm = REGPARM_MAX; + if (TARGET_64BIT_P (opts->x_ix86_isa_flags)) + opts->x_ix86_regparm = REGPARM_MAX; /* Default align_* from the processor table. */ - if (align_loops == 0) + if (opts->x_align_loops == 0) { - align_loops = processor_target_table[ix86_tune].align_loop; + opts->x_align_loops = processor_target_table[ix86_tune].align_loop; align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip; } - if (align_jumps == 0) + if (opts->x_align_jumps == 0) { - align_jumps = processor_target_table[ix86_tune].align_jump; + opts->x_align_jumps = processor_target_table[ix86_tune].align_jump; align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip; } - if (align_functions == 0) + if (opts->x_align_functions == 0) { - align_functions = processor_target_table[ix86_tune].align_func; + opts->x_align_functions = processor_target_table[ix86_tune].align_func; } /* Provide default for -mbranch-cost= value. */ - if (!global_options_set.x_ix86_branch_cost) - ix86_branch_cost = ix86_cost->branch_cost; + if (!opts_set->x_ix86_branch_cost) + opts->x_ix86_branch_cost = ix86_cost->branch_cost; - if (TARGET_64BIT) + if (TARGET_64BIT_P (opts->x_ix86_isa_flags)) { - target_flags |= TARGET_SUBTARGET64_DEFAULT & ~target_flags_explicit; + opts->x_target_flags + |= TARGET_SUBTARGET64_DEFAULT & ~opts_set->x_target_flags; /* Enable by default the SSE and MMX builtins. Do allow the user to explicitly disable any of these. In particular, disabling SSE and MMX for kernel code is extremely useful. */ if (!ix86_arch_specified) - ix86_isa_flags + opts->x_ix86_isa_flags |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX - | TARGET_SUBTARGET64_ISA_DEFAULT) & ~ix86_isa_flags_explicit); + | TARGET_SUBTARGET64_ISA_DEFAULT) + & ~opts->x_ix86_isa_flags_explicit); - if (TARGET_RTD) + if (TARGET_RTD_P (opts->x_target_flags)) warning (0, "%srtd%s is ignored in 64bit mode", prefix, suffix); } else { - target_flags |= TARGET_SUBTARGET32_DEFAULT & ~target_flags_explicit; + opts->x_target_flags + |= TARGET_SUBTARGET32_DEFAULT & ~opts_set->x_target_flags; if (!ix86_arch_specified) - ix86_isa_flags - |= TARGET_SUBTARGET32_ISA_DEFAULT & ~ix86_isa_flags_explicit; + opts->x_ix86_isa_flags + |= TARGET_SUBTARGET32_ISA_DEFAULT & ~opts->x_ix86_isa_flags_explicit; /* i386 ABI does not specify red zone. It still makes sense to use it when programmer takes care to stack from being destroyed. */ - if (!(target_flags_explicit & MASK_NO_RED_ZONE)) - target_flags |= MASK_NO_RED_ZONE; + if (!(opts_set->x_target_flags & MASK_NO_RED_ZONE)) + opts->x_target_flags |= MASK_NO_RED_ZONE; } /* Keep nonleaf frame pointers. 
*/ - if (flag_omit_frame_pointer) - target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER; - else if (TARGET_OMIT_LEAF_FRAME_POINTER) - flag_omit_frame_pointer = 1; + if (opts->x_flag_omit_frame_pointer) + opts->x_target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER; + else if (TARGET_OMIT_LEAF_FRAME_POINTER_P (opts->x_target_flags)) + opts->x_flag_omit_frame_pointer = 1; /* If we're doing fast math, we don't care about comparison order wrt NaNs. This lets us use a shorter comparison sequence. */ - if (flag_finite_math_only) - target_flags &= ~MASK_IEEE_FP; + if (opts->x_flag_finite_math_only) + opts->x_target_flags &= ~MASK_IEEE_FP; /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387, since the insns won't need emulation. */ - if (x86_arch_always_fancy_math_387 & ix86_arch_mask) - target_flags &= ~MASK_NO_FANCY_MATH_387; + if (ix86_tune_features [X86_TUNE_ALWAYS_FANCY_MATH_387]) + opts->x_target_flags &= ~MASK_NO_FANCY_MATH_387; /* Likewise, if the target doesn't have a 387, or we've specified software floating point, don't use 387 inline intrinsics. */ - if (!TARGET_80387) - target_flags |= MASK_NO_FANCY_MATH_387; + if (!TARGET_80387_P (opts->x_target_flags)) + opts->x_target_flags |= MASK_NO_FANCY_MATH_387; /* Turn on MMX builtins for -msse. */ - if (TARGET_SSE) - ix86_isa_flags |= OPTION_MASK_ISA_MMX & ~ix86_isa_flags_explicit; + if (TARGET_SSE_P (opts->x_ix86_isa_flags)) + opts->x_ix86_isa_flags + |= OPTION_MASK_ISA_MMX & ~opts->x_ix86_isa_flags_explicit; /* Enable SSE prefetch. */ - if (TARGET_SSE || (TARGET_PRFCHW && !TARGET_3DNOW)) + if (TARGET_SSE_P (opts->x_ix86_isa_flags) + || (TARGET_PRFCHW && !TARGET_3DNOW_P (opts->x_ix86_isa_flags))) x86_prefetch_sse = true; /* Enable prefetch{,w} instructions for -m3dnow. */ - if (TARGET_3DNOW) - ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW & ~ix86_isa_flags_explicit; + if (TARGET_3DNOW_P (opts->x_ix86_isa_flags)) + opts->x_ix86_isa_flags + |= OPTION_MASK_ISA_PRFCHW & ~opts->x_ix86_isa_flags_explicit; /* Enable popcnt instruction for -msse4.2 or -mabm. */ - if (TARGET_SSE4_2 || TARGET_ABM) - ix86_isa_flags |= OPTION_MASK_ISA_POPCNT & ~ix86_isa_flags_explicit; + if (TARGET_SSE4_2_P (opts->x_ix86_isa_flags) + || TARGET_ABM_P (opts->x_ix86_isa_flags)) + opts->x_ix86_isa_flags + |= OPTION_MASK_ISA_POPCNT & ~opts->x_ix86_isa_flags_explicit; /* Enable lzcnt instruction for -mabm. */ - if (TARGET_ABM) - ix86_isa_flags |= OPTION_MASK_ISA_LZCNT & ~ix86_isa_flags_explicit; + if (TARGET_ABM_P(opts->x_ix86_isa_flags)) + opts->x_ix86_isa_flags + |= OPTION_MASK_ISA_LZCNT & ~opts->x_ix86_isa_flags_explicit; /* Validate -mpreferred-stack-boundary= value or default it to PREFERRED_STACK_BOUNDARY_DEFAULT. */ ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT; - if (global_options_set.x_ix86_preferred_stack_boundary_arg) + if (opts_set->x_ix86_preferred_stack_boundary_arg) { - int min = (TARGET_64BIT ? (TARGET_SSE ? 4 : 3) : 2); + int min = (TARGET_64BIT_P (opts->x_ix86_isa_flags) + ? (TARGET_SSE_P (opts->x_ix86_isa_flags) ? 4 : 3) : 2); int max = (TARGET_SEH ? 
4 : 12); - if (ix86_preferred_stack_boundary_arg < min - || ix86_preferred_stack_boundary_arg > max) + if (opts->x_ix86_preferred_stack_boundary_arg < min + || opts->x_ix86_preferred_stack_boundary_arg > max) { if (min == max) error ("-mpreferred-stack-boundary is not supported " "for this target"); else error ("-mpreferred-stack-boundary=%d is not between %d and %d", - ix86_preferred_stack_boundary_arg, min, max); + opts->x_ix86_preferred_stack_boundary_arg, min, max); } else ix86_preferred_stack_boundary - = (1 << ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT; + = (1 << opts->x_ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT; } /* Set the default value for -mstackrealign. */ - if (ix86_force_align_arg_pointer == -1) - ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT; + if (opts->x_ix86_force_align_arg_pointer == -1) + opts->x_ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT; ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY; /* Validate -mincoming-stack-boundary= value or default it to MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */ ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary; - if (global_options_set.x_ix86_incoming_stack_boundary_arg) + if (opts_set->x_ix86_incoming_stack_boundary_arg) { - if (ix86_incoming_stack_boundary_arg < (TARGET_64BIT ? 4 : 2) + if (ix86_incoming_stack_boundary_arg + < (TARGET_64BIT_P (opts->x_ix86_isa_flags) ? 4 : 2) || ix86_incoming_stack_boundary_arg > 12) error ("-mincoming-stack-boundary=%d is not between %d and 12", - ix86_incoming_stack_boundary_arg, TARGET_64BIT ? 4 : 2); + ix86_incoming_stack_boundary_arg, + TARGET_64BIT_P (opts->x_ix86_isa_flags) ? 4 : 2); else { ix86_user_incoming_stack_boundary @@ -3750,36 +3767,50 @@ ix86_option_override_internal (bool main_args_p) } /* Accept -msseregparm only if at least SSE support is enabled. */ - if (TARGET_SSEREGPARM - && ! TARGET_SSE) + if (TARGET_SSEREGPARM_P (opts->x_target_flags) + && ! TARGET_SSE_P (opts->x_ix86_isa_flags)) error ("%ssseregparm%s used without SSE enabled", prefix, suffix); - if (global_options_set.x_ix86_fpmath) + if (opts_set->x_ix86_fpmath) { - if (ix86_fpmath & FPMATH_SSE) + if (opts->x_ix86_fpmath & FPMATH_SSE) { - if (!TARGET_SSE) + if (!TARGET_SSE_P (opts->x_ix86_isa_flags)) { warning (0, "SSE instruction set disabled, using 387 arithmetics"); - ix86_fpmath = FPMATH_387; + opts->x_ix86_fpmath = FPMATH_387; } - else if ((ix86_fpmath & FPMATH_387) && !TARGET_80387) + else if ((opts->x_ix86_fpmath & FPMATH_387) + && !TARGET_80387_P (opts->x_target_flags)) { warning (0, "387 instruction set disabled, using SSE arithmetics"); - ix86_fpmath = FPMATH_SSE; + opts->x_ix86_fpmath = FPMATH_SSE; } } } + /* For all chips supporting SSE2, -mfpmath=sse performs better than + fpmath=387. The second is however default at many targets since the + extra 80bit precision of temporaries is considered to be part of ABI. + Overwrite the default at least for -ffast-math. + TODO: -mfpmath=both seems to produce same performing code with bit + smaller binaries. It is however not clear if register allocation is + ready for this setting. + Also -mfpmath=387 is overall a lot more compact (bout 4-5%) than SSE + codegen. We may switch to 387 with -ffast-math for size optimized + functions. 
*/ + else if (fast_math_flags_set_p (&global_options) + && TARGET_SSE2) + ix86_fpmath = FPMATH_SSE; else - ix86_fpmath = TARGET_FPMATH_DEFAULT; + opts->x_ix86_fpmath = TARGET_FPMATH_DEFAULT_P (opts->x_ix86_isa_flags); /* If the i387 is disabled, then do not return values in it. */ - if (!TARGET_80387) - target_flags &= ~MASK_FLOAT_RETURNS; + if (!TARGET_80387_P (opts->x_target_flags)) + opts->x_target_flags &= ~MASK_FLOAT_RETURNS; /* Use external vectorized library in vectorizing intrinsics. */ - if (global_options_set.x_ix86_veclibabi_type) - switch (ix86_veclibabi_type) + if (opts_set->x_ix86_veclibabi_type) + switch (opts->x_ix86_veclibabi_type) { case ix86_veclibabi_type_svml: ix86_veclib_handler = ix86_veclibabi_svml; @@ -3793,39 +3824,21 @@ ix86_option_override_internal (bool main_args_p) gcc_unreachable (); } - ix86_tune_mask = 1u << ix86_tune; - if ((!USE_IX86_FRAME_POINTER - || (x86_accumulate_outgoing_args & ix86_tune_mask)) - && !(target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS) - && !optimize_size) - target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS; - - /* ??? Unwind info is not correct around the CFG unless either a frame - pointer is present or M_A_O_A is set. Fixing this requires rewriting - unwind info generation to be aware of the CFG and propagating states - around edges. */ - if ((flag_unwind_tables || flag_asynchronous_unwind_tables - || flag_exceptions || flag_non_call_exceptions) - && flag_omit_frame_pointer - && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)) - { - if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS) - warning (0, "unwind tables currently require either a frame pointer " - "or %saccumulate-outgoing-args%s for correctness", - prefix, suffix); - target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS; - } + if (ix86_tune_features [X86_TUNE_ACCUMULATE_OUTGOING_ARGS] + && !(opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS) + && !opts->x_optimize_size) + opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS; /* If stack probes are required, the space used for large function arguments on the stack must also be probed, so enable -maccumulate-outgoing-args so this happens in the prologue. */ - if (TARGET_STACK_PROBE - && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)) + if (TARGET_STACK_PROBE_P (opts->x_target_flags) + && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)) { - if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS) + if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS) warning (0, "stack probing requires %saccumulate-outgoing-args%s " "for correctness", prefix, suffix); - target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS; + opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS; } /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */ @@ -3840,38 +3853,38 @@ ix86_option_override_internal (bool main_args_p) /* When scheduling description is not available, disable scheduler pass so it won't slow down the compilation and make x87 code slower. 
*/ if (!TARGET_SCHEDULE) - flag_schedule_insns_after_reload = flag_schedule_insns = 0; + opts->x_flag_schedule_insns_after_reload = opts->x_flag_schedule_insns = 0; maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES, ix86_tune_cost->simultaneous_prefetches, - global_options.x_param_values, - global_options_set.x_param_values); + opts->x_param_values, + opts_set->x_param_values); maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE, ix86_tune_cost->prefetch_block, - global_options.x_param_values, - global_options_set.x_param_values); + opts->x_param_values, + opts_set->x_param_values); maybe_set_param_value (PARAM_L1_CACHE_SIZE, ix86_tune_cost->l1_cache_size, - global_options.x_param_values, - global_options_set.x_param_values); + opts->x_param_values, + opts_set->x_param_values); maybe_set_param_value (PARAM_L2_CACHE_SIZE, ix86_tune_cost->l2_cache_size, - global_options.x_param_values, - global_options_set.x_param_values); + opts->x_param_values, + opts_set->x_param_values); /* Enable sw prefetching at -O3 for CPUS that prefetching is helpful. */ - if (flag_prefetch_loop_arrays < 0 + if (opts->x_flag_prefetch_loop_arrays < 0 && HAVE_prefetch - && (optimize >= 3 || flag_profile_use) + && (opts->x_optimize >= 3 || opts->x_flag_profile_use) && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL) - flag_prefetch_loop_arrays = 1; + opts->x_flag_prefetch_loop_arrays = 1; /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0) - can be optimized to ap = __builtin_next_arg (0). */ - if (!TARGET_64BIT && !flag_split_stack) + can be opts->x_optimized to ap = __builtin_next_arg (0). */ + if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && !opts->x_flag_split_stack) targetm.expand_builtin_va_start = NULL; - if (TARGET_64BIT) + if (TARGET_64BIT_P (opts->x_ix86_isa_flags)) { ix86_gen_leave = gen_leave_rex64; if (Pmode == DImode) @@ -3917,56 +3930,56 @@ ix86_option_override_internal (bool main_args_p) #ifdef USE_IX86_CLD /* Use -mcld by default for 32-bit code if configured with --enable-cld. */ - if (!TARGET_64BIT) - target_flags |= MASK_CLD & ~target_flags_explicit; + if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)) + opts->x_target_flags |= MASK_CLD & ~opts_set->x_target_flags; #endif - if (!TARGET_64BIT && flag_pic) + if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && opts->x_flag_pic) { - if (flag_fentry > 0) + if (opts->x_flag_fentry > 0) sorry ("-mfentry isn%'t supported for 32-bit in combination " "with -fpic"); - flag_fentry = 0; + opts->x_flag_fentry = 0; } else if (TARGET_SEH) { - if (flag_fentry == 0) + if (opts->x_flag_fentry == 0) sorry ("-mno-fentry isn%'t compatible with SEH"); - flag_fentry = 1; + opts->x_flag_fentry = 1; } - else if (flag_fentry < 0) + else if (opts->x_flag_fentry < 0) { #if defined(PROFILE_BEFORE_PROLOGUE) - flag_fentry = 1; + opts->x_flag_fentry = 1; #else - flag_fentry = 0; + opts->x_flag_fentry = 0; #endif } - /* When not optimize for size, enable vzeroupper optimization for + /* When not opts->x_optimize for size, enable vzeroupper optimization for TARGET_AVX with -fexpensive-optimizations and split 32-byte AVX unaligned load/store. 
*/ - if (!optimize_size) + if (!opts->x_optimize_size) { if (flag_expensive_optimizations - && !(target_flags_explicit & MASK_VZEROUPPER)) - target_flags |= MASK_VZEROUPPER; - if ((x86_avx256_split_unaligned_load & ix86_tune_mask) - && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_LOAD)) - target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD; - if ((x86_avx256_split_unaligned_store & ix86_tune_mask) - && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_STORE)) - target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE; + && !(opts_set->x_target_flags & MASK_VZEROUPPER)) + opts->x_target_flags |= MASK_VZEROUPPER; + if (!ix86_tune_features[X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL] + && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_LOAD)) + opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD; + if (!ix86_tune_features[X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL] + && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_STORE)) + opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE; /* Enable 128-bit AVX instruction generation for the auto-vectorizer. */ if (TARGET_AVX128_OPTIMAL - && !(target_flags_explicit & MASK_PREFER_AVX128)) - target_flags |= MASK_PREFER_AVX128; + && !(opts_set->x_target_flags & MASK_PREFER_AVX128)) + opts->x_target_flags |= MASK_PREFER_AVX128; } - if (ix86_recip_name) + if (opts->x_ix86_recip_name) { - char *p = ASTRDUP (ix86_recip_name); + char *p = ASTRDUP (opts->x_ix86_recip_name); char *q; unsigned int mask, i; bool invert; @@ -4001,45 +4014,46 @@ ix86_option_override_internal (bool main_args_p) } } - recip_mask_explicit |= mask; + opts->x_recip_mask_explicit |= mask; if (invert) - recip_mask &= ~mask; + opts->x_recip_mask &= ~mask; else - recip_mask |= mask; + opts->x_recip_mask |= mask; } } - if (TARGET_RECIP) - recip_mask |= RECIP_MASK_ALL & ~recip_mask_explicit; - else if (target_flags_explicit & MASK_RECIP) - recip_mask &= ~(RECIP_MASK_ALL & ~recip_mask_explicit); + if (TARGET_RECIP_P (opts->x_target_flags)) + opts->x_recip_mask |= RECIP_MASK_ALL & ~opts->x_recip_mask_explicit; + else if (opts_set->x_target_flags & MASK_RECIP) + opts->x_recip_mask &= ~(RECIP_MASK_ALL & ~opts->x_recip_mask_explicit); /* Default long double to 64-bit for Bionic. */ if (TARGET_HAS_BIONIC - && !(target_flags_explicit & MASK_LONG_DOUBLE_64)) - target_flags |= MASK_LONG_DOUBLE_64; + && !(opts_set->x_target_flags & MASK_LONG_DOUBLE_64)) + opts->x_target_flags |= MASK_LONG_DOUBLE_64; /* Save the initial options in case the user does function specific options. */ if (main_args_p) target_option_default_node = target_option_current_node - = build_target_option_node (); + = build_target_option_node (opts); /* Handle stack protector */ - if (!global_options_set.x_ix86_stack_protector_guard) - ix86_stack_protector_guard = TARGET_HAS_BIONIC ? SSP_GLOBAL : SSP_TLS; + if (!opts_set->x_ix86_stack_protector_guard) + opts->x_ix86_stack_protector_guard + = TARGET_HAS_BIONIC ? 
SSP_GLOBAL : SSP_TLS; /* Handle -mmemcpy-strategy= and -mmemset-strategy= */ - if (ix86_tune_memcpy_strategy) + if (opts->x_ix86_tune_memcpy_strategy) { - char *str = xstrdup (ix86_tune_memcpy_strategy); + char *str = xstrdup (opts->x_ix86_tune_memcpy_strategy); ix86_parse_stringop_strategy_string (str, false); free (str); } - if (ix86_tune_memset_strategy) + if (opts->x_ix86_tune_memset_strategy) { - char *str = xstrdup (ix86_tune_memset_strategy); + char *str = xstrdup (opts->x_ix86_tune_memset_strategy); ix86_parse_stringop_strategy_string (str, true); free (str); } @@ -4056,7 +4070,7 @@ ix86_option_override (void) 1, PASS_POS_INSERT_AFTER }; - ix86_option_override_internal (true); + ix86_option_override_internal (true, &global_options, &global_options_set); /* This needs to be done at start up. It's convenient to do it here. */ @@ -4141,7 +4155,8 @@ ix86_conditional_register_usage (void) /* Save the current options */ static void -ix86_function_specific_save (struct cl_target_option *ptr) +ix86_function_specific_save (struct cl_target_option *ptr, + struct gcc_options *opts) { ptr->arch = ix86_arch; ptr->schedule = ix86_schedule; @@ -4149,9 +4164,9 @@ ix86_function_specific_save (struct cl_target_option *ptr) ptr->branch_cost = ix86_branch_cost; ptr->tune_defaulted = ix86_tune_defaulted; ptr->arch_specified = ix86_arch_specified; - ptr->x_ix86_isa_flags_explicit = ix86_isa_flags_explicit; - ptr->ix86_target_flags_explicit = target_flags_explicit; - ptr->x_recip_mask_explicit = recip_mask_explicit; + ptr->x_ix86_isa_flags_explicit = opts->x_ix86_isa_flags_explicit; + ptr->x_ix86_target_flags_explicit = opts->x_ix86_target_flags_explicit; + ptr->x_recip_mask_explicit = opts->x_recip_mask_explicit; /* The fields are char but the variables are not; make sure the values fit in the fields. 
*/ @@ -4164,7 +4179,8 @@ ix86_function_specific_save (struct cl_target_option *ptr) /* Restore the current options */ static void -ix86_function_specific_restore (struct cl_target_option *ptr) +ix86_function_specific_restore (struct gcc_options *opts, + struct cl_target_option *ptr) { enum processor_type old_tune = ix86_tune; enum processor_type old_arch = ix86_arch; @@ -4174,12 +4190,12 @@ ix86_function_specific_restore (struct cl_target_option *ptr) ix86_arch = (enum processor_type) ptr->arch; ix86_schedule = (enum attr_cpu) ptr->schedule; ix86_tune = (enum processor_type) ptr->tune; - ix86_branch_cost = ptr->branch_cost; + opts->x_ix86_branch_cost = ptr->branch_cost; ix86_tune_defaulted = ptr->tune_defaulted; ix86_arch_specified = ptr->arch_specified; - ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit; - target_flags_explicit = ptr->ix86_target_flags_explicit; - recip_mask_explicit = ptr->x_recip_mask_explicit; + opts->x_ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit; + opts->x_ix86_target_flags_explicit = ptr->x_ix86_target_flags_explicit; + opts->x_recip_mask_explicit = ptr->x_recip_mask_explicit; /* Recreate the arch feature tests if the arch changed */ if (old_arch != ix86_arch) @@ -4235,6 +4251,8 @@ ix86_function_specific_print (FILE *file, int indent, static bool ix86_valid_target_attribute_inner_p (tree args, char *p_strings[], + struct gcc_options *opts, + struct gcc_options *opts_set, struct gcc_options *enum_opts_set) { char *next_optstr; @@ -4351,7 +4369,8 @@ ix86_valid_target_attribute_inner_p (tree args, char *p_strings[], for (; args; args = TREE_CHAIN (args)) if (TREE_VALUE (args) && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args), - p_strings, enum_opts_set)) + p_strings, opts, opts_set, + enum_opts_set)) ret = false; return ret; @@ -4434,7 +4453,7 @@ ix86_valid_target_attribute_inner_p (tree args, char *p_strings[], struct cl_decoded_option decoded; generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded); - ix86_handle_option (&global_options, &global_options_set, + ix86_handle_option (opts, opts_set, &decoded, input_location); } @@ -4444,9 +4463,9 @@ ix86_valid_target_attribute_inner_p (tree args, char *p_strings[], opt_set_p = !opt_set_p; if (opt_set_p) - target_flags |= mask; + opts->x_target_flags |= mask; else - target_flags &= ~mask; + opts->x_target_flags &= ~mask; } else if (type == ix86_opt_str) @@ -4467,7 +4486,7 @@ ix86_valid_target_attribute_inner_p (tree args, char *p_strings[], arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET); if (arg_ok) - set_option (&global_options, enum_opts_set, opt, value, + set_option (opts, enum_opts_set, opt, value, p + opt_len, DK_UNSPECIFIED, input_location, global_dc); else @@ -4487,11 +4506,13 @@ ix86_valid_target_attribute_inner_p (tree args, char *p_strings[], /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. 
*/ tree -ix86_valid_target_attribute_tree (tree args) +ix86_valid_target_attribute_tree (tree args, + struct gcc_options *opts, + struct gcc_options *opts_set) { const char *orig_arch_string = ix86_arch_string; const char *orig_tune_string = ix86_tune_string; - enum fpmath_unit orig_fpmath_set = global_options_set.x_ix86_fpmath; + enum fpmath_unit orig_fpmath_set = opts_set->x_ix86_fpmath; int orig_tune_defaulted = ix86_tune_defaulted; int orig_arch_specified = ix86_arch_specified; char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL }; @@ -4504,16 +4525,16 @@ ix86_valid_target_attribute_tree (tree args) memset (&enum_opts_set, 0, sizeof (enum_opts_set)); /* Process each of the options on the chain. */ - if (! ix86_valid_target_attribute_inner_p (args, option_strings, - &enum_opts_set)) + if (! ix86_valid_target_attribute_inner_p (args, option_strings, opts, + opts_set, &enum_opts_set)) return error_mark_node; /* If the changed options are different from the default, rerun ix86_option_override_internal, and then save the options away. The string options are are attribute options, and will be undone when we copy the save structure. */ - if (ix86_isa_flags != def->x_ix86_isa_flags - || target_flags != def->x_target_flags + if (opts->x_ix86_isa_flags != def->x_ix86_isa_flags + || opts->x_target_flags != def->x_target_flags || option_strings[IX86_FUNCTION_SPECIFIC_ARCH] || option_strings[IX86_FUNCTION_SPECIFIC_TUNE] || enum_opts_set.x_ix86_fpmath) @@ -4521,37 +4542,37 @@ ix86_valid_target_attribute_tree (tree args) /* If we are using the default tune= or arch=, undo the string assigned, and use the default. */ if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH]) - ix86_arch_string = option_strings[IX86_FUNCTION_SPECIFIC_ARCH]; + opts->x_ix86_arch_string = option_strings[IX86_FUNCTION_SPECIFIC_ARCH]; else if (!orig_arch_specified) - ix86_arch_string = NULL; + opts->x_ix86_arch_string = NULL; if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE]) - ix86_tune_string = option_strings[IX86_FUNCTION_SPECIFIC_TUNE]; + opts->x_ix86_tune_string = option_strings[IX86_FUNCTION_SPECIFIC_TUNE]; else if (orig_tune_defaulted) - ix86_tune_string = NULL; + opts->x_ix86_tune_string = NULL; /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */ if (enum_opts_set.x_ix86_fpmath) - global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1; + opts_set->x_ix86_fpmath = (enum fpmath_unit) 1; else if (!TARGET_64BIT && TARGET_SSE) { - ix86_fpmath = (enum fpmath_unit) (FPMATH_SSE | FPMATH_387); - global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1; + opts->x_ix86_fpmath = (enum fpmath_unit) (FPMATH_SSE | FPMATH_387); + opts_set->x_ix86_fpmath = (enum fpmath_unit) 1; } /* Do any overrides, such as arch=xxx, or tune=xxx support. */ - ix86_option_override_internal (false); + ix86_option_override_internal (false, opts, opts_set); /* Add any builtin functions with the new isa if any. */ - ix86_add_new_builtins (ix86_isa_flags); + ix86_add_new_builtins (opts->x_ix86_isa_flags); /* Save the current options unless we are validating options for #pragma. 
*/ - t = build_target_option_node (); + t = build_target_option_node (opts); - ix86_arch_string = orig_arch_string; - ix86_tune_string = orig_tune_string; - global_options_set.x_ix86_fpmath = orig_fpmath_set; + opts->x_ix86_arch_string = orig_arch_string; + opts->x_ix86_tune_string = orig_tune_string; + opts_set->x_ix86_fpmath = orig_fpmath_set; /* Free up memory allocated to hold the strings */ for (i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++) @@ -4569,7 +4590,8 @@ ix86_valid_target_attribute_p (tree fndecl, tree args, int ARG_UNUSED (flags)) { - struct cl_target_option cur_target; + struct gcc_options func_options; + tree new_target, new_optimize; bool ret = true; /* attribute((target("default"))) does nothing, beyond @@ -4580,21 +4602,31 @@ ix86_valid_target_attribute_p (tree fndecl, && strcmp (TREE_STRING_POINTER (TREE_VALUE (args)), "default") == 0) return true; - tree old_optimize = build_optimization_node (); - tree new_target, new_optimize; + tree old_optimize = build_optimization_node (&global_options); + + /* Get the optimization options of the current function. */ tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl); + + if (!func_optimize) + func_optimize = old_optimize; + + /* Init func_options. */ + memset (&func_options, 0, sizeof (func_options)); + init_options_struct (&func_options, NULL); + lang_hooks.init_options_struct (&func_options); + + cl_optimization_restore (&func_options, + TREE_OPTIMIZATION (func_optimize)); - /* If the function changed the optimization levels as well as setting target - options, start with the optimizations specified. */ - if (func_optimize && func_optimize != old_optimize) - cl_optimization_restore (&global_options, - TREE_OPTIMIZATION (func_optimize)); + /* Initialize func_options to the default before its target options can + be set. */ + cl_target_option_restore (&func_options, + TREE_TARGET_OPTION (target_option_default_node)); - /* The target attributes may also change some optimization flags, so update - the optimization options if necessary. */ - cl_target_option_save (&cur_target, &global_options); - new_target = ix86_valid_target_attribute_tree (args); - new_optimize = build_optimization_node (); + new_target = ix86_valid_target_attribute_tree (args, &func_options, + &global_options_set); + + new_optimize = build_optimization_node (&func_options); if (new_target == error_mark_node) ret = false; @@ -4607,12 +4639,6 @@ ix86_valid_target_attribute_p (tree fndecl, DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize; } - cl_target_option_restore (&global_options, &cur_target); - - if (old_optimize != new_optimize) - cl_optimization_restore (&global_options, - TREE_OPTIMIZATION (old_optimize)); - return ret; } @@ -7368,9 +7394,15 @@ ix86_function_value_regno_p (const unsigned int regno) switch (regno) { case AX_REG: + case DX_REG: return true; + case DI_REG: + case SI_REG: + return TARGET_64BIT && ix86_abi != MS_ABI; - case FIRST_FLOAT_REG: + /* Complex values are returned in %st(0)/%st(1) pair. */ + case ST0_REG: + case ST1_REG: /* TODO: The function should depend on current function ABI but builtins.c would need updating then. Therefore we use the default ABI. */ @@ -7378,10 +7410,12 @@ ix86_function_value_regno_p (const unsigned int regno) return false; return TARGET_FLOAT_RETURNS_IN_80387; - case FIRST_SSE_REG: + /* Complex values are returned in %xmm0/%xmm1 pair. 
*/ + case XMM0_REG: + case XMM1_REG: return TARGET_SSE; - case FIRST_MMX_REG: + case MM0_REG: if (TARGET_MACHO || TARGET_64BIT) return false; return TARGET_MMX; @@ -11512,8 +11546,8 @@ ix86_expand_split_stack_prologue (void) JUMP_LABEL (jump_insn) = label; /* Mark the jump as very likely to be taken. */ - add_reg_note (jump_insn, REG_BR_PROB, - GEN_INT (REG_BR_PROB_BASE - REG_BR_PROB_BASE / 100)); + add_int_reg_note (jump_insn, REG_BR_PROB, + REG_BR_PROB_BASE - REG_BR_PROB_BASE / 100); if (split_stack_fn == NULL_RTX) split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack"); @@ -14809,7 +14843,7 @@ ix86_print_operand (FILE *file, rtx x, int code) x = find_reg_note (current_output_insn, REG_BR_PROB, 0); if (x) { - int pred_val = INTVAL (XEXP (x, 0)); + int pred_val = XINT (x, 0); if (pred_val < REG_BR_PROB_BASE * 45 / 100 || pred_val > REG_BR_PROB_BASE * 55 / 100) @@ -16458,8 +16492,8 @@ ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1) gcc_unreachable (); case V32QImode: extract = gen_avx_vextractf128v32qi; - load_unaligned = gen_avx_loaddqu256; - store_unaligned = gen_avx_storedqu256; + load_unaligned = gen_avx_loaddquv32qi; + store_unaligned = gen_avx_storedquv32qi; mode = V16QImode; break; case V8SFmode: @@ -16562,10 +16596,56 @@ void ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[]) { rtx op0, op1, m; + rtx (*load_unaligned) (rtx, rtx); + rtx (*store_unaligned) (rtx, rtx); op0 = operands[0]; op1 = operands[1]; + if (GET_MODE_SIZE (mode) == 64) + { + switch (GET_MODE_CLASS (mode)) + { + case MODE_VECTOR_INT: + case MODE_INT: + op0 = gen_lowpart (V16SImode, op0); + op1 = gen_lowpart (V16SImode, op1); + /* FALLTHRU */ + + case MODE_VECTOR_FLOAT: + switch (GET_MODE (op0)) + { + default: + gcc_unreachable (); + case V16SImode: + load_unaligned = gen_avx512f_loaddquv16si; + store_unaligned = gen_avx512f_storedquv16si; + break; + case V16SFmode: + load_unaligned = gen_avx512f_loadups512; + store_unaligned = gen_avx512f_storeups512; + break; + case V8DFmode: + load_unaligned = gen_avx512f_loadupd512; + store_unaligned = gen_avx512f_storeupd512; + break; + } + + if (MEM_P (op1)) + emit_insn (load_unaligned (op0, op1)); + else if (MEM_P (op0)) + emit_insn (store_unaligned (op0, op1)); + else + gcc_unreachable (); + break; + + default: + gcc_unreachable (); + } + + return; + } + if (TARGET_AVX && GET_MODE_SIZE (mode) == 32) { @@ -16598,7 +16678,7 @@ ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[]) op0 = gen_lowpart (V16QImode, op0); op1 = gen_lowpart (V16QImode, op1); /* We will eventually emit movups based on insn attributes. */ - emit_insn (gen_sse2_loaddqu (op0, op1)); + emit_insn (gen_sse2_loaddquv16qi (op0, op1)); } else if (TARGET_SSE2 && mode == V2DFmode) { @@ -16673,7 +16753,7 @@ ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[]) op0 = gen_lowpart (V16QImode, op0); op1 = gen_lowpart (V16QImode, op1); /* We will eventually emit movups based on insn attributes. 
*/ - emit_insn (gen_sse2_storedqu (op0, op1)); + emit_insn (gen_sse2_storedquv16qi (op0, op1)); } else if (TARGET_SSE2 && mode == V2DFmode) { @@ -16812,8 +16892,10 @@ ix86_fixup_binary_operands (enum rtx_code code, enum machine_mode mode, src2 = force_reg (mode, src2); src1 = src2; } - else + else if (rtx_equal_p (dst, src1)) src2 = force_reg (mode, src2); + else + src1 = force_reg (mode, src1); } /* If the destination is memory, and we do not have matching source @@ -19455,7 +19537,7 @@ ix86_split_fp_branch (enum rtx_code code, rtx op1, rtx op2, gen_rtx_IF_THEN_ELSE (VOIDmode, condition, target1, target2))); if (split_branch_probability >= 0) - add_reg_note (i, REG_BR_PROB, GEN_INT (split_branch_probability)); + add_int_reg_note (i, REG_BR_PROB, split_branch_probability); } void @@ -21962,7 +22044,7 @@ predict_jump (int prob) { rtx insn = get_last_insn (); gcc_assert (JUMP_P (insn)); - add_reg_note (insn, REG_BR_PROB, GEN_INT (prob)); + add_int_reg_note (insn, REG_BR_PROB, prob); } /* Helper function for the string operations below. Dest VARIABLE whether @@ -22035,6 +22117,21 @@ counter_mode (rtx count_exp) return SImode; } +/* Copy the address to a Pmode register. This is used for x32 to + truncate DImode TLS address to a SImode register. */ + +static rtx +ix86_copy_addr_to_reg (rtx addr) +{ + if (GET_MODE (addr) == Pmode) + return copy_addr_to_reg (addr); + else + { + gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode); + return gen_rtx_SUBREG (SImode, copy_to_mode_reg (DImode, addr), 0); + } +} + /* When SRCPTR is non-NULL, output simple loop to move memory pointer to SRCPTR to DESTPTR via chunks of MODE unrolled UNROLL times, overall size is COUNT specified in bytes. When SRCPTR is NULL, output the @@ -22991,8 +23088,8 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp, if (!count) count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp); - destreg = copy_addr_to_reg (XEXP (dst, 0)); - srcreg = copy_addr_to_reg (XEXP (src, 0)); + destreg = ix86_copy_addr_to_reg (XEXP (dst, 0)); + srcreg = ix86_copy_addr_to_reg (XEXP (src, 0)); unroll_factor = 1; move_mode = word_mode; @@ -23395,7 +23492,7 @@ ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp, if (!count) count_exp = copy_to_mode_reg (counter_mode (count_exp), count_exp); - destreg = copy_addr_to_reg (XEXP (dst, 0)); + destreg = ix86_copy_addr_to_reg (XEXP (dst, 0)); move_mode = word_mode; unroll_factor = 1; @@ -24390,17 +24487,14 @@ ix86_issue_rate (void) case PROCESSOR_SLM: case PROCESSOR_K6: case PROCESSOR_BTVER2: + case PROCESSOR_PENTIUM4: + case PROCESSOR_NOCONA: return 2; case PROCESSOR_PENTIUMPRO: - case PROCESSOR_PENTIUM4: - case PROCESSOR_CORE2: - case PROCESSOR_COREI7: - case PROCESSOR_HASWELL: case PROCESSOR_ATHLON: case PROCESSOR_K8: case PROCESSOR_AMDFAM10: - case PROCESSOR_NOCONA: case PROCESSOR_GENERIC: case PROCESSOR_BDVER1: case PROCESSOR_BDVER2: @@ -24408,6 +24502,11 @@ ix86_issue_rate (void) case PROCESSOR_BTVER1: return 3; + case PROCESSOR_CORE2: + case PROCESSOR_COREI7: + case PROCESSOR_HASWELL: + return 4; + default: return 1; } @@ -24664,10 +24763,15 @@ ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost) case PROCESSOR_BDVER3: case PROCESSOR_BTVER1: case PROCESSOR_BTVER2: - case PROCESSOR_ATOM: case PROCESSOR_GENERIC: memory = get_attr_memory (insn); + /* Stack engine allows push&pop instructions to execute in parallel. 
*/ + if (((insn_type == TYPE_PUSH || insn_type == TYPE_POP) + && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP)) + && (ix86_tune != PROCESSOR_ATHLON && ix86_tune != PROCESSOR_K8)) + return 0; + /* Show ability of reorder buffer to hide latency of load by executing in parallel with previous instruction in case previous instruction is not needed to compute the address. */ @@ -24694,6 +24798,29 @@ } break; + case PROCESSOR_CORE2: + case PROCESSOR_COREI7: + case PROCESSOR_HASWELL: + memory = get_attr_memory (insn); + + /* Stack engine allows push&pop instructions to execute in parallel. */ + if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP) + && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP)) + return 0; + + /* Show ability of reorder buffer to hide latency of load by executing + in parallel with previous instruction in case + previous instruction is not needed to compute the address. */ + if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH) + && !ix86_agi_dependent (dep_insn, insn)) + { + if (cost >= 4) + cost -= 4; + else + cost = 0; + } + break; + case PROCESSOR_SLM: if (!reload_completed) return cost; @@ -27400,13 +27527,13 @@ static const struct builtin_description bdesc_special_args[] = { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lfence, "__builtin_ia32_lfence", IX86_BUILTIN_LFENCE, UNKNOWN, (int) VOID_FTYPE_VOID }, { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_mfence, 0, IX86_BUILTIN_MFENCE, UNKNOWN, (int) VOID_FTYPE_VOID }, { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_storeupd, "__builtin_ia32_storeupd", IX86_BUILTIN_STOREUPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF }, - { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_storedqu, "__builtin_ia32_storedqu", IX86_BUILTIN_STOREDQU, UNKNOWN, (int) VOID_FTYPE_PCHAR_V16QI }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_storedquv16qi, "__builtin_ia32_storedqu", IX86_BUILTIN_STOREDQU, UNKNOWN, (int) VOID_FTYPE_PCHAR_V16QI }, { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2df, "__builtin_ia32_movntpd", IX86_BUILTIN_MOVNTPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF }, { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2di, "__builtin_ia32_movntdq", IX86_BUILTIN_MOVNTDQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI }, { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntisi, "__builtin_ia32_movnti", IX86_BUILTIN_MOVNTI, UNKNOWN, (int) VOID_FTYPE_PINT_INT }, { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_movntidi, "__builtin_ia32_movnti64", IX86_BUILTIN_MOVNTI64, UNKNOWN, (int) VOID_FTYPE_PLONGLONG_LONGLONG }, { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadupd, "__builtin_ia32_loadupd", IX86_BUILTIN_LOADUPD, UNKNOWN, (int) V2DF_FTYPE_PCDOUBLE }, - { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loaddqu, "__builtin_ia32_loaddqu", IX86_BUILTIN_LOADDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loaddquv16qi, "__builtin_ia32_loaddqu", IX86_BUILTIN_LOADDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR }, { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadhpd_exp, "__builtin_ia32_loadhpd", IX86_BUILTIN_LOADHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE }, { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadlpd_exp, "__builtin_ia32_loadlpd", IX86_BUILTIN_LOADLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE }, @@ -27435,8 +27562,8 @@ static const struct builtin_description bdesc_special_args[] = { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loadups256, "__builtin_ia32_loadups256", IX86_BUILTIN_LOADUPS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT }, { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storeupd256, "__builtin_ia32_storeupd256", IX86_BUILTIN_STOREUPD256, 
UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF }, { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storeups256, "__builtin_ia32_storeups256", IX86_BUILTIN_STOREUPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF }, - { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loaddqu256, "__builtin_ia32_loaddqu256", IX86_BUILTIN_LOADDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR }, - { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storedqu256, "__builtin_ia32_storedqu256", IX86_BUILTIN_STOREDQU256, UNKNOWN, (int) VOID_FTYPE_PCHAR_V32QI }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loaddquv32qi, "__builtin_ia32_loaddqu256", IX86_BUILTIN_LOADDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storedquv32qi, "__builtin_ia32_storedqu256", IX86_BUILTIN_STOREDQU256, UNKNOWN, (int) VOID_FTYPE_PCHAR_V32QI }, { OPTION_MASK_ISA_AVX, CODE_FOR_avx_lddqu256, "__builtin_ia32_lddqu256", IX86_BUILTIN_LDDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR }, { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4di, "__builtin_ia32_movntdq256", IX86_BUILTIN_MOVNTDQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI }, @@ -27705,7 +27832,7 @@ static const struct builtin_description bdesc_args[] = { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsd2siq, "__builtin_ia32_cvtsd2si64", IX86_BUILTIN_CVTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF }, { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvttsd2siq, "__builtin_ia32_cvttsd2si64", IX86_BUILTIN_CVTTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF }, - { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2dq, "__builtin_ia32_cvtps2dq", IX86_BUILTIN_CVTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_fix_notruncv4sfv4si, "__builtin_ia32_cvtps2dq", IX86_BUILTIN_CVTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF }, { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2pd, "__builtin_ia32_cvtps2pd", IX86_BUILTIN_CVTPS2PD, UNKNOWN, (int) V2DF_FTYPE_V4SF }, { OPTION_MASK_ISA_SSE2, CODE_FOR_fix_truncv4sfv4si2, "__builtin_ia32_cvttps2dq", IX86_BUILTIN_CVTTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF }, @@ -28053,7 +28180,7 @@ static const struct builtin_description bdesc_args[] = { OPTION_MASK_ISA_AVX, CODE_FOR_floatv4siv4df2, "__builtin_ia32_cvtdq2pd256", IX86_BUILTIN_CVTDQ2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SI }, { OPTION_MASK_ISA_AVX, CODE_FOR_floatv8siv8sf2, "__builtin_ia32_cvtdq2ps256", IX86_BUILTIN_CVTDQ2PS256, UNKNOWN, (int) V8SF_FTYPE_V8SI }, { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2ps256, "__builtin_ia32_cvtpd2ps256", IX86_BUILTIN_CVTPD2PS256, UNKNOWN, (int) V4SF_FTYPE_V4DF }, - { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2dq256, "__builtin_ia32_cvtps2dq256", IX86_BUILTIN_CVTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_fix_notruncv8sfv8si, "__builtin_ia32_cvtps2dq256", IX86_BUILTIN_CVTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF }, { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2pd256, "__builtin_ia32_cvtps2pd256", IX86_BUILTIN_CVTPS2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SF }, { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv4dfv4si2, "__builtin_ia32_cvttpd2dq256", IX86_BUILTIN_CVTTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF }, { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2dq256, "__builtin_ia32_cvtpd2dq256", IX86_BUILTIN_CVTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF }, @@ -29222,7 +29349,8 @@ get_builtin_code_for_version (tree decl, tree *predicate_list) if (strstr (attrs_str, "arch=") != NULL) { cl_target_option_save (&cur_target, &global_options); - target_node = ix86_valid_target_attribute_tree (attrs); + target_node = ix86_valid_target_attribute_tree (attrs, &global_options, + &global_options_set); 
gcc_assert (target_node); new_target = TREE_TARGET_OPTION (target_node); @@ -34739,7 +34867,7 @@ ix86_rtx_costs (rtx x, int code_i, int outer_code_i, int opno, int *total, rtx sub; gcc_assert (FLOAT_MODE_P (mode)); - gcc_assert (TARGET_FMA || TARGET_FMA4); + gcc_assert (TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F); /* ??? SSE scalar/vector cost should be used here. */ /* ??? Bald assumption that fma has the same cost as fmul. */ @@ -38217,6 +38345,7 @@ static rtx ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1, bool swap_operands) { + enum machine_mode fpcmp_mode = ix86_fp_compare_mode (code); rtx label, tmp; if (swap_operands) @@ -38227,9 +38356,9 @@ ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1, } label = gen_label_rtx (); - tmp = gen_rtx_REG (CCFPUmode, FLAGS_REG); + tmp = gen_rtx_REG (fpcmp_mode, FLAGS_REG); emit_insn (gen_rtx_SET (VOIDmode, tmp, - gen_rtx_COMPARE (CCFPUmode, op0, op1))); + gen_rtx_COMPARE (fpcmp_mode, op0, op1))); tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx); tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp, gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx); @@ -42736,20 +42865,17 @@ ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind, unsigned *cost = (unsigned *) data; unsigned retval = 0; - if (flag_vect_cost_model) - { - tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE; - int stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign); + tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE; + int stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign); - /* Statements in an inner loop relative to the loop being - vectorized are weighted more heavily. The value here is - arbitrary and could potentially be improved with analysis. */ - if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info)) - count *= 50; /* FIXME. */ + /* Statements in an inner loop relative to the loop being + vectorized are weighted more heavily. The value here is + arbitrary and could potentially be improved with analysis. */ + if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info)) + count *= 50; /* FIXME. */ - retval = (unsigned) (count * stmt_cost); - cost[where] += retval; - } + retval = (unsigned) (count * stmt_cost); + cost[where] += retval; return retval; } diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 440844e7735..63e49032748 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -40,53 +40,99 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see /* Redefines for option macros. 
*/ #define TARGET_64BIT TARGET_ISA_64BIT +#define TARGET_64BIT_P(x) TARGET_ISA_64BIT_P(x) #define TARGET_MMX TARGET_ISA_MMX +#define TARGET_MMX_P(x) TARGET_ISA_MMX_P(x) #define TARGET_3DNOW TARGET_ISA_3DNOW +#define TARGET_3DNOW_P(x) TARGET_ISA_3DNOW_P(x) #define TARGET_3DNOW_A TARGET_ISA_3DNOW_A +#define TARGET_3DNOW_A_P(x) TARGET_ISA_3DNOW_A_P(x) #define TARGET_SSE TARGET_ISA_SSE +#define TARGET_SSE_P(x) TARGET_ISA_SSE_P(x) #define TARGET_SSE2 TARGET_ISA_SSE2 +#define TARGET_SSE2_P(x) TARGET_ISA_SSE2_P(x) #define TARGET_SSE3 TARGET_ISA_SSE3 +#define TARGET_SSE3_P(x) TARGET_ISA_SSE3_P(x) #define TARGET_SSSE3 TARGET_ISA_SSSE3 +#define TARGET_SSSE3_P(x) TARGET_ISA_SSSE3_P(x) #define TARGET_SSE4_1 TARGET_ISA_SSE4_1 +#define TARGET_SSE4_1_P(x) TARGET_ISA_SSE4_1_P(x) #define TARGET_SSE4_2 TARGET_ISA_SSE4_2 +#define TARGET_SSE4_2_P(x) TARGET_ISA_SSE4_2_P(x) #define TARGET_AVX TARGET_ISA_AVX +#define TARGET_AVX_P(x) TARGET_ISA_AVX_P(x) #define TARGET_AVX2 TARGET_ISA_AVX2 +#define TARGET_AVX2_P(x) TARGET_ISA_AVX2_P(x) #define TARGET_AVX512F TARGET_ISA_AVX512F +#define TARGET_AVX512F_P(x) TARGET_ISA_AVX512F_P(x) #define TARGET_AVX512PF TARGET_ISA_AVX512PF +#define TARGET_AVX512PF_P(x) TARGET_ISA_AVX512PF_P(x) #define TARGET_AVX512ER TARGET_ISA_AVX512ER +#define TARGET_AVX512ER_P(x) TARGET_ISA_AVX512ER_P(x) #define TARGET_AVX512CD TARGET_ISA_AVX512CD +#define TARGET_AVX512CD_P(x) TARGET_ISA_AVX512CD_P(x) #define TARGET_FMA TARGET_ISA_FMA +#define TARGET_FMA_P(x) TARGET_ISA_FMA_P(x) #define TARGET_SSE4A TARGET_ISA_SSE4A +#define TARGET_SSE4A_P(x) TARGET_ISA_SSE4A_P(x) #define TARGET_FMA4 TARGET_ISA_FMA4 +#define TARGET_FMA4_P(x) TARGET_ISA_FMA4_P(x) #define TARGET_XOP TARGET_ISA_XOP +#define TARGET_XOP_P(x) TARGET_ISA_XOP_P(x) #define TARGET_LWP TARGET_ISA_LWP +#define TARGET_LWP_P(x) TARGET_ISA_LWP_P(x) #define TARGET_ROUND TARGET_ISA_ROUND #define TARGET_ABM TARGET_ISA_ABM +#define TARGET_ABM_P(x) TARGET_ISA_ABM_P(x) #define TARGET_BMI TARGET_ISA_BMI +#define TARGET_BMI_P(x) TARGET_ISA_BMI_P(x) #define TARGET_BMI2 TARGET_ISA_BMI2 +#define TARGET_BMI2_P(x) TARGET_ISA_BMI2_P(x) #define TARGET_LZCNT TARGET_ISA_LZCNT +#define TARGET_LZCNT_P(x) TARGET_ISA_LZCNT_P(x) #define TARGET_TBM TARGET_ISA_TBM +#define TARGET_TBM_P(x) TARGET_ISA_TBM_P(x) #define TARGET_POPCNT TARGET_ISA_POPCNT +#define TARGET_POPCNT_P(x) TARGET_ISA_POPCNT_P(x) #define TARGET_SAHF TARGET_ISA_SAHF +#define TARGET_SAHF_P(x) TARGET_ISA_SAHF_P(x) #define TARGET_MOVBE TARGET_ISA_MOVBE +#define TARGET_MOVBE_P(x) TARGET_ISA_MOVBE_P(x) #define TARGET_CRC32 TARGET_ISA_CRC32 +#define TARGET_CRC32_P(x) TARGET_ISA_CRC32_P(x) #define TARGET_AES TARGET_ISA_AES +#define TARGET_AES_P(x) TARGET_ISA_AES_P(x) #define TARGET_PCLMUL TARGET_ISA_PCLMUL +#define TARGET_PCLMUL_P(x) TARGET_ISA_PCLMUL_P(x) #define TARGET_CMPXCHG16B TARGET_ISA_CX16 +#define TARGET_CMPXCHG16B_P(x) TARGET_ISA_CX16_P(x) #define TARGET_FSGSBASE TARGET_ISA_FSGSBASE +#define TARGET_FSGSBASE_P(x) TARGET_ISA_FSGSBASE_P(x) #define TARGET_RDRND TARGET_ISA_RDRND +#define TARGET_RDRND_P(x) TARGET_ISA_RDRND_P(x) #define TARGET_F16C TARGET_ISA_F16C +#define TARGET_F16C_P(x) TARGET_ISA_F16C_P(x) #define TARGET_RTM TARGET_ISA_RTM +#define TARGET_RTM_P(x) TARGET_ISA_RTM_P(x) #define TARGET_HLE TARGET_ISA_HLE +#define TARGET_HLE_P(x) TARGET_ISA_HLE_P(x) #define TARGET_RDSEED TARGET_ISA_RDSEED +#define TARGET_RDSEED_P(x) TARGET_ISA_RDSEED_P(x) #define TARGET_PRFCHW TARGET_ISA_PRFCHW +#define TARGET_PRFCHW_P(x) TARGET_ISA_PRFCHW_P(x) #define TARGET_ADX TARGET_ISA_ADX +#define TARGET_ADX_P(x) 
TARGET_ISA_ADX_P(x) #define TARGET_FXSR TARGET_ISA_FXSR +#define TARGET_FXSR_P(x) TARGET_ISA_FXSR_P(x) #define TARGET_XSAVE TARGET_ISA_XSAVE +#define TARGET_XSAVE_P(x) TARGET_ISA_XSAVE_P(x) #define TARGET_XSAVEOPT TARGET_ISA_XSAVEOPT +#define TARGET_XSAVEOPT_P(x) TARGET_ISA_XSAVEOPT_P(x) #define TARGET_LP64 TARGET_ABI_64 +#define TARGET_LP64_P(x) TARGET_ABI_64_P(x) #define TARGET_X32 TARGET_ABI_X32 +#define TARGET_X32_P(x) TARGET_ABI_X32_P(x) /* SSE4.1 defines round instructions */ #define OPTION_MASK_ISA_ROUND OPTION_MASK_ISA_SSE4_1 @@ -212,7 +258,13 @@ extern const struct processor_costs ix86_size_cost; (TARGET_64BIT && TARGET_SSE ? FPMATH_SSE : FPMATH_387) #endif +#ifndef TARGET_FPMATH_DEFAULT_P +#define TARGET_FPMATH_DEFAULT_P(x) \ + (TARGET_64BIT_P(x) && TARGET_SSE_P(x) ? FPMATH_SSE : FPMATH_387) +#endif + #define TARGET_FLOAT_RETURNS_IN_80387 TARGET_FLOAT_RETURNS +#define TARGET_FLOAT_RETURNS_IN_80387_P(x) TARGET_FLOAT_RETURNS_P(x) /* 64bit Sledgehammer mode. For libgcc2 we make sure this is a compile-time constant. */ @@ -1492,13 +1544,26 @@ enum reg_class will be computed and placed into the variable `crtl->outgoing_args_size'. No space will be pushed onto the stack for each call; instead, the function prologue should increase the stack frame size by this amount. + + In 32bit mode enabling argument accumulation results in about 5% code size + growth because move instructions are less compact than push. In 64bit + mode the difference is less drastic but visible. + + FIXME: Unlike earlier implementations, the size of unwind info seems to + actually grow with accumulation. Is that because accumulated args + unwind info became unnecessarily bloated? 64-bit MS ABI seem to require 16 byte alignment everywhere except for - function prologue and apilogue. This is not possible without - ACCUMULATE_OUTGOING_ARGS. */ + function prologue and epilogue. This is not possible without + ACCUMULATE_OUTGOING_ARGS. + + If stack probes are required, the space used for large function + arguments on the stack must also be probed, so enable + -maccumulate-outgoing-args so this happens in the prologue. */ #define ACCUMULATE_OUTGOING_ARGS \ - (TARGET_ACCUMULATE_OUTGOING_ARGS || TARGET_64BIT_MS_ABI) + ((TARGET_ACCUMULATE_OUTGOING_ARGS && optimize_function_for_speed_p (cfun)) \ + || TARGET_STACK_PROBE || TARGET_64BIT_MS_ABI) /* If defined, a C expression whose value is nonzero when we want to use PUSH instructions to pass outgoing arguments. */ diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index e009bc96fc2..ad79589c0c7 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -746,6 +746,8 @@ (define_code_iterator sat_plusminus [ss_plus us_plus ss_minus us_minus]) +(define_code_iterator multdiv [mult div]) + ;; Base name for define_insn (define_code_attr plusminus_insn [(plus "add") (ss_plus "ssadd") (us_plus "usadd") @@ -757,6 +759,8 @@ (minus "sub") (ss_minus "subs") (us_minus "subus")]) (define_code_attr plusminus_carry_mnemonic [(plus "adc") (minus "sbb")]) +(define_code_attr multdiv_mnemonic + [(mult "mul") (div "div")]) ;; Mark commutative operators as such in constraints. (define_code_attr comm [(plus "%") (ss_plus "%") (us_plus "%") @@ -779,6 +783,7 @@ ;; Mapping of logic operators (define_code_iterator any_logic [and ior xor]) (define_code_iterator any_or [ior xor]) +(define_code_iterator fpint_logic [and xor]) ;; Base name for insn mnemonic. 
(define_code_attr logic [(and "and") (ior "or") (xor "xor")]) @@ -826,6 +831,10 @@ (define_code_attr s [(sign_extend "s") (zero_extend "u")]) (define_code_attr u_bool [(sign_extend "false") (zero_extend "true")]) +;; Used in signed and unsigned fix. +(define_code_iterator any_fix [fix unsigned_fix]) +(define_code_attr fixsuffix [(fix "") (unsigned_fix "u")]) + ;; All integer modes. (define_mode_iterator SWI1248x [QI HI SI DI]) @@ -5116,6 +5125,61 @@ emit_move_insn (operands[0], CONST0_RTX (<ssevecmode>mode)); }) +;; Break partial reg stall for cvtsd2ss. + +(define_peephole2 + [(set (match_operand:SF 0 "register_operand") + (float_truncate:SF + (match_operand:DF 1 "nonimmediate_operand")))] + "TARGET_SSE2 && TARGET_SSE_MATH + && TARGET_SSE_PARTIAL_REG_DEPENDENCY + && optimize_function_for_speed_p (cfun) + && SSE_REG_P (operands[0]) + && (!SSE_REG_P (operands[1]) + || REGNO (operands[0]) != REGNO (operands[1]))" + [(set (match_dup 0) + (vec_merge:V4SF + (vec_duplicate:V4SF + (float_truncate:V2SF + (match_dup 1))) + (match_dup 0) + (const_int 1)))] +{ + operands[0] = simplify_gen_subreg (V4SFmode, operands[0], + SFmode, 0); + operands[1] = simplify_gen_subreg (V2DFmode, operands[1], + DFmode, 0); + emit_move_insn (operands[0], CONST0_RTX (V4SFmode)); +}) + +;; Break partial reg stall for cvtss2sd. + +(define_peephole2 + [(set (match_operand:DF 0 "register_operand") + (float_extend:DF + (match_operand:SF 1 "nonimmediate_operand")))] + "TARGET_SSE2 && TARGET_SSE_MATH + && TARGET_SSE_PARTIAL_REG_DEPENDENCY + && optimize_function_for_speed_p (cfun) + && SSE_REG_P (operands[0]) + && (!SSE_REG_P (operands[1]) + || REGNO (operands[0]) != REGNO (operands[1]))" + [(set (match_dup 0) + (vec_merge:V2DF + (float_extend:V2DF + (vec_select:V2SF + (match_dup 1) + (parallel [(const_int 0) (const_int 1)]))) + (match_dup 0) + (const_int 1)))] +{ + operands[0] = simplify_gen_subreg (V2DFmode, operands[0], + DFmode, 0); + operands[1] = simplify_gen_subreg (V4SFmode, operands[1], + SFmode, 0); + emit_move_insn (operands[0], CONST0_RTX (V2DFmode)); +}) + ;; Avoid store forwarding (partial memory) stall penalty ;; by passing DImode value through XMM registers. */ diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt index 5495c295f57..1e624ab0222 100644 --- a/gcc/config/i386/i386.opt +++ b/gcc/config/i386/i386.opt @@ -61,9 +61,13 @@ TargetSave HOST_WIDE_INT x_ix86_isa_flags_explicit ;; which flags were passed by the user -TargetSave +Variable int ix86_target_flags_explicit +;; which flags were passed by the user +TargetSave +HOST_WIDE_INT x_ix86_target_flags_explicit + ;; whether -mtune was not specified TargetSave unsigned char tune_defaulted diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md index 18f425c4b87..06b291445a9 100644 --- a/gcc/config/i386/predicates.md +++ b/gcc/config/i386/predicates.md @@ -757,11 +757,21 @@ (and (match_code "const_int") (match_test "IN_RANGE (INTVAL (op), 8, 11)"))) +;; Match 8 to 15. +(define_predicate "const_8_to_15_operand" + (and (match_code "const_int") + (match_test "IN_RANGE (INTVAL (op), 8, 15)"))) + ;; Match 12 to 15. (define_predicate "const_12_to_15_operand" (and (match_code "const_int") (match_test "IN_RANGE (INTVAL (op), 12, 15)"))) +;; Match 16 to 31. +(define_predicate "const_16_to_31_operand" + (and (match_code "const_int") + (match_test "IN_RANGE (INTVAL (op), 16, 31)"))) + ;; True if this is a constant appropriate for an increment or decrement. 
(define_predicate "incdec_operand" (match_code "const_int") @@ -1332,3 +1342,9 @@ (define_predicate "general_vector_operand" (ior (match_operand 0 "nonimmediate_operand") (match_code "const_vector"))) + +;; Return true if OP is either -1 constant or stored in register. +(define_predicate "register_or_constm1_operand" + (ior (match_operand 0 "register_operand") + (and (match_code "const_int") + (match_test "op == constm1_rtx")))) diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 9d9469e2c62..0c52e884eb7 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -83,6 +83,34 @@ UNSPEC_VPERMTI UNSPEC_GATHER UNSPEC_VSIBADDR + + ;; For AVX512F support + UNSPEC_VPERMI2 + UNSPEC_VPERMT2 + UNSPEC_UNSIGNED_PCMP + UNSPEC_TESTM + UNSPEC_TESTNM + UNSPEC_SCATTER + UNSPEC_RCP14 + UNSPEC_RSQRT14 + UNSPEC_FIXUPIMM + UNSPEC_SCALEF + UNSPEC_VTERNLOG + UNSPEC_GETEXP + UNSPEC_GETMANT + UNSPEC_ALIGN + UNSPEC_CONFLICT + UNSPEC_MASKED_EQ + UNSPEC_MASKED_GT + + ;; For AVX512PF support + UNSPEC_GATHER_PREFETCH + UNSPEC_SCATTER_PREFETCH + + ;; For AVX512ER support + UNSPEC_EXP2 + UNSPEC_RCP28 + UNSPEC_RSQRT28 ]) (define_c_enum "unspecv" [ @@ -97,22 +125,22 @@ ;; All vector modes including V?TImode, used in move patterns. (define_mode_iterator VMOVE - [(V32QI "TARGET_AVX") V16QI - (V16HI "TARGET_AVX") V8HI - (V8SI "TARGET_AVX") V4SI - (V4DI "TARGET_AVX") V2DI + [(V64QI "TARGET_AVX512F") (V32QI "TARGET_AVX") V16QI + (V32HI "TARGET_AVX512F") (V16HI "TARGET_AVX") V8HI + (V16SI "TARGET_AVX512F") (V8SI "TARGET_AVX") V4SI + (V8DI "TARGET_AVX512F") (V4DI "TARGET_AVX") V2DI (V2TI "TARGET_AVX") V1TI - (V8SF "TARGET_AVX") V4SF - (V4DF "TARGET_AVX") V2DF]) + (V16SF "TARGET_AVX512F") (V8SF "TARGET_AVX") V4SF + (V8DF "TARGET_AVX512F") (V4DF "TARGET_AVX") V2DF]) ;; All vector modes (define_mode_iterator V [(V32QI "TARGET_AVX") V16QI (V16HI "TARGET_AVX") V8HI - (V8SI "TARGET_AVX") V4SI - (V4DI "TARGET_AVX") V2DI - (V8SF "TARGET_AVX") V4SF - (V4DF "TARGET_AVX") (V2DF "TARGET_SSE2")]) + (V16SI "TARGET_AVX512F") (V8SI "TARGET_AVX") V4SI + (V8DI "TARGET_AVX512F") (V4DI "TARGET_AVX") V2DI + (V16SF "TARGET_AVX512F") (V8SF "TARGET_AVX") V4SF + (V8DF "TARGET_AVX512F") (V4DF "TARGET_AVX") (V2DF "TARGET_SSE2")]) ;; All 128bit vector modes (define_mode_iterator V_128 @@ -122,19 +150,44 @@ (define_mode_iterator V_256 [V32QI V16HI V8SI V4DI V8SF V4DF]) +;; All 512bit vector modes +(define_mode_iterator V_512 [V64QI V32HI V16SI V8DI V16SF V8DF]) + +;; All 256bit and 512bit vector modes +(define_mode_iterator V_256_512 + [V32QI V16HI V8SI V4DI V8SF V4DF + (V64QI "TARGET_AVX512F") (V32HI "TARGET_AVX512F") (V16SI "TARGET_AVX512F") + (V8DI "TARGET_AVX512F") (V16SF "TARGET_AVX512F") (V8DF "TARGET_AVX512F")]) + ;; All vector float modes (define_mode_iterator VF + [(V16SF "TARGET_AVX512F") (V8SF "TARGET_AVX") V4SF + (V8DF "TARGET_AVX512F") (V4DF "TARGET_AVX") (V2DF "TARGET_SSE2")]) + +;; 128- and 256-bit float vector modes +(define_mode_iterator VF_128_256 [(V8SF "TARGET_AVX") V4SF (V4DF "TARGET_AVX") (V2DF "TARGET_SSE2")]) ;; All SFmode vector float modes (define_mode_iterator VF1 + [(V16SF "TARGET_AVX512F") (V8SF "TARGET_AVX") V4SF]) + +;; 128- and 256-bit SF vector modes +(define_mode_iterator VF1_128_256 [(V8SF "TARGET_AVX") V4SF]) ;; All DFmode vector float modes (define_mode_iterator VF2 + [(V8DF "TARGET_AVX512F") (V4DF "TARGET_AVX") V2DF]) + +;; 128- and 256-bit DF vector modes +(define_mode_iterator VF2_128_256 [(V4DF "TARGET_AVX") V2DF]) +(define_mode_iterator VF2_512_256 + [(V8DF "TARGET_AVX512F") (V4DF 
"TARGET_AVX")]) + ;; All 128bit vector float modes (define_mode_iterator VF_128 [V4SF (V2DF "TARGET_SSE2")]) @@ -143,9 +196,14 @@ (define_mode_iterator VF_256 [V8SF V4DF]) +;; All 512bit vector float modes +(define_mode_iterator VF_512 + [V16SF V8DF]) + ;; All vector integer modes (define_mode_iterator VI - [(V32QI "TARGET_AVX") V16QI + [(V16SI "TARGET_AVX512F") (V8DI "TARGET_AVX512F") + (V32QI "TARGET_AVX") V16QI (V16HI "TARGET_AVX") V8HI (V8SI "TARGET_AVX") V4SI (V4DI "TARGET_AVX") V2DI]) @@ -153,16 +211,20 @@ (define_mode_iterator VI_AVX2 [(V32QI "TARGET_AVX2") V16QI (V16HI "TARGET_AVX2") V8HI - (V8SI "TARGET_AVX2") V4SI - (V4DI "TARGET_AVX2") V2DI]) + (V16SI "TARGET_AVX512F") (V8SI "TARGET_AVX2") V4SI + (V8DI "TARGET_AVX512F") (V4DI "TARGET_AVX2") V2DI]) ;; All QImode vector integer modes (define_mode_iterator VI1 [(V32QI "TARGET_AVX") V16QI]) +(define_mode_iterator VI_UNALIGNED_LOADSTORE + [(V32QI "TARGET_AVX") V16QI + (V16SI "TARGET_AVX512F") (V8DI "TARGET_AVX512F")]) + ;; All DImode vector integer modes (define_mode_iterator VI8 - [(V4DI "TARGET_AVX") V2DI]) + [(V8DI "TARGET_AVX512F") (V4DI "TARGET_AVX") V2DI]) (define_mode_iterator VI1_AVX2 [(V32QI "TARGET_AVX2") V16QI]) @@ -170,12 +232,36 @@ (define_mode_iterator VI2_AVX2 [(V16HI "TARGET_AVX2") V8HI]) +(define_mode_iterator VI2_AVX512F + [(V32HI "TARGET_AVX512F") (V16HI "TARGET_AVX2") V8HI]) + +(define_mode_iterator VI4_AVX + [(V8SI "TARGET_AVX") V4SI]) + (define_mode_iterator VI4_AVX2 [(V8SI "TARGET_AVX2") V4SI]) +(define_mode_iterator VI4_AVX512F + [(V16SI "TARGET_AVX512F") (V8SI "TARGET_AVX2") V4SI]) + +(define_mode_iterator VI48_AVX512F + [(V16SI "TARGET_AVX512F") (V8SI "TARGET_AVX2") V4SI + (V8DI "TARGET_AVX512F")]) + (define_mode_iterator VI8_AVX2 [(V4DI "TARGET_AVX2") V2DI]) +(define_mode_iterator VI8_AVX2_AVX512F + [(V8DI "TARGET_AVX512F") (V4DI "TARGET_AVX2") V2DI]) + +;; All V8D* modes +(define_mode_iterator V8FI + [V8DF V8DI]) + +;; All V16S* modes +(define_mode_iterator V16FI + [V16SF V16SI]) + ;; ??? We should probably use TImode instead. 
(define_mode_iterator VIMAX_AVX2 [(V2TI "TARGET_AVX2") V1TI]) @@ -192,6 +278,17 @@ [(V16HI "TARGET_AVX2") V8HI (V8SI "TARGET_AVX2") V4SI]) +(define_mode_iterator VI124_AVX2_48_AVX512F + [(V32QI "TARGET_AVX2") V16QI + (V16HI "TARGET_AVX2") V8HI + (V16SI "TARGET_AVX512F") (V8SI "TARGET_AVX2") V4SI + (V8DI "TARGET_AVX512F")]) + +(define_mode_iterator VI124_AVX512F + [(V32QI "TARGET_AVX2") V16QI + (V32HI "TARGET_AVX512F") (V16HI "TARGET_AVX2") V8HI + (V16SI "TARGET_AVX512F") (V8SI "TARGET_AVX2") V4SI]) + (define_mode_iterator VI124_AVX2 [(V32QI "TARGET_AVX2") V16QI (V16HI "TARGET_AVX2") V8HI @@ -202,9 +299,14 @@ (V8SI "TARGET_AVX2") V4SI (V4DI "TARGET_AVX2") V2DI]) -(define_mode_iterator VI48_AVX2 - [(V8SI "TARGET_AVX2") V4SI - (V4DI "TARGET_AVX2") V2DI]) +(define_mode_iterator VI248_AVX2_8_AVX512F + [(V16HI "TARGET_AVX2") V8HI + (V8SI "TARGET_AVX2") V4SI + (V8DI "TARGET_AVX512F") (V4DI "TARGET_AVX2") V2DI]) + +(define_mode_iterator VI48_AVX2_48_AVX512F + [(V16SI "TARGET_AVX512F") (V8SI "TARGET_AVX2") V4SI + (V8DI "TARGET_AVX512F") (V4DI "TARGET_AVX2") V2DI]) (define_mode_iterator V48_AVX2 [V4SF V2DF @@ -212,11 +314,18 @@ (V4SI "TARGET_AVX2") (V2DI "TARGET_AVX2") (V8SI "TARGET_AVX2") (V4DI "TARGET_AVX2")]) +(define_mode_attr sse2_avx_avx512f + [(V16QI "sse2") (V32QI "avx") (V64QI "avx512f") + (V4SI "sse2") (V8SI "avx") (V16SI "avx512f") + (V8DI "avx512f") + (V16SF "avx512f") (V8SF "avx") (V4SF "avx") + (V8DF "avx512f") (V4DF "avx") (V2DF "avx")]) + (define_mode_attr sse2_avx2 [(V16QI "sse2") (V32QI "avx2") (V8HI "sse2") (V16HI "avx2") - (V4SI "sse2") (V8SI "avx2") - (V2DI "sse2") (V4DI "avx2") + (V4SI "sse2") (V8SI "avx2") (V16SI "avx512f") + (V2DI "sse2") (V4DI "avx2") (V8DI "avx512f") (V1TI "sse2") (V2TI "avx2")]) (define_mode_attr ssse3_avx2 @@ -229,7 +338,7 @@ (define_mode_attr sse4_1_avx2 [(V16QI "sse4_1") (V32QI "avx2") (V8HI "sse4_1") (V16HI "avx2") - (V4SI "sse4_1") (V8SI "avx2") + (V4SI "sse4_1") (V8SI "avx2") (V16SI "avx512f") (V2DI "sse4_1") (V4DI "avx2")]) (define_mode_attr avx_avx2 @@ -244,6 +353,12 @@ (V4SI "vec") (V8SI "avx2") (V2DI "vec") (V4DI "avx2")]) +(define_mode_attr avx2_avx512f + [(V4SI "avx2") (V8SI "avx2") (V16SI "avx512f") + (V2DI "avx2") (V4DI "avx2") (V8DI "avx512f") + (V8SF "avx2") (V16SF "avx512f") + (V4DF "avx2") (V8DF "avx512f")]) + (define_mode_attr shuffletype [(V16SF "f") (V16SI "i") (V8DF "f") (V8DI "i") (V8SF "f") (V8SI "i") (V4DF "f") (V4DI "i") @@ -251,8 +366,12 @@ (V32QI "i") (V16HI "u") (V16QI "i") (V8HI "i") (V64QI "i") (V1TI "i") (V2TI "i")]) +(define_mode_attr ssequartermode + [(V16SF "V4SF") (V8DF "V2DF") (V16SI "V4SI") (V8DI "V2DI")]) + (define_mode_attr ssedoublemode - [(V16HI "V16SI") (V8HI "V8SI") (V4HI "V4SI") + [(V16SF "V32SF") (V16SI "V32SI") (V8DI "V16DI") (V8DF "V16DF") + (V16HI "V16SI") (V8HI "V8SI") (V4HI "V4SI") (V32QI "V32HI") (V16QI "V16HI")]) (define_mode_attr ssebytemode @@ -264,7 +383,10 @@ ;; All 256bit vector integer modes (define_mode_iterator VI_256 [V32QI V16HI V8SI V4DI]) -;; Random 128bit vector integer mode combinations +;; All 512bit vector integer modes +(define_mode_iterator VI_512 [V64QI V32HI V16SI V8DI]) + +;; Various 128bit vector integer mode combinations (define_mode_iterator VI12_128 [V16QI V8HI]) (define_mode_iterator VI14_128 [V16QI V4SI]) (define_mode_iterator VI124_128 [V16QI V8HI V4SI]) @@ -273,36 +395,49 @@ (define_mode_iterator VI248_128 [V8HI V4SI V2DI]) (define_mode_iterator VI48_128 [V4SI V2DI]) -;; Random 256bit vector integer mode combinations -(define_mode_iterator VI124_256 [V32QI V16HI V8SI]) +;; 
Various 256bit and 512bit vector integer mode combinations +(define_mode_iterator VI124_256_48_512 + [V32QI V16HI V8SI (V8DI "TARGET_AVX512F") (V16SI "TARGET_AVX512F")]) (define_mode_iterator VI48_256 [V8SI V4DI]) +(define_mode_iterator VI48_512 [V16SI V8DI]) ;; Int-float size matches (define_mode_iterator VI4F_128 [V4SI V4SF]) (define_mode_iterator VI8F_128 [V2DI V2DF]) (define_mode_iterator VI4F_256 [V8SI V8SF]) (define_mode_iterator VI8F_256 [V4DI V4DF]) +(define_mode_iterator VI8F_256_512 + [V4DI V4DF (V8DI "TARGET_AVX512F") (V8DF "TARGET_AVX512F")]) +(define_mode_iterator VI48F_256_512 + [V8SI V8SF + (V16SI "TARGET_AVX512F") (V16SF "TARGET_AVX512F") + (V8DI "TARGET_AVX512F") (V8DF "TARGET_AVX512F")]) +(define_mode_iterator VI48F_512 [V16SI V16SF V8DI V8DF]) ;; Mapping from float mode to required SSE level (define_mode_attr sse [(SF "sse") (DF "sse2") (V4SF "sse") (V2DF "sse2") - (V8SF "avx") (V4DF "avx")]) + (V16SF "avx512f") (V8SF "avx") + (V8DF "avx512f") (V4DF "avx")]) (define_mode_attr sse2 - [(V16QI "sse2") (V32QI "avx") - (V2DI "sse2") (V4DI "avx")]) + [(V16QI "sse2") (V32QI "avx") (V64QI "avx512f") + (V2DI "sse2") (V4DI "avx") (V8DI "avx512f")]) (define_mode_attr sse3 [(V16QI "sse3") (V32QI "avx")]) (define_mode_attr sse4_1 [(V4SF "sse4_1") (V2DF "sse4_1") - (V8SF "avx") (V4DF "avx")]) + (V8SF "avx") (V4DF "avx") + (V8DF "avx512f")]) (define_mode_attr avxsizesuffix - [(V32QI "256") (V16HI "256") (V8SI "256") (V4DI "256") + [(V64QI "512") (V32HI "512") (V16SI "512") (V8DI "512") + (V32QI "256") (V16HI "256") (V8SI "256") (V4DI "256") (V16QI "") (V8HI "") (V4SI "") (V2DI "") + (V16SF "512") (V8DF "512") (V8SF "256") (V4DF "256") (V4SF "") (V2DF "")]) @@ -316,17 +451,29 @@ (V4SF "V4SF") (V2DF "V2DF") (TI "TI")]) +;; Mapping of vector modes to corresponding mask size +(define_mode_attr avx512fmaskmode + [(V16QI "HI") + (V16HI "HI") (V8HI "QI") + (V16SI "HI") (V8SI "QI") (V4SI "QI") + (V8DI "QI") (V4DI "QI") (V2DI "QI") + (V16SF "HI") (V8SF "QI") (V4SF "QI") + (V8DF "QI") (V4DF "QI") (V2DF "QI")]) + ;; Mapping of vector float modes to an integer mode of the same size (define_mode_attr sseintvecmode - [(V8SF "V8SI") (V4DF "V4DI") - (V4SF "V4SI") (V2DF "V2DI") - (V8SI "V8SI") (V4DI "V4DI") - (V4SI "V4SI") (V2DI "V2DI") - (V16HI "V16HI") (V8HI "V8HI") + [(V16SF "V16SI") (V8DF "V8DI") + (V8SF "V8SI") (V4DF "V4DI") + (V4SF "V4SI") (V2DF "V2DI") + (V16SI "V16SI") (V8DI "V8DI") + (V8SI "V8SI") (V4DI "V4DI") + (V4SI "V4SI") (V2DI "V2DI") + (V16HI "V16HI") (V8HI "V8HI") (V32QI "V32QI") (V16QI "V16QI")]) (define_mode_attr sseintvecmodelower - [(V8SF "v8si") (V4DF "v4di") + [(V16SF "v16si") + (V8SF "v8si") (V4DF "v4di") (V4SF "v4si") (V2DF "v2di") (V8SI "v8si") (V4DI "v4di") (V4SI "v4si") (V2DI "v2di") @@ -342,15 +489,19 @@ ;; Mapping of vector modes to a vector mode of half size (define_mode_attr ssehalfvecmode - [(V32QI "V16QI") (V16HI "V8HI") (V8SI "V4SI") (V4DI "V2DI") - (V16QI "V8QI") (V8HI "V4HI") (V4SI "V2SI") - (V8SF "V4SF") (V4DF "V2DF") - (V4SF "V2SF")]) + [(V64QI "V32QI") (V32HI "V16HI") (V16SI "V8SI") (V8DI "V4DI") + (V32QI "V16QI") (V16HI "V8HI") (V8SI "V4SI") (V4DI "V2DI") + (V16QI "V8QI") (V8HI "V4HI") (V4SI "V2SI") + (V16SF "V8SF") (V8DF "V4DF") + (V8SF "V4SF") (V4DF "V2DF") + (V4SF "V2SF")]) ;; Mapping of vector modes ti packed single mode of the same size (define_mode_attr ssePSmode - [(V32QI "V8SF") (V16QI "V4SF") - (V16HI "V8SF") (V8HI "V4SF") + [(V16SI "V16SF") (V8DF "V16SF") + (V16SF "V16SF") (V8DI "V16SF") + (V64QI "V16SF") (V32QI "V8SF") (V16QI "V4SF") + (V32HI "V16SF") 
(V16HI "V8SF") (V8HI "V4SF") (V8SI "V8SF") (V4SI "V4SF") (V4DI "V8SF") (V2DI "V4SF") (V2TI "V8SF") (V1TI "V4SF") @@ -359,10 +510,21 @@ ;; Mapping of vector modes back to the scalar modes (define_mode_attr ssescalarmode - [(V32QI "QI") (V16HI "HI") (V8SI "SI") (V4DI "DI") - (V16QI "QI") (V8HI "HI") (V4SI "SI") (V2DI "DI") - (V8SF "SF") (V4DF "DF") - (V4SF "SF") (V2DF "DF")]) + [(V64QI "QI") (V32QI "QI") (V16QI "QI") + (V32HI "HI") (V16HI "HI") (V8HI "HI") + (V16SI "SI") (V8SI "SI") (V4SI "SI") + (V8DI "DI") (V4DI "DI") (V2DI "DI") + (V16SF "SF") (V8SF "SF") (V4SF "SF") + (V8DF "DF") (V4DF "DF") (V2DF "DF")]) + +;; Mapping of vector modes to the 128bit modes +(define_mode_attr ssexmmmode + [(V64QI "V16QI") (V32QI "V16QI") (V16QI "V16QI") + (V32HI "V8HI") (V16HI "V8HI") (V8HI "V8HI") + (V16SI "V4SI") (V8SI "V4SI") (V4SI "V4SI") + (V8DI "V2DI") (V4DI "V2DI") (V2DI "V2DI") + (V16SF "V4SF") (V8SF "V4SF") (V4SF "V4SF") + (V8DF "V2DF") (V4DF "V2DF") (V2DF "V2DF")]) ;; Pointer size override for scalar modes (Intel asm dialect) (define_mode_attr iptr @@ -374,8 +536,10 @@ ;; Number of scalar elements in each vector type (define_mode_attr ssescalarnum - [(V32QI "32") (V16HI "16") (V8SI "8") (V4DI "4") + [(V64QI "64") (V16SI "16") (V8DI "8") + (V32QI "32") (V16HI "16") (V8SI "8") (V4DI "4") (V16QI "16") (V8HI "8") (V4SI "4") (V2DI "2") + (V16SF "16") (V8DF "8") (V8SF "8") (V4DF "4") (V4SF "4") (V2DF "2")]) @@ -388,10 +552,12 @@ ;; SSE prefix for integer vector modes (define_mode_attr sseintprefix - [(V2DI "p") (V2DF "") - (V4DI "p") (V4DF "") - (V4SI "p") (V4SF "") - (V8SI "p") (V8SF "")]) + [(V2DI "p") (V2DF "") + (V4DI "p") (V4DF "") + (V8DI "p") (V8DF "") + (V4SI "p") (V4SF "") + (V8SI "p") (V8SF "") + (V16SI "p") (V16SF "")]) ;; SSE scalar suffix for vector modes (define_mode_attr ssescalarmodesuffix @@ -404,11 +570,13 @@ ;; Pack/unpack vector modes (define_mode_attr sseunpackmode [(V16QI "V8HI") (V8HI "V4SI") (V4SI "V2DI") - (V32QI "V16HI") (V16HI "V8SI") (V8SI "V4DI")]) + (V32QI "V16HI") (V16HI "V8SI") (V8SI "V4DI") + (V32HI "V16SI") (V64QI "V32HI") (V16SI "V8DI")]) (define_mode_attr ssepackmode [(V8HI "V16QI") (V4SI "V8HI") (V2DI "V4SI") - (V16HI "V32QI") (V8SI "V16HI") (V4DI "V8SI")]) + (V16HI "V32QI") (V8SI "V16HI") (V4DI "V8SI") + (V32HI "V64QI") (V16SI "V32HI") (V8DI "V16SI")]) ;; Mapping of the max integer size for xop rotate immediate constraint (define_mode_attr sserotatemax @@ -421,9 +589,11 @@ (define_code_attr extsuffix [(sign_extend "sx") (zero_extend "zx")]) ;; i128 for integer vectors and TARGET_AVX2, f128 otherwise. +;; i64x4 or f64x4 for 512bit modes. (define_mode_attr i128 - [(V8SF "f128") (V4DF "f128") (V32QI "%~128") (V16HI "%~128") - (V8SI "%~128") (V4DI "%~128")]) + [(V16SF "f64x4") (V8SF "f128") (V8DF "f64x4") (V4DF "f128") + (V64QI "i64x4") (V32QI "%~128") (V32HI "i64x4") (V16HI "%~128") + (V16SI "i64x4") (V8SI "%~128") (V8DI "i64x4") (V4DI "%~128")]) ;; Mix-n-match (define_mode_iterator AVX256MODE2P [V8SI V8SF V4DF]) @@ -432,6 +602,10 @@ (define_mode_attr blendbits [(V8SF "255") (V4SF "15") (V4DF "15") (V2DF "3")]) +;; Mapping suffixes for broadcast +(define_mode_attr bcstscalarsuff + [(V16SI "d") (V16SF "ss") (V8DI "q") (V8DF "sd")]) + ;; Patterns whose name begins with "sse{,2,3}_" are invoked by intrinsics. 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -571,6 +745,18 @@ ] (const_string "<sseinsnmode>")))]) +(define_insn "avx512f_blendm<mode>" + [(set (match_operand:VI48F_512 0 "register_operand" "=v") + (vec_merge:VI48F_512 + (match_operand:VI48F_512 2 "nonimmediate_operand" "vm") + (match_operand:VI48F_512 1 "register_operand" "v") + (match_operand:<avx512fmaskmode> 3 "register_operand" "k")))] + "TARGET_AVX512F" + "v<sseintprefix>blendm<ssemodesuffix>\t{%2, %1, %0%{%3%}|%0%{%3%}, %1, %2}" + [(set_attr "type" "ssemov") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + (define_insn "sse2_movq128" [(set (match_operand:V2DI 0 "register_operand" "=x") (vec_concat:V2DI @@ -665,12 +851,13 @@ (define_insn "<sse>_loadu<ssemodesuffix><avxsizesuffix>" [(set (match_operand:VF 0 "register_operand" "=v") (unspec:VF - [(match_operand:VF 1 "memory_operand" "m")] + [(match_operand:VF 1 "nonimmediate_operand" "vm")] UNSPEC_LOADU))] "TARGET_SSE" { switch (get_attr_mode (insn)) { + case MODE_V16SF: case MODE_V8SF: case MODE_V4SF: return "%vmovups\t{%1, %0|%0, %1}"; @@ -694,12 +881,13 @@ (define_insn "<sse>_storeu<ssemodesuffix><avxsizesuffix>" [(set (match_operand:VF 0 "memory_operand" "=m") (unspec:VF - [(match_operand:VF 1 "register_operand" "x")] + [(match_operand:VF 1 "register_operand" "v")] UNSPEC_STOREU))] "TARGET_SSE" { switch (get_attr_mode (insn)) { + case MODE_V16SF: case MODE_V8SF: case MODE_V4SF: return "%vmovups\t{%1, %0|%0, %1}"; @@ -721,10 +909,11 @@ ] (const_string "<MODE>")))]) -(define_insn "<sse2>_loaddqu<avxsizesuffix>" - [(set (match_operand:VI1 0 "register_operand" "=v") - (unspec:VI1 [(match_operand:VI1 1 "memory_operand" "m")] - UNSPEC_LOADU))] +(define_insn "<sse2_avx_avx512f>_loaddqu<mode>" + [(set (match_operand:VI_UNALIGNED_LOADSTORE 0 "register_operand" "=v") + (unspec:VI_UNALIGNED_LOADSTORE + [(match_operand:VI_UNALIGNED_LOADSTORE 1 "nonimmediate_operand" "vm")] + UNSPEC_LOADU))] "TARGET_SSE2" { switch (get_attr_mode (insn)) @@ -732,6 +921,11 @@ case MODE_V8SF: case MODE_V4SF: return "%vmovups\t{%1, %0|%0, %1}"; + case MODE_XI: + if (<MODE>mode == V8DImode) + return "vmovdqu64\t{%1, %0|%0, %1}"; + else + return "vmovdqu32\t{%1, %0|%0, %1}"; default: return "%vmovdqu\t{%1, %0|%0, %1}"; } @@ -754,10 +948,11 @@ ] (const_string "<sseinsnmode>")))]) -(define_insn "<sse2>_storedqu<avxsizesuffix>" - [(set (match_operand:VI1 0 "memory_operand" "=m") - (unspec:VI1 [(match_operand:VI1 1 "register_operand" "v")] - UNSPEC_STOREU))] +(define_insn "<sse2_avx_avx512f>_storedqu<mode>" + [(set (match_operand:VI_UNALIGNED_LOADSTORE 0 "memory_operand" "=m") + (unspec:VI_UNALIGNED_LOADSTORE + [(match_operand:VI_UNALIGNED_LOADSTORE 1 "register_operand" "v")] + UNSPEC_STOREU))] "TARGET_SSE2" { switch (get_attr_mode (insn)) @@ -765,6 +960,11 @@ case MODE_V8SF: case MODE_V4SF: return "%vmovups\t{%1, %0|%0, %1}"; + case MODE_XI: + if (<MODE>mode == V8DImode) + return "vmovdqu64\t{%1, %0|%0, %1}"; + else + return "vmovdqu32\t{%1, %0|%0, %1}"; default: return "%vmovdqu\t{%1, %0|%0, %1}"; } @@ -821,8 +1021,9 @@ (define_insn "<sse>_movnt<mode>" [(set (match_operand:VF 0 "memory_operand" "=m") - (unspec:VF [(match_operand:VF 1 "register_operand" "x")] - UNSPEC_MOVNT))] + (unspec:VF + [(match_operand:VF 1 "register_operand" "v")] + UNSPEC_MOVNT))] "TARGET_SSE" "%vmovnt<ssemodesuffix>\t{%1, %0|%0, %1}" [(set_attr "type" "ssemov") @@ -831,7 +1032,7 @@ (define_insn "<sse2>_movnt<mode>" [(set (match_operand:VI8 0 "memory_operand" "=m") - (unspec:VI8 [(match_operand:VI8 
1 "register_operand" "x")] + (unspec:VI8 [(match_operand:VI8 1 "register_operand" "v")] UNSPEC_MOVNT))] "TARGET_SSE2" "%vmovntdq\t{%1, %0|%0, %1}" @@ -852,9 +1053,9 @@ (define_mode_iterator STORENT_MODE [(DI "TARGET_SSE2 && TARGET_64BIT") (SI "TARGET_SSE2") (SF "TARGET_SSE4A") (DF "TARGET_SSE4A") - (V4DI "TARGET_AVX") (V2DI "TARGET_SSE2") - (V8SF "TARGET_AVX") V4SF - (V4DF "TARGET_AVX") (V2DF "TARGET_SSE2")]) + (V8DI "TARGET_AVX512F") (V4DI "TARGET_AVX") (V2DI "TARGET_SSE2") + (V16SF "TARGET_AVX512F") (V8SF "TARGET_AVX") V4SF + (V8DF "TARGET_AVX512F") (V4DF "TARGET_AVX") (V2DF "TARGET_SSE2")]) (define_expand "storent<mode>" [(set (match_operand:STORENT_MODE 0 "memory_operand") @@ -877,10 +1078,10 @@ "ix86_expand_fp_absneg_operator (<CODE>, <MODE>mode, operands); DONE;") (define_insn_and_split "*absneg<mode>2" - [(set (match_operand:VF 0 "register_operand" "=x,x,x,x") + [(set (match_operand:VF 0 "register_operand" "=x,x,v,v") (match_operator:VF 3 "absneg_operator" - [(match_operand:VF 1 "nonimmediate_operand" "0, xm,x, m")])) - (use (match_operand:VF 2 "nonimmediate_operand" "xm,0, xm,x"))] + [(match_operand:VF 1 "nonimmediate_operand" "0, xm, v, m")])) + (use (match_operand:VF 2 "nonimmediate_operand" "xm, 0, vm,v"))] "TARGET_SSE" "#" "&& reload_completed" @@ -962,10 +1163,10 @@ "ix86_fixup_binary_operands_no_copy (MULT, <MODE>mode, operands);") (define_insn "*mul<mode>3" - [(set (match_operand:VF 0 "register_operand" "=x,x") + [(set (match_operand:VF 0 "register_operand" "=x,v") (mult:VF - (match_operand:VF 1 "nonimmediate_operand" "%0,x") - (match_operand:VF 2 "nonimmediate_operand" "xm,xm")))] + (match_operand:VF 1 "nonimmediate_operand" "%0,v") + (match_operand:VF 2 "nonimmediate_operand" "xm,vm")))] "TARGET_SSE && ix86_binary_operator_ok (MULT, <MODE>mode, operands)" "@ mul<ssemodesuffix>\t{%2, %0|%0, %2} @@ -976,21 +1177,22 @@ (set_attr "btver2_decode" "direct,double") (set_attr "mode" "<MODE>")]) -(define_insn "<sse>_vmmul<mode>3" +(define_insn "<sse>_vm<multdiv_mnemonic><mode>3" [(set (match_operand:VF_128 0 "register_operand" "=x,v") (vec_merge:VF_128 - (mult:VF_128 + (multdiv:VF_128 (match_operand:VF_128 1 "register_operand" "0,v") (match_operand:VF_128 2 "nonimmediate_operand" "xm,vm")) (match_dup 1) (const_int 1)))] "TARGET_SSE" "@ - mul<ssescalarmodesuffix>\t{%2, %0|%0, %<iptr>2} - vmul<ssescalarmodesuffix>\t{%2, %1, %0|%0, %1, %<iptr>2}" + <multdiv_mnemonic><ssescalarmodesuffix>\t{%2, %0|%0, %<iptr>2} + v<multdiv_mnemonic><ssescalarmodesuffix>\t{%2, %1, %0|%0, %1, %<iptr>2}" [(set_attr "isa" "noavx,avx") - (set_attr "type" "ssemul") - (set_attr "prefix" "orig,vex") + (set_attr "type" "sse<multdiv_mnemonic>") + (set_attr "prefix" "orig,maybe_evex") + (set_attr "btver2_decode" "direct,double") (set_attr "mode" "<ssescalarmode>")]) (define_expand "div<mode>3" @@ -1033,28 +1235,10 @@ (set_attr "prefix" "orig,vex") (set_attr "mode" "<MODE>")]) -(define_insn "<sse>_vmdiv<mode>3" - [(set (match_operand:VF_128 0 "register_operand" "=x,v") - (vec_merge:VF_128 - (div:VF_128 - (match_operand:VF_128 1 "register_operand" "0,v") - (match_operand:VF_128 2 "nonimmediate_operand" "xm,vm")) - (match_dup 1) - (const_int 1)))] - "TARGET_SSE" - "@ - div<ssescalarmodesuffix>\t{%2, %0|%0, %<iptr>2} - vdiv<ssescalarmodesuffix>\t{%2, %1, %0|%0, %1, %<iptr>2}" - [(set_attr "isa" "noavx,avx") - (set_attr "type" "ssediv") - (set_attr "prefix" "orig,vex") - (set_attr "btver2_decode" "direct,double") - (set_attr "mode" "<ssescalarmode>")]) - (define_insn "<sse>_rcp<mode>2" - [(set (match_operand:VF1 0 
"register_operand" "=x") - (unspec:VF1 - [(match_operand:VF1 1 "nonimmediate_operand" "xm")] UNSPEC_RCP))] + [(set (match_operand:VF1_128_256 0 "register_operand" "=x") + (unspec:VF1_128_256 + [(match_operand:VF1_128_256 1 "nonimmediate_operand" "xm")] UNSPEC_RCP))] "TARGET_SSE" "%vrcpps\t{%1, %0|%0, %1}" [(set_attr "type" "sse") @@ -1081,6 +1265,32 @@ (set_attr "prefix" "orig,vex") (set_attr "mode" "SF")]) +(define_insn "rcp14<mode>" + [(set (match_operand:VF_512 0 "register_operand" "=v") + (unspec:VF_512 + [(match_operand:VF_512 1 "nonimmediate_operand" "vm")] + UNSPEC_RCP14))] + "TARGET_AVX512F" + "vrcp14<ssemodesuffix>\t{%1, %0|%0, %1}" + [(set_attr "type" "sse") + (set_attr "prefix" "evex") + (set_attr "mode" "<MODE>")]) + +(define_insn "srcp14<mode>" + [(set (match_operand:VF_128 0 "register_operand" "=v") + (vec_merge:VF_128 + (unspec:VF_128 + [(match_operand:VF_128 1 "register_operand" "v") + (match_operand:VF_128 2 "nonimmediate_operand" "vm")] + UNSPEC_RCP14) + (match_dup 1) + (const_int 1)))] + "TARGET_AVX512F" + "vrcp14<ssescalarmodesuffix>\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sse") + (set_attr "prefix" "evex") + (set_attr "mode" "<MODE>")]) + (define_expand "sqrt<mode>2" [(set (match_operand:VF2 0 "register_operand") (sqrt:VF2 (match_operand:VF2 1 "nonimmediate_operand")))] @@ -1132,9 +1342,9 @@ (set_attr "mode" "<ssescalarmode>")]) (define_expand "rsqrt<mode>2" - [(set (match_operand:VF1 0 "register_operand") - (unspec:VF1 - [(match_operand:VF1 1 "nonimmediate_operand")] UNSPEC_RSQRT))] + [(set (match_operand:VF1_128_256 0 "register_operand") + (unspec:VF1_128_256 + [(match_operand:VF1_128_256 1 "nonimmediate_operand")] UNSPEC_RSQRT))] "TARGET_SSE_MATH" { ix86_emit_swsqrtsf (operands[0], operands[1], <MODE>mode, true); @@ -1142,15 +1352,41 @@ }) (define_insn "<sse>_rsqrt<mode>2" - [(set (match_operand:VF1 0 "register_operand" "=x") - (unspec:VF1 - [(match_operand:VF1 1 "nonimmediate_operand" "xm")] UNSPEC_RSQRT))] + [(set (match_operand:VF1_128_256 0 "register_operand" "=x") + (unspec:VF1_128_256 + [(match_operand:VF1_128_256 1 "nonimmediate_operand" "xm")] UNSPEC_RSQRT))] "TARGET_SSE" "%vrsqrtps\t{%1, %0|%0, %1}" [(set_attr "type" "sse") (set_attr "prefix" "maybe_vex") (set_attr "mode" "<MODE>")]) +(define_insn "rsqrt14<mode>" + [(set (match_operand:VF_512 0 "register_operand" "=v") + (unspec:VF_512 + [(match_operand:VF_512 1 "nonimmediate_operand" "vm")] + UNSPEC_RSQRT14))] + "TARGET_AVX512F" + "vrsqrt14<ssemodesuffix>\t{%1, %0|%0, %1}" + [(set_attr "type" "sse") + (set_attr "prefix" "evex") + (set_attr "mode" "<MODE>")]) + +(define_insn "rsqrt14<mode>" + [(set (match_operand:VF_128 0 "register_operand" "=v") + (vec_merge:VF_128 + (unspec:VF_128 + [(match_operand:VF_128 1 "register_operand" "v") + (match_operand:VF_128 2 "nonimmediate_operand" "vm")] + UNSPEC_RSQRT14) + (match_dup 1) + (const_int 1)))] + "TARGET_AVX512F" + "vrsqrt14<ssescalarmodesuffix>\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sse") + (set_attr "prefix" "evex") + (set_attr "mode" "<MODE>")]) + (define_insn "sse_vmrsqrtv4sf2" [(set (match_operand:V4SF 0 "register_operand" "=x,x") (vec_merge:V4SF @@ -1239,10 +1475,10 @@ ;; presence of -0.0 and NaN. 
(define_insn "*ieee_smin<mode>3" - [(set (match_operand:VF 0 "register_operand" "=x,x") + [(set (match_operand:VF 0 "register_operand" "=v,v") (unspec:VF - [(match_operand:VF 1 "register_operand" "0,x") - (match_operand:VF 2 "nonimmediate_operand" "xm,xm")] + [(match_operand:VF 1 "register_operand" "0,v") + (match_operand:VF 2 "nonimmediate_operand" "vm,vm")] UNSPEC_IEEE_MIN))] "TARGET_SSE" "@ @@ -1254,10 +1490,10 @@ (set_attr "mode" "<MODE>")]) (define_insn "*ieee_smax<mode>3" - [(set (match_operand:VF 0 "register_operand" "=x,x") + [(set (match_operand:VF 0 "register_operand" "=v,v") (unspec:VF - [(match_operand:VF 1 "register_operand" "0,x") - (match_operand:VF 2 "nonimmediate_operand" "xm,xm")] + [(match_operand:VF 1 "register_operand" "0,v") + (match_operand:VF 2 "nonimmediate_operand" "vm,vm")] UNSPEC_IEEE_MAX))] "TARGET_SSE" "@ @@ -1536,6 +1772,15 @@ (set_attr "prefix_rep" "1,*") (set_attr "mode" "V4SF")]) +(define_expand "reduc_splus_v8df" + [(match_operand:V8DF 0 "register_operand") + (match_operand:V8DF 1 "register_operand")] + "TARGET_AVX512F" +{ + ix86_expand_reduc (gen_addv8df3, operands[0], operands[1]); + DONE; +}) + (define_expand "reduc_splus_v4df" [(match_operand:V4DF 0 "register_operand") (match_operand:V4DF 1 "register_operand")] @@ -1558,6 +1803,15 @@ DONE; }) +(define_expand "reduc_splus_v16sf" + [(match_operand:V16SF 0 "register_operand") + (match_operand:V16SF 1 "register_operand")] + "TARGET_AVX512F" +{ + ix86_expand_reduc (gen_addv16sf3, operands[0], operands[1]); + DONE; +}) + (define_expand "reduc_splus_v8sf" [(match_operand:V8SF 0 "register_operand") (match_operand:V8SF 1 "register_operand")] @@ -1593,7 +1847,9 @@ [(V32QI "TARGET_AVX2") (V16HI "TARGET_AVX2") (V8SI "TARGET_AVX2") (V4DI "TARGET_AVX2") (V8SF "TARGET_AVX") (V4DF "TARGET_AVX") - (V4SF "TARGET_SSE")]) + (V4SF "TARGET_SSE") (V16SI "TARGET_AVX512F") + (V8DI "TARGET_AVX512F") (V16SF "TARGET_AVX512F") + (V8DF "TARGET_AVX512F")]) (define_expand "reduc_<code>_<mode>" [(smaxmin:REDUC_SMINMAX_MODE @@ -1606,6 +1862,16 @@ }) (define_expand "reduc_<code>_<mode>" + [(umaxmin:VI48_512 + (match_operand:VI48_512 0 "register_operand") + (match_operand:VI48_512 1 "register_operand"))] + "TARGET_AVX512F" +{ + ix86_expand_reduc (gen_<code><mode>3, operands[0], operands[1]); + DONE; +}) + +(define_expand "reduc_<code>_<mode>" [(umaxmin:VI_256 (match_operand:VI_256 0 "register_operand") (match_operand:VI_256 1 "register_operand"))] @@ -1632,10 +1898,10 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (define_insn "avx_cmp<mode>3" - [(set (match_operand:VF 0 "register_operand" "=x") - (unspec:VF - [(match_operand:VF 1 "register_operand" "x") - (match_operand:VF 2 "nonimmediate_operand" "xm") + [(set (match_operand:VF_128_256 0 "register_operand" "=x") + (unspec:VF_128_256 + [(match_operand:VF_128_256 1 "register_operand" "x") + (match_operand:VF_128_256 2 "nonimmediate_operand" "xm") (match_operand:SI 3 "const_0_to_31_operand" "n")] UNSPEC_PCMP))] "TARGET_AVX" @@ -1663,10 +1929,10 @@ (set_attr "mode" "<ssescalarmode>")]) (define_insn "*<sse>_maskcmp<mode>3_comm" - [(set (match_operand:VF 0 "register_operand" "=x,x") - (match_operator:VF 3 "sse_comparison_operator" - [(match_operand:VF 1 "register_operand" "%0,x") - (match_operand:VF 2 "nonimmediate_operand" "xm,xm")]))] + [(set (match_operand:VF_128_256 0 "register_operand" "=x,x") + (match_operator:VF_128_256 3 "sse_comparison_operator" + [(match_operand:VF_128_256 1 "register_operand" "%0,x") + (match_operand:VF_128_256 2 "nonimmediate_operand" 
"xm,xm")]))] "TARGET_SSE && GET_RTX_CLASS (GET_CODE (operands[3])) == RTX_COMM_COMPARE" "@ @@ -1679,10 +1945,10 @@ (set_attr "mode" "<MODE>")]) (define_insn "<sse>_maskcmp<mode>3" - [(set (match_operand:VF 0 "register_operand" "=x,x") - (match_operator:VF 3 "sse_comparison_operator" - [(match_operand:VF 1 "register_operand" "0,x") - (match_operand:VF 2 "nonimmediate_operand" "xm,xm")]))] + [(set (match_operand:VF_128_256 0 "register_operand" "=x,x") + (match_operator:VF_128_256 3 "sse_comparison_operator" + [(match_operand:VF_128_256 1 "register_operand" "0,x") + (match_operand:VF_128_256 2 "nonimmediate_operand" "xm,xm")]))] "TARGET_SSE" "@ cmp%D3<ssemodesuffix>\t{%2, %0|%0, %2} @@ -1711,14 +1977,46 @@ (set_attr "prefix" "orig,vex") (set_attr "mode" "<ssescalarmode>")]) +(define_mode_attr cmp_imm_predicate + [(V16SF "const_0_to_31_operand") (V8DF "const_0_to_31_operand") + (V16SI "const_0_to_7_operand") (V8DI "const_0_to_7_operand")]) + +(define_insn "avx512f_cmp<mode>3" + [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k") + (unspec:<avx512fmaskmode> + [(match_operand:VI48F_512 1 "register_operand" "v") + (match_operand:VI48F_512 2 "nonimmediate_operand" "vm") + (match_operand:SI 3 "<cmp_imm_predicate>" "n")] + UNSPEC_PCMP))] + "TARGET_AVX512F" + "v<sseintprefix>cmp<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssecmp") + (set_attr "length_immediate" "1") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + +(define_insn "avx512f_ucmp<mode>3" + [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k") + (unspec:<avx512fmaskmode> + [(match_operand:VI48_512 1 "register_operand" "v") + (match_operand:VI48_512 2 "nonimmediate_operand" "vm") + (match_operand:SI 3 "const_0_to_7_operand" "n")] + UNSPEC_UNSIGNED_PCMP))] + "TARGET_AVX512F" + "vpcmpu<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssecmp") + (set_attr "length_immediate" "1") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + (define_insn "<sse>_comi" [(set (reg:CCFP FLAGS_REG) (compare:CCFP (vec_select:MODEF - (match_operand:<ssevecmode> 0 "register_operand" "x") + (match_operand:<ssevecmode> 0 "register_operand" "v") (parallel [(const_int 0)])) (vec_select:MODEF - (match_operand:<ssevecmode> 1 "nonimmediate_operand" "xm") + (match_operand:<ssevecmode> 1 "nonimmediate_operand" "vm") (parallel [(const_int 0)]))))] "SSE_FLOAT_MODE_P (<MODE>mode)" "%vcomi<ssemodesuffix>\t{%1, %0|%0, %<iptr>1}" @@ -1735,10 +2033,10 @@ [(set (reg:CCFPU FLAGS_REG) (compare:CCFPU (vec_select:MODEF - (match_operand:<ssevecmode> 0 "register_operand" "x") + (match_operand:<ssevecmode> 0 "register_operand" "v") (parallel [(const_int 0)])) (vec_select:MODEF - (match_operand:<ssevecmode> 1 "nonimmediate_operand" "xm") + (match_operand:<ssevecmode> 1 "nonimmediate_operand" "vm") (parallel [(const_int 0)]))))] "SSE_FLOAT_MODE_P (<MODE>mode)" "%vucomi<ssemodesuffix>\t{%1, %0|%0, %<iptr>1}" @@ -1751,6 +2049,23 @@ (const_string "0"))) (set_attr "mode" "<MODE>")]) +(define_expand "vcond<V_512:mode><VF_512:mode>" + [(set (match_operand:V_512 0 "register_operand") + (if_then_else:V_512 + (match_operator 3 "" + [(match_operand:VF_512 4 "nonimmediate_operand") + (match_operand:VF_512 5 "nonimmediate_operand")]) + (match_operand:V_512 1 "general_operand") + (match_operand:V_512 2 "general_operand")))] + "TARGET_AVX512F + && (GET_MODE_NUNITS (<V_512:MODE>mode) + == GET_MODE_NUNITS (<VF_512:MODE>mode))" +{ + bool ok = ix86_expand_fp_vcond (operands); + gcc_assert (ok); + 
DONE; +}) + (define_expand "vcond<V_256:mode><VF_256:mode>" [(set (match_operand:V_256 0 "register_operand") (if_then_else:V_256 @@ -1792,11 +2107,11 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (define_insn "<sse>_andnot<mode>3" - [(set (match_operand:VF 0 "register_operand" "=x,x") + [(set (match_operand:VF 0 "register_operand" "=x,v") (and:VF (not:VF - (match_operand:VF 1 "register_operand" "0,x")) - (match_operand:VF 2 "nonimmediate_operand" "xm,xm")))] + (match_operand:VF 1 "register_operand" "0,v")) + (match_operand:VF 2 "nonimmediate_operand" "xm,vm")))] "TARGET_SSE" { static char buf[32]; @@ -1825,12 +2140,19 @@ gcc_unreachable (); } + /* There is no vandnp[sd]. Use vpandnq. */ + if (GET_MODE_SIZE (<MODE>mode) == 64) + { + suffix = "q"; + ops = "vpandn%s\t{%%2, %%1, %%0|%%0, %%1, %%2}"; + } + snprintf (buf, sizeof (buf), ops, suffix); return buf; } [(set_attr "isa" "noavx,avx") (set_attr "type" "sselog") - (set_attr "prefix" "orig,vex") + (set_attr "prefix" "orig,maybe_evex") (set (attr "mode") (cond [(match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL") (const_string "<ssePSmode>") @@ -1842,13 +2164,21 @@ (const_string "<MODE>")))]) (define_expand "<code><mode>3" - [(set (match_operand:VF 0 "register_operand") - (any_logic:VF - (match_operand:VF 1 "nonimmediate_operand") - (match_operand:VF 2 "nonimmediate_operand")))] + [(set (match_operand:VF_128_256 0 "register_operand") + (any_logic:VF_128_256 + (match_operand:VF_128_256 1 "nonimmediate_operand") + (match_operand:VF_128_256 2 "nonimmediate_operand")))] "TARGET_SSE" "ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);") +(define_expand "<code><mode>3" + [(set (match_operand:VF_512 0 "register_operand") + (fpint_logic:VF_512 + (match_operand:VF_512 1 "nonimmediate_operand") + (match_operand:VF_512 2 "nonimmediate_operand")))] + "TARGET_AVX512F" + "ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);") + (define_insn "*<code><mode>3" [(set (match_operand:VF 0 "register_operand" "=x,v") (any_logic:VF @@ -1882,12 +2212,19 @@ gcc_unreachable (); } + /* There is no v<logic>p[sd]. Use vp<logic>q. */ + if (GET_MODE_SIZE (<MODE>mode) == 64) + { + suffix = "q"; + ops = "vp<logic>%s\t{%%2, %%1, %%0|%%0, %%1, %%2}"; + } + snprintf (buf, sizeof (buf), ops, suffix); return buf; } [(set_attr "isa" "noavx,avx") (set_attr "type" "sselog") - (set_attr "prefix" "orig,vex") + (set_attr "prefix" "orig,maybe_evex") (set (attr "mode") (cond [(match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL") (const_string "<ssePSmode>") @@ -2105,6 +2442,23 @@ ] (const_string "TI")))]) +;; There is no floating-point xor insn for V16SF or V8DF in avx512f, +;; but we need one for negation. Instead we use the integer versions +;; of xor. There may be a better way to do this. + +(define_mode_attr avx512flogicsuff + [(V16SF "d") (V8DF "q")]) + +(define_insn "avx512f_<logic><mode>" + [(set (match_operand:VF_512 0 "register_operand" "=v") + (fpint_logic:VF_512 + (match_operand:VF_512 1 "register_operand" "v") + (match_operand:VF_512 2 "nonimmediate_operand" "vm")))] + "TARGET_AVX512F" + "vp<logic><avx512flogicsuff>\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix" "evex")]) +
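As a side note on the integer-domain fallback above, the same trick is visible at the source level. A minimal sketch in C intrinsics (illustration only, not part of the patch; assumes GCC or a compatible compiler with -mavx512f):

#include <immintrin.h>

/* Negate sixteen floats by flipping their sign bits with an
   integer-domain XOR (vpxord), mirroring the md pattern above:
   plain AVX512F has no 512-bit vxorps/vxorpd.  */
static __m512
negate_v16sf (__m512 x)
{
  __m512i sign = _mm512_set1_epi32 ((int) 0x80000000);
  return _mm512_castsi512_ps
    (_mm512_xor_epi32 (_mm512_castps_si512 (x), sign));
}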
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; ;; FMA floating point multiply/accumulate instructions. These include @@ -2113,9 +2467,22 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; The standard names for scalar FMA are only available with SSE math enabled. -(define_mode_iterator FMAMODEM [(SF "TARGET_SSE_MATH") - (DF "TARGET_SSE_MATH") - V4SF V2DF V8SF V4DF]) +;; The AVX512F CPUID bit enables the EVEX-encoded scalar and 512-bit FMA +;; insns regardless of the FMA CPUID bit, so we enable these patterns for +;; TARGET_AVX512F even when TARGET_FMA and TARGET_FMA4 are both false. +;; TODO: In theory AVX512F does not automatically imply FMA, and without FMA +;; one must force the EVEX encoding of the FMA insns. Ideally we'd improve +;; GAS to allow proper prefix selection. However, for the moment all hardware +;; that supports AVX512F also supports FMA, so we can ignore this for now. +(define_mode_iterator FMAMODEM + [(SF "TARGET_SSE_MATH && (TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F)") + (DF "TARGET_SSE_MATH && (TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F)") + (V4SF "TARGET_FMA || TARGET_FMA4") + (V2DF "TARGET_FMA || TARGET_FMA4") + (V8SF "TARGET_FMA || TARGET_FMA4") + (V4DF "TARGET_FMA || TARGET_FMA4") + (V16SF "TARGET_AVX512F") + (V8DF "TARGET_AVX512F")]) (define_expand "fma<mode>4" [(set (match_operand:FMAMODEM 0 "register_operand") @@ -2123,7 +2490,7 @@ (match_operand:FMAMODEM 1 "nonimmediate_operand") (match_operand:FMAMODEM 2 "nonimmediate_operand") (match_operand:FMAMODEM 3 "nonimmediate_operand")))] - "TARGET_FMA || TARGET_FMA4") + "") (define_expand "fms<mode>4" [(set (match_operand:FMAMODEM 0 "register_operand") @@ -2131,7 +2498,7 @@ (match_operand:FMAMODEM 1 "nonimmediate_operand") (match_operand:FMAMODEM 2 "nonimmediate_operand") (neg:FMAMODEM (match_operand:FMAMODEM 3 "nonimmediate_operand"))))] - "TARGET_FMA || TARGET_FMA4") + "") (define_expand "fnma<mode>4" [(set (match_operand:FMAMODEM 0 "register_operand") @@ -2139,7 +2506,7 @@ (neg:FMAMODEM (match_operand:FMAMODEM 1 "nonimmediate_operand")) (match_operand:FMAMODEM 2 "nonimmediate_operand") (match_operand:FMAMODEM 3 "nonimmediate_operand")))] - "TARGET_FMA || TARGET_FMA4") + "") (define_expand "fnms<mode>4" [(set (match_operand:FMAMODEM 0 "register_operand") @@ -2147,10 +2514,17 @@ (neg:FMAMODEM (match_operand:FMAMODEM 1 "nonimmediate_operand")) (match_operand:FMAMODEM 2 "nonimmediate_operand") (neg:FMAMODEM (match_operand:FMAMODEM 3 "nonimmediate_operand"))))] - "TARGET_FMA || TARGET_FMA4") + "") ;; The builtins for intrinsics are not constrained by SSE math enabled. 
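A rough sketch of what the relaxed conditions above buy (illustration only, not part of the patch; assumes a compiler with AVX512F intrinsics): built with -mavx512f and no -mfma, the fma<mode>4 expander still matches for the 512-bit modes, so the multiply-add stays fused.

#include <immintrin.h>

/* With -mavx512f alone (TARGET_FMA and TARGET_FMA4 both false) the
   V16SF entry of FMAMODEM remains enabled, so this is expected to emit
   a single EVEX-encoded vfmadd rather than a multiply plus an add.  */
static __m512
fused_madd_v16sf (__m512 a, __m512 b, __m512 c)
{
  return _mm512_fmadd_ps (a, b, c);
}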
-(define_mode_iterator FMAMODE [SF DF V4SF V2DF V8SF V4DF]) +(define_mode_iterator FMAMODE [(SF "TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F") + (DF "TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F") + (V4SF "TARGET_FMA || TARGET_FMA4") + (V2DF "TARGET_FMA || TARGET_FMA4") + (V8SF "TARGET_FMA || TARGET_FMA4") + (V4DF "TARGET_FMA || TARGET_FMA4") + (V16SF "TARGET_AVX512F") + (V8DF "TARGET_AVX512F")]) (define_expand "fma4i_fmadd_<mode>" [(set (match_operand:FMAMODE 0 "register_operand") @@ -2158,7 +2532,7 @@ (match_operand:FMAMODE 1 "nonimmediate_operand") (match_operand:FMAMODE 2 "nonimmediate_operand") (match_operand:FMAMODE 3 "nonimmediate_operand")))] - "TARGET_FMA || TARGET_FMA4") + "") (define_insn "*fma_fmadd_<mode>" [(set (match_operand:FMAMODE 0 "register_operand" "=v,v,v,x,x") @@ -2166,7 +2540,7 @@ (match_operand:FMAMODE 1 "nonimmediate_operand" "%0, 0, v, x,x") (match_operand:FMAMODE 2 "nonimmediate_operand" "vm, v,vm, x,m") (match_operand:FMAMODE 3 "nonimmediate_operand" " v,vm, 0,xm,x")))] - "TARGET_FMA || TARGET_FMA4" + "" "@ vfmadd132<ssemodesuffix>\t{%2, %3, %0|%0, %3, %2} vfmadd213<ssemodesuffix>\t{%3, %2, %0|%0, %2, %3} @@ -2177,14 +2551,14 @@ (set_attr "type" "ssemuladd") (set_attr "mode" "<MODE>")]) -(define_insn "*fma_fmsub_<mode>" +(define_insn "fma_fmsub_<mode>" [(set (match_operand:FMAMODE 0 "register_operand" "=v,v,v,x,x") (fma:FMAMODE (match_operand:FMAMODE 1 "nonimmediate_operand" "%0, 0, v, x,x") (match_operand:FMAMODE 2 "nonimmediate_operand" "vm, v,vm, x,m") (neg:FMAMODE (match_operand:FMAMODE 3 "nonimmediate_operand" " v,vm, 0,xm,x"))))] - "TARGET_FMA || TARGET_FMA4" + "" "@ vfmsub132<ssemodesuffix>\t{%2, %3, %0|%0, %3, %2} vfmsub213<ssemodesuffix>\t{%3, %2, %0|%0, %2, %3} @@ -2195,14 +2569,14 @@ (set_attr "type" "ssemuladd") (set_attr "mode" "<MODE>")]) -(define_insn "*fma_fnmadd_<mode>" +(define_insn "fma_fnmadd_<mode>" [(set (match_operand:FMAMODE 0 "register_operand" "=v,v,v,x,x") (fma:FMAMODE (neg:FMAMODE (match_operand:FMAMODE 1 "nonimmediate_operand" "%0, 0, v, x,x")) (match_operand:FMAMODE 2 "nonimmediate_operand" "vm, v,vm, x,m") (match_operand:FMAMODE 3 "nonimmediate_operand" " v,vm, 0,xm,x")))] - "TARGET_FMA || TARGET_FMA4" + "" "@ vfnmadd132<ssemodesuffix>\t{%2, %3, %0|%0, %3, %2} vfnmadd213<ssemodesuffix>\t{%3, %2, %0|%0, %2, %3} @@ -2221,7 +2595,7 @@ (match_operand:FMAMODE 2 "nonimmediate_operand" "vm, v,vm, x,m") (neg:FMAMODE (match_operand:FMAMODE 3 "nonimmediate_operand" " v,vm, 0,xm,x"))))] - "TARGET_FMA || TARGET_FMA4" + "" "@ vfnmsub132<ssemodesuffix>\t{%2, %3, %0|%0, %3, %2} vfnmsub213<ssemodesuffix>\t{%3, %2, %0|%0, %2, %3} @@ -2250,7 +2624,7 @@ (match_operand:VF 2 "nonimmediate_operand") (match_operand:VF 3 "nonimmediate_operand")] UNSPEC_FMADDSUB))] - "TARGET_FMA || TARGET_FMA4") + "TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F") (define_insn "*fma_fmaddsub_<mode>" [(set (match_operand:VF 0 "register_operand" "=v,v,v,x,x") @@ -2259,7 +2633,7 @@ (match_operand:VF 2 "nonimmediate_operand" "vm, v,vm, x,m") (match_operand:VF 3 "nonimmediate_operand" " v,vm, 0,xm,x")] UNSPEC_FMADDSUB))] - "TARGET_FMA || TARGET_FMA4" + "(TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F)" "@ vfmaddsub132<ssemodesuffix>\t{%2, %3, %0|%0, %3, %2} vfmaddsub213<ssemodesuffix>\t{%3, %2, %0|%0, %2, %3} @@ -2278,7 +2652,7 @@ (neg:VF (match_operand:VF 3 "nonimmediate_operand" " v,vm, 0,xm,x"))] UNSPEC_FMADDSUB))] - "TARGET_FMA || TARGET_FMA4" + "(TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F)" "@ vfmsubadd132<ssemodesuffix>\t{%2, %3, %0|%0, %3, %2} vfmsubadd213<ssemodesuffix>\t{%3, 
%2, %0|%0, %2, %3} @@ -2312,7 +2686,7 @@ (match_operand:VF_128 3 "nonimmediate_operand" " v,vm")) (match_dup 1) (const_int 1)))] - "TARGET_FMA" + "TARGET_FMA || TARGET_AVX512F" "@ vfmadd132<ssescalarmodesuffix>\t{%2, %3, %0|%0, %<iptr>3, %<iptr>2} vfmadd213<ssescalarmodesuffix>\t{%3, %2, %0|%0, %<iptr>2, %<iptr>3}" @@ -2329,7 +2703,7 @@ (match_operand:VF_128 3 "nonimmediate_operand" " v,vm"))) (match_dup 1) (const_int 1)))] - "TARGET_FMA" + "TARGET_FMA || TARGET_AVX512F" "@ vfmsub132<ssescalarmodesuffix>\t{%2, %3, %0|%0, %<iptr>3, %<iptr>2} vfmsub213<ssescalarmodesuffix>\t{%3, %2, %0|%0, %<iptr>2, %<iptr>3}" @@ -2346,7 +2720,7 @@ (match_operand:VF_128 3 "nonimmediate_operand" " v,vm")) (match_dup 1) (const_int 1)))] - "TARGET_FMA" + "TARGET_FMA || TARGET_AVX512F" "@ vfnmadd132<ssescalarmodesuffix>\t{%2, %3, %0|%0, %<iptr>3, %<iptr>2} vfnmadd213<ssescalarmodesuffix>\t{%3, %2, %0|%0, %<iptr>2, %<iptr>3}" @@ -2364,7 +2738,7 @@ (match_operand:VF_128 3 "nonimmediate_operand" " v,vm"))) (match_dup 1) (const_int 1)))] - "TARGET_FMA" + "TARGET_FMA || TARGET_AVX512F" "@ vfnmsub132<ssescalarmodesuffix>\t{%2, %3, %0|%0, %<iptr>3, %<iptr>2} vfnmsub213<ssescalarmodesuffix>\t{%3, %2, %0|%0, %<iptr>2, %<iptr>3}" @@ -2506,7 +2880,7 @@ (set_attr "amdfam10_decode" "vector,double,*") (set_attr "bdver1_decode" "double,direct,*") (set_attr "btver2_decode" "double,double,double") - (set_attr "prefix" "orig,orig,vex") + (set_attr "prefix" "orig,orig,maybe_evex") (set_attr "mode" "SF")]) (define_insn "sse_cvtsi2ssq" @@ -2529,7 +2903,7 @@ (set_attr "btver2_decode" "double,double,double") (set_attr "length_vex" "*,*,4") (set_attr "prefix_rex" "1,1,*") - (set_attr "prefix" "orig,orig,vex") + (set_attr "prefix" "orig,orig,maybe_evex") (set_attr "mode" "SF")]) (define_insn "sse_cvtss2si" @@ -2580,7 +2954,7 @@ (define_insn "sse_cvtss2siq_2" [(set (match_operand:DI 0 "register_operand" "=r,r") - (unspec:DI [(match_operand:SF 1 "nonimmediate_operand" "x,m")] + (unspec:DI [(match_operand:SF 1 "nonimmediate_operand" "v,m")] UNSPEC_FIX_NOTRUNC))] "TARGET_SSE && TARGET_64BIT" "%vcvtss2si{q}\t{%1, %0|%0, %k1}" @@ -2643,20 +3017,16 @@ DONE; }) -(define_insn "avx_cvtps2dq256" - [(set (match_operand:V8SI 0 "register_operand" "=x") - (unspec:V8SI [(match_operand:V8SF 1 "nonimmediate_operand" "xm")] - UNSPEC_FIX_NOTRUNC))] - "TARGET_AVX" - "vcvtps2dq\t{%1, %0|%0, %1}" - [(set_attr "type" "ssecvt") - (set_attr "prefix" "vex") - (set_attr "mode" "OI")]) -(define_insn "sse2_cvtps2dq" - [(set (match_operand:V4SI 0 "register_operand" "=x") - (unspec:V4SI [(match_operand:V4SF 1 "nonimmediate_operand" "xm")] - UNSPEC_FIX_NOTRUNC))] +;; For <sse2_avx_avx512f>_fix_notrunc<sf2simodelower><mode> insn pattern +(define_mode_attr sf2simodelower + [(V16SI "v16sf") (V8SI "v8sf") (V4SI "v4sf")]) + +(define_insn "<sse2_avx_avx512f>_fix_notrunc<sf2simodelower><mode>" + [(set (match_operand:VI4_AVX 0 "register_operand" "=v") + (unspec:VI4_AVX + [(match_operand:<ssePSmode> 1 "nonimmediate_operand" "vm")] + UNSPEC_FIX_NOTRUNC))] "TARGET_SSE2" "%vcvtps2dq\t{%1, %0|%0, %1}" [(set_attr "type" "ssecvt") @@ -2666,7 +3036,17 @@ (const_string "*") (const_string "1"))) (set_attr "prefix" "maybe_vex") - (set_attr "mode" "TI")]) + (set_attr "mode" "<sseinsnmode>")]) + +(define_insn "<fixsuffix>fix_truncv16sfv16si2" + [(set (match_operand:V16SI 0 "register_operand" "=v") + (any_fix:V16SI + (match_operand:V16SF 1 "nonimmediate_operand" "vm")))] + "TARGET_AVX512F" + "vcvttps2<fixsuffix>dq\t{%1, %0|%0, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") 
+ (set_attr "mode" "XI")]) (define_insn "fix_truncv8sfv8si2" [(set (match_operand:V8SI 0 "register_operand" "=x") @@ -2772,11 +3152,11 @@ (set_attr "mode" "DF")]) (define_insn "sse2_cvtsi2sdq" - [(set (match_operand:V2DF 0 "register_operand" "=x,x,x") + [(set (match_operand:V2DF 0 "register_operand" "=x,x,v") (vec_merge:V2DF (vec_duplicate:V2DF (float:DF (match_operand:DI 2 "nonimmediate_operand" "r,m,rm"))) - (match_operand:V2DF 1 "register_operand" "0,0,x") + (match_operand:V2DF 1 "register_operand" "0,0,v") (const_int 1)))] "TARGET_SSE2 && TARGET_64BIT" "@ @@ -2790,14 +3170,14 @@ (set_attr "bdver1_decode" "double,direct,*") (set_attr "length_vex" "*,*,4") (set_attr "prefix_rex" "1,1,*") - (set_attr "prefix" "orig,orig,vex") + (set_attr "prefix" "orig,orig,maybe_evex") (set_attr "mode" "DF")]) (define_insn "sse2_cvtsd2si" [(set (match_operand:SI 0 "register_operand" "=r,r") (unspec:SI [(vec_select:DF - (match_operand:V2DF 1 "nonimmediate_operand" "x,m") + (match_operand:V2DF 1 "nonimmediate_operand" "v,m") (parallel [(const_int 0)]))] UNSPEC_FIX_NOTRUNC))] "TARGET_SSE2" @@ -2828,7 +3208,7 @@ [(set (match_operand:DI 0 "register_operand" "=r,r") (unspec:DI [(vec_select:DF - (match_operand:V2DF 1 "nonimmediate_operand" "x,m") + (match_operand:V2DF 1 "nonimmediate_operand" "v,m") (parallel [(const_int 0)]))] UNSPEC_FIX_NOTRUNC))] "TARGET_SSE2 && TARGET_64BIT" @@ -2858,7 +3238,7 @@ [(set (match_operand:SI 0 "register_operand" "=r,r") (fix:SI (vec_select:DF - (match_operand:V2DF 1 "nonimmediate_operand" "x,m") + (match_operand:V2DF 1 "nonimmediate_operand" "v,m") (parallel [(const_int 0)]))))] "TARGET_SSE2" "%vcvttsd2si\t{%1, %0|%0, %q1}" @@ -2875,7 +3255,7 @@ [(set (match_operand:DI 0 "register_operand" "=r,r") (fix:DI (vec_select:DF - (match_operand:V2DF 1 "nonimmediate_operand" "x,m") + (match_operand:V2DF 1 "nonimmediate_operand" "v,m") (parallel [(const_int 0)]))))] "TARGET_SSE2 && TARGET_64BIT" "%vcvttsd2si{q}\t{%1, %0|%0, %q1}" @@ -2887,14 +3267,20 @@ (set_attr "prefix" "maybe_vex") (set_attr "mode" "DI")]) -(define_insn "floatv4siv4df2" - [(set (match_operand:V4DF 0 "register_operand" "=x") - (float:V4DF (match_operand:V4SI 1 "nonimmediate_operand" "xm")))] +;; For float<si2dfmode><mode>2 insn pattern +(define_mode_attr si2dfmode + [(V8DF "V8SI") (V4DF "V4SI")]) +(define_mode_attr si2dfmodelower + [(V8DF "v8si") (V4DF "v4si")]) + +(define_insn "float<si2dfmodelower><mode>2" + [(set (match_operand:VF2_512_256 0 "register_operand" "=v") + (float:VF2_512_256 (match_operand:<si2dfmode> 1 "nonimmediate_operand" "vm")))] "TARGET_AVX" "vcvtdq2pd\t{%1, %0|%0, %1}" [(set_attr "type" "ssecvt") - (set_attr "prefix" "vex") - (set_attr "mode" "V4DF")]) + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "<MODE>")]) (define_insn "avx_cvtdq2pd256_2" [(set (match_operand:V4DF 0 "register_operand" "=x") @@ -2984,6 +3370,15 @@ (set_attr "athlon_decode" "vector") (set_attr "bdver1_decode" "double")]) +(define_insn "<fixsuffix>fix_truncv8dfv8si2" + [(set (match_operand:V8SI 0 "register_operand" "=v") + (any_fix:V8SI (match_operand:V8DF 1 "nonimmediate_operand" "vm")))] + "TARGET_AVX512F" + "vcvttpd2<fixsuffix>dq\t{%1, %0|%0, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") + (set_attr "mode" "OI")]) + (define_insn "fix_truncv4dfv4si2" [(set (match_operand:V4SI 0 "register_operand" "=x") (fix:V4SI (match_operand:V4DF 1 "nonimmediate_operand" "xm")))] @@ -3126,15 +3521,19 @@ (set_attr "prefix" "maybe_vex") (set_attr "mode" "V4SF")]) -(define_insn "avx_cvtps2pd256" - [(set (match_operand:V4DF 
0 "register_operand" "=x") - (float_extend:V4DF - (match_operand:V4SF 1 "nonimmediate_operand" "xm")))] +;; For <sse2_avx_avx512f>_cvtps2pd<avxsizesuffix> insn pattern +(define_mode_attr sf2dfmode + [(V8DF "V8SF") (V4DF "V4SF")]) + +(define_insn "<sse2_avx_avx512f>_cvtps2pd<avxsizesuffix>" + [(set (match_operand:VF2_512_256 0 "register_operand" "=v") + (float_extend:VF2_512_256 + (match_operand:<sf2dfmode> 1 "nonimmediate_operand" "vm")))] "TARGET_AVX" "vcvtps2pd\t{%1, %0|%0, %1}" [(set_attr "type" "ssecvt") - (set_attr "prefix" "vex") - (set_attr "mode" "V4DF")]) + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "<MODE>")]) (define_insn "*avx_cvtps2pd256_2" [(set (match_operand:V4DF 0 "register_operand" "=x") @@ -3211,11 +3610,12 @@ "TARGET_AVX") (define_mode_attr sseunpackfltmode - [(V8HI "V4SF") (V4SI "V2DF") (V16HI "V8SF") (V8SI "V4DF")]) + [(V8HI "V4SF") (V4SI "V2DF") (V16HI "V8SF") + (V8SI "V4DF") (V32HI "V16SF") (V16SI "V8DF")]) (define_expand "vec_unpacks_float_hi_<mode>" [(match_operand:<sseunpackfltmode> 0 "register_operand") - (match_operand:VI2_AVX2 1 "register_operand")] + (match_operand:VI2_AVX512F 1 "register_operand")] "TARGET_SSE2" { rtx tmp = gen_reg_rtx (<sseunpackmode>mode); @@ -3228,7 +3628,7 @@ (define_expand "vec_unpacks_float_lo_<mode>" [(match_operand:<sseunpackfltmode> 0 "register_operand") - (match_operand:VI2_AVX2 1 "register_operand")] + (match_operand:VI2_AVX512F 1 "register_operand")] "TARGET_SSE2" { rtx tmp = gen_reg_rtx (<sseunpackmode>mode); @@ -3241,7 +3641,7 @@ (define_expand "vec_unpacku_float_hi_<mode>" [(match_operand:<sseunpackfltmode> 0 "register_operand") - (match_operand:VI2_AVX2 1 "register_operand")] + (match_operand:VI2_AVX512F 1 "register_operand")] "TARGET_SSE2" { rtx tmp = gen_reg_rtx (<sseunpackmode>mode); @@ -3254,7 +3654,7 @@ (define_expand "vec_unpacku_float_lo_<mode>" [(match_operand:<sseunpackfltmode> 0 "register_operand") - (match_operand:VI2_AVX2 1 "register_operand")] + (match_operand:VI2_AVX512F 1 "register_operand")] "TARGET_SSE2" { rtx tmp = gen_reg_rtx (<sseunpackmode>mode); @@ -3425,21 +3825,21 @@ DONE; }) -(define_expand "vec_pack_trunc_v4df" +(define_expand "vec_pack_trunc_<mode>" [(set (match_dup 3) - (float_truncate:V4SF - (match_operand:V4DF 1 "nonimmediate_operand"))) + (float_truncate:<sf2dfmode> + (match_operand:VF2_512_256 1 "nonimmediate_operand"))) (set (match_dup 4) - (float_truncate:V4SF - (match_operand:V4DF 2 "nonimmediate_operand"))) - (set (match_operand:V8SF 0 "register_operand") - (vec_concat:V8SF + (float_truncate:<sf2dfmode> + (match_operand:VF2_512_256 2 "nonimmediate_operand"))) + (set (match_operand:<ssePSmode> 0 "register_operand") + (vec_concat:<ssePSmode> (match_dup 3) (match_dup 4)))] "TARGET_AVX" { - operands[3] = gen_reg_rtx (V4SFmode); - operands[4] = gen_reg_rtx (V4SFmode); + operands[3] = gen_reg_rtx (<sf2dfmode>mode); + operands[4] = gen_reg_rtx (<sf2dfmode>mode); }) (define_expand "vec_pack_trunc_v2df" @@ -3470,6 +3870,23 @@ DONE; }) +(define_expand "vec_pack_sfix_trunc_v8df" + [(match_operand:V16SI 0 "register_operand") + (match_operand:V8DF 1 "nonimmediate_operand") + (match_operand:V8DF 2 "nonimmediate_operand")] + "TARGET_AVX512F" +{ + rtx r1, r2; + + r1 = gen_reg_rtx (V8SImode); + r2 = gen_reg_rtx (V8SImode); + + emit_insn (gen_fix_truncv8dfv8si2 (r1, operands[1])); + emit_insn (gen_fix_truncv8dfv8si2 (r2, operands[2])); + emit_insn (gen_avx_vec_concatv16si (operands[0], r1, r2)); + DONE; +}) + (define_expand "vec_pack_sfix_trunc_v4df" [(match_operand:V8SI 0 "register_operand") 
(match_operand:V4DF 1 "nonimmediate_operand") @@ -3519,12 +3936,12 @@ }) (define_mode_attr ssepackfltmode - [(V4DF "V8SI") (V2DF "V4SI")]) + [(V8DF "V16SI") (V4DF "V8SI") (V2DF "V4SI")]) (define_expand "vec_pack_ufix_trunc_<mode>" [(match_operand:<ssepackfltmode> 0 "register_operand") - (match_operand:VF2 1 "register_operand") - (match_operand:VF2 2 "register_operand")] + (match_operand:VF2_128_256 1 "register_operand") + (match_operand:VF2_128_256 2 "register_operand")] "TARGET_SSE2" { rtx tmp[7]; @@ -4128,8 +4545,8 @@ (set_attr "mode" "SF")]) (define_insn "avx2_vec_dup<mode>" - [(set (match_operand:VF1 0 "register_operand" "=x") - (vec_duplicate:VF1 + [(set (match_operand:VF1_128_256 0 "register_operand" "=x") + (vec_duplicate:VF1_128_256 (vec_select:SF (match_operand:V4SF 1 "register_operand" "x") (parallel [(const_int 0)]))))] @@ -4428,6 +4845,86 @@ operands[1] = adjust_address (operands[1], SFmode, INTVAL (operands[2]) * 4); }) +(define_insn "avx512f_vextract<shuffletype>32x4_1" + [(set (match_operand:<ssequartermode> 0 "nonimmediate_operand" "=vm") + (vec_select:<ssequartermode> + (match_operand:V16FI 1 "register_operand" "v") + (parallel [(match_operand 2 "const_0_to_15_operand") + (match_operand 3 "const_0_to_15_operand") + (match_operand 4 "const_0_to_15_operand") + (match_operand 5 "const_0_to_15_operand")])))] + "TARGET_AVX512F && (INTVAL (operands[2]) == INTVAL (operands[3]) - 1) + && (INTVAL (operands[3]) == INTVAL (operands[4]) - 1) + && (INTVAL (operands[4]) == INTVAL (operands[5]) - 1)" +{ + operands[2] = GEN_INT ((INTVAL (operands[2])) >> 2); + return "vextract<shuffletype>32x4\t{%2, %1, %0|%0, %1, %2}"; +} + [(set_attr "type" "sselog") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set (attr "memory") + (if_then_else (match_test "MEM_P (operands[0])") + (const_string "store") + (const_string "none"))) + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + +(define_split + [(set (match_operand:<ssehalfvecmode> 0 "nonimmediate_operand") + (vec_select:<ssehalfvecmode> + (match_operand:V8FI 1 "nonimmediate_operand") + (parallel [(const_int 0) (const_int 1) + (const_int 2) (const_int 3)])))] + "TARGET_AVX512F && !(MEM_P (operands[0]) && MEM_P (operands[1])) + && reload_completed" + [(const_int 0)] +{ + rtx op1 = operands[1]; + if (REG_P (op1)) + op1 = gen_rtx_REG (<ssehalfvecmode>mode, REGNO (op1)); + else + op1 = gen_lowpart (<ssehalfvecmode>mode, op1); + emit_move_insn (operands[0], op1); + DONE; +}) + +(define_insn "vec_extract_lo_<mode>" + [(set (match_operand:<ssehalfvecmode> 0 "nonimmediate_operand" "=vm") + (vec_select:<ssehalfvecmode> + (match_operand:V8FI 1 "nonimmediate_operand" "vm") + (parallel [(const_int 0) (const_int 1) + (const_int 2) (const_int 3)])))] + "TARGET_AVX512F && !(MEM_P (operands[0]) && MEM_P (operands[1]))" + "#" + [(set_attr "type" "sselog") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set (attr "memory") + (if_then_else (match_test "MEM_P (operands[0])") + (const_string "store") + (const_string "none"))) + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + +(define_insn "vec_extract_hi_<mode>" + [(set (match_operand:<ssehalfvecmode> 0 "nonimmediate_operand" "=vm") + (vec_select:<ssehalfvecmode> + (match_operand:V8FI 1 "register_operand" "v") + (parallel [(const_int 4) (const_int 5) + (const_int 6) (const_int 7)])))] + "TARGET_AVX512F" + "vextract<shuffletype>64x4\t{$0x1, %1, %0|%0, %1, 0x1}" + [(set_attr "type" "sselog") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set (attr "memory") + (if_then_else (match_test "MEM_P (operands[0])") + (const_string "store") + (const_string "none"))) + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) +
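The new 512-bit extract patterns above key on runs of consecutive element indices starting on a 128-bit boundary; for the 32x4 form the immediate is the element index divided by four. A minimal sketch at the intrinsics level (illustration only; assumes -mavx512f):

#include <immintrin.h>

/* Extracting elements 8..11 (quarter 2) of a V16SF is a single
   vextractf32x4 with immediate 2, exactly the >> 2 mapping the
   pattern's output code performs.  */
static __m128
extract_quarter2 (__m512 v)
{
  return _mm512_extractf32x4_ps (v, 2);
}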
"length_immediate" "1") + (set (attr "memory") + (if_then_else (match_test "MEM_P (operands[0])") + (const_string "store") + (const_string "none"))) + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + (define_expand "avx_vextractf128<mode>" [(match_operand:<ssehalfvecmode> 0 "nonimmediate_operand") (match_operand:V_256 1 "register_operand") @@ -4453,6 +4950,45 @@ }) (define_insn_and_split "vec_extract_lo_<mode>" + [(set (match_operand:<ssehalfvecmode> 0 "nonimmediate_operand" "=v,m") + (vec_select:<ssehalfvecmode> + (match_operand:V16FI 1 "nonimmediate_operand" "vm,v") + (parallel [(const_int 0) (const_int 1) + (const_int 2) (const_int 3) + (const_int 4) (const_int 5) + (const_int 6) (const_int 7)])))] + "TARGET_AVX512F && !(MEM_P (operands[0]) && MEM_P (operands[1]))" + "#" + "&& reload_completed" + [(const_int 0)] +{ + rtx op1 = operands[1]; + if (REG_P (op1)) + op1 = gen_rtx_REG (<ssehalfvecmode>mode, REGNO (op1)); + else + op1 = gen_lowpart (<ssehalfvecmode>mode, op1); + emit_move_insn (operands[0], op1); + DONE; +}) + +(define_insn "vec_extract_hi_<mode>" + [(set (match_operand:<ssehalfvecmode> 0 "nonimmediate_operand" "=v,m") + (vec_select:<ssehalfvecmode> + (match_operand:V16FI 1 "nonimmediate_operand" "v,v") + (parallel [(const_int 8) (const_int 9) + (const_int 10) (const_int 11) + (const_int 12) (const_int 13) + (const_int 14) (const_int 15)])))] + "TARGET_AVX512F" + "vextracti64x4\t{$0x1, %1, %0|%0, %1, 0x1}" + [(set_attr "type" "sselog") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "memory" "none,store") + (set_attr "prefix" "evex") + (set_attr "mode" "XI")]) + +(define_insn_and_split "vec_extract_lo_<mode>" [(set (match_operand:<ssehalfvecmode> 0 "nonimmediate_operand" "=x,m") (vec_select:<ssehalfvecmode> (match_operand:VI8F_256 1 "nonimmediate_operand" "xm,x") @@ -4832,6 +5368,133 @@ operands[1] = adjust_address (operands[1], DFmode, INTVAL (operands[2]) * 8); }) +(define_insn "avx512f_vmscalef<mode>" + [(set (match_operand:VF_128 0 "register_operand" "=v") + (vec_merge:VF_128 + (unspec:VF_128 [(match_operand:VF_128 1 "register_operand" "v") + (match_operand:VF_128 2 "nonimmediate_operand" "vm")] + UNSPEC_SCALEF) + (match_dup 1) + (const_int 1)))] + "TARGET_AVX512F" + "%vscalef<ssescalarmodesuffix>\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "prefix" "evex") + (set_attr "mode" "<ssescalarmode>")]) + +(define_insn "avx512f_scalef<mode>" + [(set (match_operand:VF_512 0 "register_operand" "=v") + (unspec:VF_512 [(match_operand:VF_512 1 "register_operand" "v") + (match_operand:VF_512 2 "nonimmediate_operand" "vm")] + UNSPEC_SCALEF))] + "TARGET_AVX512F" + "%vscalef<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "prefix" "evex") + (set_attr "mode" "<MODE>")]) + +(define_insn "avx512f_vternlog<mode>" + [(set (match_operand:VI48_512 0 "register_operand" "=v") + (unspec:VI48_512 + [(match_operand:VI48_512 1 "register_operand" "0") + (match_operand:VI48_512 2 "register_operand" "v") + (match_operand:VI48_512 3 "nonimmediate_operand" "vm") + (match_operand:SI 4 "const_0_to_255_operand")] + UNSPEC_VTERNLOG))] + "TARGET_AVX512F" + "vpternlog<ssemodesuffix>\t{%4, %3, %2, %0|%0, %2, %3, %4}" + [(set_attr "type" "sselog") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + +(define_insn "avx512f_getexp<mode>" + [(set (match_operand:VF_512 0 "register_operand" "=v") + (unspec:VF_512 [(match_operand:VF_512 1 "nonimmediate_operand" "vm")] + UNSPEC_GETEXP))] + "TARGET_AVX512F" + "vgetexp<ssemodesuffix>\t{%1, %0|%0, %1}"; + 
+(define_insn "avx512f_sgetexp<mode>" + [(set (match_operand:VF_128 0 "register_operand" "=v") + (vec_merge:VF_128 + (unspec:VF_128 [(match_operand:VF_128 1 "register_operand" "v") + (match_operand:VF_128 2 "nonimmediate_operand" "vm")] + UNSPEC_GETEXP) + (match_dup 1) + (const_int 1)))] + "TARGET_AVX512F" + "vgetexp<ssescalarmodesuffix>\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "prefix" "evex") + (set_attr "mode" "<ssescalarmode>")]) + +(define_insn "avx512f_align<mode>" + [(set (match_operand:VI48_512 0 "register_operand" "=v") + (unspec:VI48_512 [(match_operand:VI48_512 1 "register_operand" "v") + (match_operand:VI48_512 2 "nonimmediate_operand" "vm") + (match_operand:SI 3 "const_0_to_255_operand")] + UNSPEC_ALIGN))] + "TARGET_AVX512F" + "valign<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + +(define_insn "avx512f_fixupimm<mode>" + [(set (match_operand:VF_512 0 "register_operand" "=v") + (unspec:VF_512 + [(match_operand:VF_512 1 "register_operand" "0") + (match_operand:VF_512 2 "register_operand" "v") + (match_operand:<sseintvecmode> 3 "nonimmediate_operand" "vm") + (match_operand:SI 4 "const_0_to_255_operand")] + UNSPEC_FIXUPIMM))] + "TARGET_AVX512F" + "vfixupimm<ssemodesuffix>\t{%4, %3, %2, %0|%0, %2, %3, %4}" + [(set_attr "prefix" "evex") + (set_attr "mode" "<MODE>")]) + +(define_insn "avx512f_sfixupimm<mode>" + [(set (match_operand:VF_128 0 "register_operand" "=v") + (vec_merge:VF_128 + (unspec:VF_128 + [(match_operand:VF_128 1 "register_operand" "0") + (match_operand:VF_128 2 "register_operand" "v") + (match_operand:<sseintvecmode> 3 "nonimmediate_operand" "vm") + (match_operand:SI 4 "const_0_to_255_operand")] + UNSPEC_FIXUPIMM) + (match_dup 1) + (const_int 1)))] + "TARGET_AVX512F" + "vfixupimm<ssescalarmodesuffix>\t{%4, %3, %2, %0|%0, %2, %3, %4}" + [(set_attr "prefix" "evex") + (set_attr "mode" "<ssescalarmode>")]) + +(define_insn "avx512f_rndscale<mode>" + [(set (match_operand:VF_512 0 "register_operand" "=v") + (unspec:VF_512 + [(match_operand:VF_512 1 "nonimmediate_operand" "vm") + (match_operand:SI 2 "const_0_to_255_operand")] + UNSPEC_ROUND))] + "TARGET_AVX512F" + "vrndscale<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "length_immediate" "1") + (set_attr "prefix" "evex") + (set_attr "mode" "<MODE>")]) + +(define_insn "avx512f_rndscale<mode>" + [(set (match_operand:VF_128 0 "register_operand" "=v") + (vec_merge:VF_128 + (unspec:VF_128 + [(match_operand:VF_128 1 "register_operand" "v") + (match_operand:VF_128 2 "nonimmediate_operand" "vm") + (match_operand:SI 3 "const_0_to_255_operand")] + UNSPEC_ROUND) + (match_dup 1) + (const_int 1)))] + "TARGET_AVX512F" + "vrndscale<ssescalarmodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "length_immediate" "1") + (set_attr "prefix" "evex") + (set_attr "mode" "<MODE>")]) + (define_expand "avx_shufpd256" [(match_operand:V4DF 0 "register_operand") (match_operand:V4DF 1 "register_operand") @@ -5505,8 +6168,7 @@ (const_int 4) (const_int 6)])))))] "TARGET_AVX2 && ix86_binary_operator_ok (MULT, V8SImode, operands)" "vpmuldq\t{%2, %1, %0|%0, %1, %2}" - [(set_attr "isa" "avx") - (set_attr "type" "sseimul") + [(set_attr "type" "sseimul") (set_attr "prefix_extra" "1") (set_attr "prefix" "vex") (set_attr "mode" "OI")]) @@ -5679,10 +6341,10 @@ (set_attr "mode" "TI")]) (define_expand "mul<mode>3" - [(set (match_operand:VI4_AVX2 0 "register_operand") - (mult:VI4_AVX2 - (match_operand:VI4_AVX2 1 "general_vector_operand") - (match_operand:VI4_AVX2 2 "general_vector_operand")))] + [(set (match_operand:VI4_AVX512F 0 "register_operand") + (mult:VI4_AVX512F + (match_operand:VI4_AVX512F 1 "general_vector_operand") + (match_operand:VI4_AVX512F 2 "general_vector_operand")))] "TARGET_SSE2" { if (TARGET_SSE4_1) @@ -5701,10 +6363,10 @@ })
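Illustration of what widening mul<mode>3 to VI4_AVX512F enables (not from the patch; assumes -mavx512f): a 32-bit lane-wise product now vectorizes at zmm width.

#include <immintrin.h>

/* Sixteen 32-bit multiplies in one vpmulld; the insn below matches
   the V16SI mode through the widened iterator.  */
static __m512i
mullo_v16si (__m512i a, __m512i b)
{
  return _mm512_mullo_epi32 (a, b);
}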
"general_vector_operand") - (match_operand:VI4_AVX2 2 "general_vector_operand")))] + [(set (match_operand:VI4_AVX512F 0 "register_operand") + (mult:VI4_AVX512F + (match_operand:VI4_AVX512F 1 "general_vector_operand") + (match_operand:VI4_AVX512F 2 "general_vector_operand")))] "TARGET_SSE2" { if (TARGET_SSE4_1) @@ -5701,10 +6363,10 @@ }) (define_insn "*<sse4_1_avx2>_mul<mode>3" - [(set (match_operand:VI4_AVX2 0 "register_operand" "=x,v") - (mult:VI4_AVX2 - (match_operand:VI4_AVX2 1 "nonimmediate_operand" "%0,v") - (match_operand:VI4_AVX2 2 "nonimmediate_operand" "xm,vm")))] + [(set (match_operand:VI4_AVX512F 0 "register_operand" "=x,v") + (mult:VI4_AVX512F + (match_operand:VI4_AVX512F 1 "nonimmediate_operand" "%0,v") + (match_operand:VI4_AVX512F 2 "nonimmediate_operand" "xm,vm")))] "TARGET_SSE4_1 && ix86_binary_operator_ok (MULT, <MODE>mode, operands)" "@ pmulld\t{%2, %0|%0, %2} @@ -5717,9 +6379,10 @@ (set_attr "mode" "<sseinsnmode>")]) (define_expand "mul<mode>3" - [(set (match_operand:VI8_AVX2 0 "register_operand") - (mult:VI8_AVX2 (match_operand:VI8_AVX2 1 "register_operand") - (match_operand:VI8_AVX2 2 "register_operand")))] + [(set (match_operand:VI8_AVX2_AVX512F 0 "register_operand") + (mult:VI8_AVX2_AVX512F + (match_operand:VI8_AVX2_AVX512F 1 "register_operand") + (match_operand:VI8_AVX2_AVX512F 2 "register_operand")))] "TARGET_SSE2" { ix86_expand_sse2_mulvxdi3 (operands[0], operands[1], operands[2]); @@ -5766,8 +6429,8 @@ (define_expand "vec_widen_<s>mult_odd_<mode>" [(match_operand:<sseunpackmode> 0 "register_operand") (any_extend:<sseunpackmode> - (match_operand:VI4_AVX2 1 "general_vector_operand")) - (match_operand:VI4_AVX2 2 "general_vector_operand")] + (match_operand:VI4_AVX512F 1 "general_vector_operand")) + (match_operand:VI4_AVX512F 2 "general_vector_operand")] "TARGET_SSE2" { ix86_expand_mul_widen_evenodd (operands[0], operands[1], operands[2], @@ -5825,9 +6488,9 @@ (set_attr "mode" "<sseinsnmode>")]) (define_insn "<shift_insn><mode>3" - [(set (match_operand:VI248_AVX2 0 "register_operand" "=x,v") + [(set (match_operand:VI248_AVX2 0 "register_operand" "=x,x") (any_lshift:VI248_AVX2 - (match_operand:VI248_AVX2 1 "register_operand" "0,v") + (match_operand:VI248_AVX2 1 "register_operand" "0,x") (match_operand:SI 2 "nonmemory_operand" "xN,xN")))] "TARGET_SSE2" "@ @@ -5843,6 +6506,22 @@ (set_attr "prefix" "orig,vex") (set_attr "mode" "<sseinsnmode>")]) +(define_insn "<shift_insn><mode>3" + [(set (match_operand:VI48_512 0 "register_operand" "=v,v") + (any_lshift:VI48_512 + (match_operand:VI48_512 1 "register_operand" "v,m") + (match_operand:SI 2 "nonmemory_operand" "vN,N")))] + "TARGET_AVX512F" + "vp<vshift><ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "isa" "avx512f") + (set_attr "type" "sseishft") + (set (attr "length_immediate") + (if_then_else (match_operand 2 "const_int_operand") + (const_string "1") + (const_string "0"))) + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + (define_expand "vec_shl_<mode>" [(set (match_operand:VI_128 0 "register_operand") (ashift:V1TI @@ -5918,25 +6597,44 @@ (set_attr "prefix" "orig,vex") (set_attr "mode" "<sseinsnmode>")]) +(define_insn "avx512f_<rotate>v<mode>" + [(set (match_operand:VI48_512 0 "register_operand" "=v") + (any_rotate:VI48_512 + (match_operand:VI48_512 1 "register_operand" "v") + (match_operand:VI48_512 2 "nonimmediate_operand" "vm")))] + "TARGET_AVX512F" + "vp<rotate>v<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + +(define_insn 
"avx512f_<rotate><mode>" + [(set (match_operand:VI48_512 0 "register_operand" "=v") + (any_rotate:VI48_512 + (match_operand:VI48_512 1 "nonimmediate_operand" "vm") + (match_operand:SI 2 "const_0_to_255_operand")))] + "TARGET_AVX512F" + "vp<rotate><ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) (define_expand "<code><mode>3" - [(set (match_operand:VI124_256 0 "register_operand") - (maxmin:VI124_256 - (match_operand:VI124_256 1 "nonimmediate_operand") - (match_operand:VI124_256 2 "nonimmediate_operand")))] + [(set (match_operand:VI124_256_48_512 0 "register_operand") + (maxmin:VI124_256_48_512 + (match_operand:VI124_256_48_512 1 "nonimmediate_operand") + (match_operand:VI124_256_48_512 2 "nonimmediate_operand")))] "TARGET_AVX2" "ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);") (define_insn "*avx2_<code><mode>3" - [(set (match_operand:VI124_256 0 "register_operand" "=v") - (maxmin:VI124_256 - (match_operand:VI124_256 1 "nonimmediate_operand" "%v") - (match_operand:VI124_256 2 "nonimmediate_operand" "vm")))] + [(set (match_operand:VI124_256_48_512 0 "register_operand" "=v") + (maxmin:VI124_256_48_512 + (match_operand:VI124_256_48_512 1 "nonimmediate_operand" "%v") + (match_operand:VI124_256_48_512 2 "nonimmediate_operand" "vm")))] "TARGET_AVX2 && ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)" "vp<maxmin_int><ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}" [(set_attr "type" "sseiadd") (set_attr "prefix_extra" "1") - (set_attr "prefix" "vex") + (set_attr "prefix" "maybe_evex") (set_attr "mode" "OI")]) (define_expand "<code><mode>3" @@ -6151,6 +6849,28 @@ (set_attr "prefix" "vex") (set_attr "mode" "OI")]) +(define_expand "avx512f_eq<mode>3" + [(set (match_operand:<avx512fmaskmode> 0 "register_operand") + (unspec:<avx512fmaskmode> + [(match_operand:VI48_512 1 "register_operand") + (match_operand:VI48_512 2 "nonimmediate_operand")] + UNSPEC_MASKED_EQ))] + "TARGET_AVX512F" + "ix86_fixup_binary_operands_no_copy (EQ, <MODE>mode, operands);") + +(define_insn "avx512f_eq<mode>3_1" + [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k") + (unspec:<avx512fmaskmode> + [(match_operand:VI48_512 1 "register_operand" "%v") + (match_operand:VI48_512 2 "nonimmediate_operand" "vm")] + UNSPEC_MASKED_EQ))] + "TARGET_AVX512F && ix86_binary_operator_ok (EQ, <MODE>mode, operands)" + "vpcmpeq<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "ssecmp") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + (define_insn "*sse4_1_eqv2di3" [(set (match_operand:V2DI 0 "register_operand" "=x,x") (eq:V2DI @@ -6225,6 +6945,18 @@ (set_attr "prefix" "vex") (set_attr "mode" "OI")]) +(define_insn "avx512f_gt<mode>3" + [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k") + (unspec:<avx512fmaskmode> + [(match_operand:VI48_512 1 "register_operand" "v") + (match_operand:VI48_512 2 "nonimmediate_operand" "vm")] UNSPEC_MASKED_GT))] + "TARGET_AVX512F" + "vpcmpgt<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "ssecmp") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + (define_insn "sse2_gt<mode>3" [(set (match_operand:VI124_128 0 "register_operand" "=x,x") (gt:VI124_128 @@ -6240,6 +6972,23 @@ (set_attr "prefix" "orig,vex") (set_attr "mode" "TI")]) +(define_expand "vcond<V_512:mode><VI_512:mode>" + [(set (match_operand:V_512 0 "register_operand") + (if_then_else:V_512 + (match_operator 3 "" + 
[(match_operand:VI_512 4 "nonimmediate_operand") + (match_operand:VI_512 5 "general_operand")]) + (match_operand:V_512 1) + (match_operand:V_512 2)))] + "TARGET_AVX512F + && (GET_MODE_NUNITS (<V_512:MODE>mode) + == GET_MODE_NUNITS (<VI_512:MODE>mode))" +{ + bool ok = ix86_expand_int_vcond (operands); + gcc_assert (ok); + DONE; +}) + (define_expand "vcond<V_256:mode><VI_256:mode>" [(set (match_operand:V_256 0 "register_operand") (if_then_else:V_256 @@ -6289,6 +7038,23 @@ DONE; }) +(define_expand "vcondu<V_512:mode><VI_512:mode>" + [(set (match_operand:V_512 0 "register_operand") + (if_then_else:V_512 + (match_operator 3 "" + [(match_operand:VI_512 4 "nonimmediate_operand") + (match_operand:VI_512 5 "nonimmediate_operand")]) + (match_operand:V_512 1 "general_operand") + (match_operand:V_512 2 "general_operand")))] + "TARGET_AVX512F + && (GET_MODE_NUNITS (<V_512:MODE>mode) + == GET_MODE_NUNITS (<VI_512:MODE>mode))" +{ + bool ok = ix86_expand_int_vcond (operands); + gcc_assert (ok); + DONE; +}) + (define_expand "vcondu<V_256:mode><VI_256:mode>" [(set (match_operand:V_256 0 "register_operand") (if_then_else:V_256 @@ -6411,12 +7177,18 @@ (match_operand:VI 2 "nonimmediate_operand" "xm,vm")))] "TARGET_SSE" { - static char buf[32]; + static char buf[64]; const char *ops; const char *tmp; switch (get_attr_mode (insn)) { + case MODE_XI: + gcc_assert (TARGET_AVX512F); + + tmp = "pandn<ssemodesuffix>"; + break; + case MODE_OI: gcc_assert (TARGET_AVX2); case MODE_TI: @@ -6496,12 +7268,17 @@ "TARGET_SSE && ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)" { - static char buf[32]; + static char buf[64]; const char *ops; const char *tmp; switch (get_attr_mode (insn)) { + case MODE_XI: + gcc_assert (TARGET_AVX512F); + tmp = "p<logic><ssemodesuffix>"; + break; + case MODE_OI: gcc_assert (TARGET_AVX2); case MODE_TI: @@ -6510,6 +7287,8 @@ tmp = "p<logic>"; break; + case MODE_V16SF: + gcc_assert (TARGET_AVX512F); case MODE_V8SF: gcc_assert (TARGET_AVX); case MODE_V4SF: @@ -6562,6 +7341,28 @@ ] (const_string "<sseinsnmode>")))]) +(define_insn "avx512f_testm<mode>3" + [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k") + (unspec:<avx512fmaskmode> + [(match_operand:VI48_512 1 "register_operand" "v") + (match_operand:VI48_512 2 "nonimmediate_operand" "vm")] + UNSPEC_TESTM))] + "TARGET_AVX512F" + "vptestm<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + +(define_insn "avx512f_testnm<mode>3" + [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k") + (unspec:<avx512fmaskmode> + [(match_operand:VI48_512 1 "register_operand" "v") + (match_operand:VI48_512 2 "nonimmediate_operand" "vm")] + UNSPEC_TESTNM))] + "TARGET_AVX512CD" + "%vptestnm<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; ;; Parallel integral element swizzling @@ -6570,8 +7371,8 @@ (define_expand "vec_pack_trunc_<mode>" [(match_operand:<ssepackmode> 0 "register_operand") - (match_operand:VI248_AVX2 1 "register_operand") - (match_operand:VI248_AVX2 2 "register_operand")] + (match_operand:VI248_AVX2_8_AVX512F 1 "register_operand") + (match_operand:VI248_AVX2_8_AVX512F 2 "register_operand")] "TARGET_SSE2" { rtx op1 = gen_lowpart (<ssepackmode>mode, operands[1]); @@ -6983,6 +7784,198 @@ (set_attr "prefix" "orig,orig,vex,vex") (set_attr "mode" "TI")]) +(define_insn "avx512f_vinsert<shuffletype>32x4_1" + [(set (match_operand:V16FI 0 
"register_operand" "=v") + (vec_merge:V16FI + (match_operand:V16FI 1 "register_operand" "v") + (vec_duplicate:V16FI + (match_operand:<ssequartermode> 2 "nonimmediate_operand" "vm")) + (match_operand:SI 3 "const_int_operand" "n")))] + "TARGET_AVX512F" +{ + int mask; + if (INTVAL (operands[3]) == 0xFFF) + mask = 0; + else if ( INTVAL (operands[3]) == 0xF0FF) + mask = 1; + else if ( INTVAL (operands[3]) == 0xFF0F) + mask = 2; + else if ( INTVAL (operands[3]) == 0xFFF0) + mask = 3; + else + gcc_unreachable (); + + operands[3] = GEN_INT (mask); + + return "vinsert<shuffletype>32x4\t{%3, %2, %1, %0|%0, %1, %2, %3}"; +} + [(set_attr "type" "sselog") + (set_attr "length_immediate" "1") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + +(define_insn "vec_set_lo_<mode>" + [(set (match_operand:V8FI 0 "register_operand" "=v") + (vec_concat:V8FI + (match_operand:<ssehalfvecmode> 2 "nonimmediate_operand" "vm") + (vec_select:<ssehalfvecmode> + (match_operand:V8FI 1 "register_operand" "v") + (parallel [(const_int 4) (const_int 5) + (const_int 6) (const_int 7)]))))] + "TARGET_AVX512F" + "vinsert<shuffletype>64x4\t{$0x0, %2, %1, %0|%0, %1, %2, $0x0}" + [(set_attr "type" "sselog") + (set_attr "length_immediate" "1") + (set_attr "prefix" "evex") + (set_attr "mode" "XI")]) + +(define_insn "vec_set_hi_<mode>" + [(set (match_operand:V8FI 0 "register_operand" "=v") + (vec_concat:V8FI + (match_operand:<ssehalfvecmode> 2 "nonimmediate_operand" "vm") + (vec_select:<ssehalfvecmode> + (match_operand:V8FI 1 "register_operand" "v") + (parallel [(const_int 0) (const_int 1) + (const_int 2) (const_int 3)]))))] + "TARGET_AVX512F" + "vinsert<shuffletype>64x4\t{$0x1, %2, %1, %0|%0, %1, %2, $0x1}" + [(set_attr "type" "sselog") + (set_attr "length_immediate" "1") + (set_attr "prefix" "evex") + (set_attr "mode" "XI")]) + +(define_insn "avx512f_shuf_<shuffletype>64x2_1" + [(set (match_operand:V8FI 0 "register_operand" "=v") + (vec_select:V8FI + (vec_concat:<ssedoublemode> + (match_operand:V8FI 1 "register_operand" "v") + (match_operand:V8FI 2 "nonimmediate_operand" "vm")) + (parallel [(match_operand 3 "const_0_to_7_operand") + (match_operand 4 "const_0_to_7_operand") + (match_operand 5 "const_0_to_7_operand") + (match_operand 6 "const_0_to_7_operand") + (match_operand 7 "const_8_to_15_operand") + (match_operand 8 "const_8_to_15_operand") + (match_operand 9 "const_8_to_15_operand") + (match_operand 10 "const_8_to_15_operand")])))] + "TARGET_AVX512F + && (INTVAL (operands[3]) == (INTVAL (operands[4]) - 1) + && INTVAL (operands[5]) == (INTVAL (operands[6]) - 1) + && INTVAL (operands[7]) == (INTVAL (operands[8]) - 1) + && INTVAL (operands[9]) == (INTVAL (operands[10]) - 1))" +{ + int mask; + mask = INTVAL (operands[3]) / 2; + mask |= INTVAL (operands[5]) / 2 << 2; + mask |= (INTVAL (operands[7]) - 8) / 2 << 4; + mask |= (INTVAL (operands[9]) - 8) / 2 << 6; + operands[3] = GEN_INT (mask); + + return "vshuf<shuffletype>64x2\t{%3, %2, %1, %0|%0, %1, %2, %3}"; +} + [(set_attr "type" "sselog") + (set_attr "length_immediate" "1") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + +(define_insn "avx512f_shuf_<shuffletype>32x4_1" + [(set (match_operand:V16FI 0 "register_operand" "=v") + (vec_select:V16FI + (vec_concat:<ssedoublemode> + (match_operand:V16FI 1 "register_operand" "v") + (match_operand:V16FI 2 "nonimmediate_operand" "vm")) + (parallel [(match_operand 3 "const_0_to_15_operand") + (match_operand 4 "const_0_to_15_operand") + (match_operand 5 "const_0_to_15_operand") + (match_operand 6 
"const_0_to_15_operand") + (match_operand 7 "const_0_to_15_operand") + (match_operand 8 "const_0_to_15_operand") + (match_operand 9 "const_0_to_15_operand") + (match_operand 10 "const_0_to_15_operand") + (match_operand 11 "const_16_to_31_operand") + (match_operand 12 "const_16_to_31_operand") + (match_operand 13 "const_16_to_31_operand") + (match_operand 14 "const_16_to_31_operand") + (match_operand 15 "const_16_to_31_operand") + (match_operand 16 "const_16_to_31_operand") + (match_operand 17 "const_16_to_31_operand") + (match_operand 18 "const_16_to_31_operand")])))] + "TARGET_AVX512F + && (INTVAL (operands[3]) == (INTVAL (operands[4]) - 1) + && INTVAL (operands[3]) == (INTVAL (operands[5]) - 2) + && INTVAL (operands[3]) == (INTVAL (operands[6]) - 3) + && INTVAL (operands[7]) == (INTVAL (operands[8]) - 1) + && INTVAL (operands[7]) == (INTVAL (operands[9]) - 2) + && INTVAL (operands[7]) == (INTVAL (operands[10]) - 3) + && INTVAL (operands[11]) == (INTVAL (operands[12]) - 1) + && INTVAL (operands[11]) == (INTVAL (operands[13]) - 2) + && INTVAL (operands[11]) == (INTVAL (operands[14]) - 3) + && INTVAL (operands[15]) == (INTVAL (operands[16]) - 1) + && INTVAL (operands[15]) == (INTVAL (operands[17]) - 2) + && INTVAL (operands[15]) == (INTVAL (operands[18]) - 3))" +{ + int mask; + mask = INTVAL (operands[3]) / 4; + mask |= INTVAL (operands[7]) / 4 << 2; + mask |= (INTVAL (operands[11]) - 16) / 4 << 4; + mask |= (INTVAL (operands[15]) - 16) / 4 << 6; + operands[3] = GEN_INT (mask); + + return "vshuf<shuffletype>32x4\t{%3, %2, %1, %0|%0, %1, %2, %3}"; +} + [(set_attr "type" "sselog") + (set_attr "length_immediate" "1") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + +(define_insn "avx512f_pshufd_1" + [(set (match_operand:V16SI 0 "register_operand" "=v") + (vec_select:V16SI + (match_operand:V16SI 1 "nonimmediate_operand" "vm") + (parallel [(match_operand 2 "const_0_to_3_operand") + (match_operand 3 "const_0_to_3_operand") + (match_operand 4 "const_0_to_3_operand") + (match_operand 5 "const_0_to_3_operand") + (match_operand 6 "const_4_to_7_operand") + (match_operand 7 "const_4_to_7_operand") + (match_operand 8 "const_4_to_7_operand") + (match_operand 9 "const_4_to_7_operand") + (match_operand 10 "const_8_to_11_operand") + (match_operand 11 "const_8_to_11_operand") + (match_operand 12 "const_8_to_11_operand") + (match_operand 13 "const_8_to_11_operand") + (match_operand 14 "const_12_to_15_operand") + (match_operand 15 "const_12_to_15_operand") + (match_operand 16 "const_12_to_15_operand") + (match_operand 17 "const_12_to_15_operand")])))] + "TARGET_AVX512F + && INTVAL (operands[2]) + 4 == INTVAL (operands[6]) + && INTVAL (operands[3]) + 4 == INTVAL (operands[7]) + && INTVAL (operands[4]) + 4 == INTVAL (operands[8]) + && INTVAL (operands[5]) + 4 == INTVAL (operands[9]) + && INTVAL (operands[2]) + 8 == INTVAL (operands[10]) + && INTVAL (operands[3]) + 8 == INTVAL (operands[11]) + && INTVAL (operands[4]) + 8 == INTVAL (operands[12]) + && INTVAL (operands[5]) + 8 == INTVAL (operands[13]) + && INTVAL (operands[2]) + 12 == INTVAL (operands[14]) + && INTVAL (operands[3]) + 12 == INTVAL (operands[15]) + && INTVAL (operands[4]) + 12 == INTVAL (operands[16]) + && INTVAL (operands[5]) + 12 == INTVAL (operands[17])" +{ + int mask = 0; + mask |= INTVAL (operands[2]) << 0; + mask |= INTVAL (operands[3]) << 2; + mask |= INTVAL (operands[4]) << 4; + mask |= INTVAL (operands[5]) << 6; + operands[2] = GEN_INT (mask); + + return "vpshufd\t{%2, %1, %0|%0, %1, %2}"; +} + [(set_attr "type" 
"sselog1") + (set_attr "prefix" "evex") + (set_attr "length_immediate" "1") + (set_attr "mode" "XI")]) + (define_expand "avx2_pshufdv3" [(match_operand:V8SI 0 "register_operand") (match_operand:V8SI 1 "nonimmediate_operand") @@ -7657,25 +8650,25 @@ (define_expand "vec_unpacks_lo_<mode>" [(match_operand:<sseunpackmode> 0 "register_operand") - (match_operand:VI124_AVX2 1 "register_operand")] + (match_operand:VI124_AVX512F 1 "register_operand")] "TARGET_SSE2" "ix86_expand_sse_unpack (operands[0], operands[1], false, false); DONE;") (define_expand "vec_unpacks_hi_<mode>" [(match_operand:<sseunpackmode> 0 "register_operand") - (match_operand:VI124_AVX2 1 "register_operand")] + (match_operand:VI124_AVX512F 1 "register_operand")] "TARGET_SSE2" "ix86_expand_sse_unpack (operands[0], operands[1], false, true); DONE;") (define_expand "vec_unpacku_lo_<mode>" [(match_operand:<sseunpackmode> 0 "register_operand") - (match_operand:VI124_AVX2 1 "register_operand")] + (match_operand:VI124_AVX512F 1 "register_operand")] "TARGET_SSE2" "ix86_expand_sse_unpack (operands[0], operands[1], true, false); DONE;") (define_expand "vec_unpacku_hi_<mode>" [(match_operand:<sseunpackmode> 0 "register_operand") - (match_operand:VI124_AVX2 1 "register_operand")] + (match_operand:VI124_AVX512F 1 "register_operand")] "TARGET_SSE2" "ix86_expand_sse_unpack (operands[0], operands[1], true, true); DONE;") @@ -7747,7 +8740,7 @@ (define_insn "<sse>_movmsk<ssemodesuffix><avxsizesuffix>" [(set (match_operand:SI 0 "register_operand" "=r") (unspec:SI - [(match_operand:VF 1 "register_operand" "x")] + [(match_operand:VF_128_256 1 "register_operand" "x")] UNSPEC_MOVMSK))] "TARGET_SSE" "%vmovmsk<ssemodesuffix>\t{%1, %0|%0, %1}" @@ -8428,9 +9421,9 @@ (set_attr "mode" "DI")]) (define_insn "abs<mode>2" - [(set (match_operand:VI124_AVX2 0 "register_operand" "=v") - (abs:VI124_AVX2 - (match_operand:VI124_AVX2 1 "nonimmediate_operand" "vm")))] + [(set (match_operand:VI124_AVX2_48_AVX512F 0 "register_operand" "=v") + (abs:VI124_AVX2_48_AVX512F + (match_operand:VI124_AVX2_48_AVX512F 1 "nonimmediate_operand" "vm")))] "TARGET_SSSE3" "%vpabs<ssemodesuffix>\t{%1, %0|%0, %1}" [(set_attr "type" "sselog1") @@ -8537,10 +9530,10 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (define_insn "<sse4_1>_blend<ssemodesuffix><avxsizesuffix>" - [(set (match_operand:VF 0 "register_operand" "=x,x") - (vec_merge:VF - (match_operand:VF 2 "nonimmediate_operand" "xm,xm") - (match_operand:VF 1 "register_operand" "0,x") + [(set (match_operand:VF_128_256 0 "register_operand" "=x,x") + (vec_merge:VF_128_256 + (match_operand:VF_128_256 2 "nonimmediate_operand" "xm,xm") + (match_operand:VF_128_256 1 "register_operand" "0,x") (match_operand:SI 3 "const_0_to_<blendbits>_operand")))] "TARGET_SSE4_1" "@ @@ -8555,11 +9548,11 @@ (set_attr "mode" "<MODE>")]) (define_insn "<sse4_1>_blendv<ssemodesuffix><avxsizesuffix>" - [(set (match_operand:VF 0 "register_operand" "=x,x") - (unspec:VF - [(match_operand:VF 1 "register_operand" "0,x") - (match_operand:VF 2 "nonimmediate_operand" "xm,xm") - (match_operand:VF 3 "register_operand" "Yz,x")] + [(set (match_operand:VF_128_256 0 "register_operand" "=x,x") + (unspec:VF_128_256 + [(match_operand:VF_128_256 1 "register_operand" "0,x") + (match_operand:VF_128_256 2 "nonimmediate_operand" "xm,xm") + (match_operand:VF_128_256 3 "register_operand" "Yz,x")] UNSPEC_BLENDV))] "TARGET_SSE4_1" "@ @@ -8575,10 +9568,10 @@ (set_attr "mode" "<MODE>")]) (define_insn "<sse4_1>_dp<ssemodesuffix><avxsizesuffix>" - [(set 
(match_operand:VF 0 "register_operand" "=x,x") - (unspec:VF - [(match_operand:VF 1 "nonimmediate_operand" "%0,x") - (match_operand:VF 2 "nonimmediate_operand" "xm,xm") + [(set (match_operand:VF_128_256 0 "register_operand" "=x,x") + (unspec:VF_128_256 + [(match_operand:VF_128_256 1 "nonimmediate_operand" "%0,x") + (match_operand:VF_128_256 2 "nonimmediate_operand" "xm,xm") (match_operand:SI 3 "const_0_to_255_operand" "n,n")] UNSPEC_DP))] "TARGET_SSE4_1" @@ -8909,8 +9902,8 @@ ;; setting FLAGS_REG. But it is not a really compare instruction. (define_insn "avx_vtest<ssemodesuffix><avxsizesuffix>" [(set (reg:CC FLAGS_REG) - (unspec:CC [(match_operand:VF 0 "register_operand" "x") - (match_operand:VF 1 "nonimmediate_operand" "xm")] + (unspec:CC [(match_operand:VF_128_256 0 "register_operand" "x") + (match_operand:VF_128_256 1 "nonimmediate_operand" "xm")] UNSPEC_VTESTP))] "TARGET_AVX" "vtest<ssemodesuffix>\t{%1, %0|%0, %1}" @@ -8947,9 +9940,9 @@ (set_attr "mode" "TI")]) (define_insn "<sse4_1>_round<ssemodesuffix><avxsizesuffix>" - [(set (match_operand:VF 0 "register_operand" "=x") - (unspec:VF - [(match_operand:VF 1 "nonimmediate_operand" "xm") + [(set (match_operand:VF_128_256 0 "register_operand" "=x") + (unspec:VF_128_256 + [(match_operand:VF_128_256 1 "nonimmediate_operand" "xm") (match_operand:SI 2 "const_0_to_15_operand" "n")] UNSPEC_ROUND))] "TARGET_ROUND" @@ -8967,7 +9960,7 @@ (define_expand "<sse4_1>_round<ssemodesuffix>_sfix<avxsizesuffix>" [(match_operand:<sseintvecmode> 0 "register_operand") - (match_operand:VF1 1 "nonimmediate_operand") + (match_operand:VF1_128_256 1 "nonimmediate_operand") (match_operand:SI 2 "const_0_to_15_operand")] "TARGET_ROUND" { @@ -8981,6 +9974,16 @@ DONE; }) +(define_expand "avx512f_roundpd512" + [(match_operand:V8DF 0 "register_operand") + (match_operand:V8DF 1 "nonimmediate_operand") + (match_operand:SI 2 "const_0_to_15_operand")] + "TARGET_AVX512F" +{ + emit_insn (gen_avx512f_rndscalev8df (operands[0], operands[1], operands[2])); + DONE; +}) + (define_expand "<sse4_1>_round<ssemodesuffix>_vec_pack_sfix<avxsizesuffix>" [(match_operand:<ssepackfltmode> 0 "register_operand") (match_operand:VF2 1 "nonimmediate_operand") @@ -9076,7 +10079,7 @@ (define_expand "round<mode>2_sfix" [(match_operand:<sseintvecmode> 0 "register_operand") - (match_operand:VF1 1 "register_operand")] + (match_operand:VF1_128_256 1 "register_operand")] "TARGET_ROUND && !flag_trapping_math" { rtx tmp = gen_reg_rtx (<MODE>mode); @@ -9508,6 +10511,178 @@ (set_attr "btver2_decode" "vector,vector,vector,vector") (set_attr "mode" "TI")]) +(define_expand "avx512pf_gatherpf<mode>" + [(unspec + [(match_operand:<avx512fmaskmode> 0 "register_or_constm1_operand") + (mem:<ssescalarmode> + (match_par_dup 5 + [(match_operand 2 "vsib_address_operand") + (match_operand:VI48_512 1 "register_operand") + (match_operand:SI 3 "const1248_operand")])) + (match_operand:SI 4 "const_0_to_1_operand")] + UNSPEC_GATHER_PREFETCH)] + "TARGET_AVX512PF" +{ + operands[5] + = gen_rtx_UNSPEC (Pmode, gen_rtvec (3, operands[2], operands[1], + operands[3]), UNSPEC_VSIBADDR); +}) + +(define_insn "*avx512pf_gatherpf<mode>_mask" + [(unspec + [(match_operand:<avx512fmaskmode> 0 "register_operand" "k") + (match_operator:<ssescalarmode> 5 "vsib_mem_operator" + [(unspec:P + [(match_operand:P 2 "vsib_address_operand" "p") + (match_operand:VI48_512 1 "register_operand" "v") + (match_operand:SI 3 "const1248_operand" "n")] + UNSPEC_VSIBADDR)]) + (match_operand:SI 4 "const_0_to_1_operand" "n")] + UNSPEC_GATHER_PREFETCH)] + 
"TARGET_AVX512PF" +{ + switch (INTVAL (operands[4])) + { + case 0: + return "vgatherpf0<ssemodesuffix>ps\t{%5%{%0%}|%5%{%0%}}"; + case 1: + return "vgatherpf1<ssemodesuffix>ps\t{%5%{%0%}|%5%{%0%}}"; + default: + gcc_unreachable (); + } +} + [(set_attr "type" "sse") + (set_attr "prefix" "evex") + (set_attr "mode" "XI")]) + +(define_insn "*avx512pf_gatherpf<mode>" + [(unspec + [(const_int -1) + (match_operator:<ssescalarmode> 4 "vsib_mem_operator" + [(unspec:P + [(match_operand:P 1 "vsib_address_operand" "p") + (match_operand:VI48_512 0 "register_operand" "v") + (match_operand:SI 2 "const1248_operand" "n")] + UNSPEC_VSIBADDR)]) + (match_operand:SI 3 "const_0_to_1_operand" "n")] + UNSPEC_GATHER_PREFETCH)] + "TARGET_AVX512PF" +{ + switch (INTVAL (operands[3])) + { + case 0: + return "vgatherpf0<ssemodesuffix>ps\t{%4|%4}"; + case 1: + return "vgatherpf1<ssemodesuffix>ps\t{%4|%4}"; + default: + gcc_unreachable (); + } +} + [(set_attr "type" "sse") + (set_attr "prefix" "evex") + (set_attr "mode" "XI")]) + +(define_expand "avx512pf_scatterpf<mode>" + [(unspec + [(match_operand:<avx512fmaskmode> 0 "register_or_constm1_operand") + (mem:<ssescalarmode> + (match_par_dup 5 + [(match_operand 2 "vsib_address_operand") + (match_operand:VI48_512 1 "register_operand") + (match_operand:SI 3 "const1248_operand")])) + (match_operand:SI 4 "const_0_to_1_operand")] + UNSPEC_SCATTER_PREFETCH)] + "TARGET_AVX512PF" +{ + operands[5] + = gen_rtx_UNSPEC (Pmode, gen_rtvec (3, operands[2], operands[1], + operands[3]), UNSPEC_VSIBADDR); +}) + +(define_insn "*avx512pf_scatterpf<mode>_mask" + [(unspec + [(match_operand:<avx512fmaskmode> 0 "register_operand" "k") + (match_operator:<ssescalarmode> 5 "vsib_mem_operator" + [(unspec:P + [(match_operand:P 2 "vsib_address_operand" "p") + (match_operand:VI48_512 1 "register_operand" "v") + (match_operand:SI 3 "const1248_operand" "n")] + UNSPEC_VSIBADDR)]) + (match_operand:SI 4 "const_0_to_1_operand" "n")] + UNSPEC_SCATTER_PREFETCH)] + "TARGET_AVX512PF" +{ + switch (INTVAL (operands[4])) + { + case 0: + return "vscatterpf0<ssemodesuffix>ps\t{%5%{%0%}|%5%{%0%}}"; + case 1: + return "vscatterpf1<ssemodesuffix>ps\t{%5%{%0%}|%5%{%0%}}"; + default: + gcc_unreachable (); + } +} + [(set_attr "type" "sse") + (set_attr "prefix" "evex") + (set_attr "mode" "XI")]) + +(define_insn "*avx512pf_scatterpf<mode>" + [(unspec + [(const_int -1) + (match_operator:<ssescalarmode> 4 "vsib_mem_operator" + [(unspec:P + [(match_operand:P 1 "vsib_address_operand" "p") + (match_operand:VI48_512 0 "register_operand" "v") + (match_operand:SI 2 "const1248_operand" "n")] + UNSPEC_VSIBADDR)]) + (match_operand:SI 3 "const_0_to_1_operand" "n")] + UNSPEC_SCATTER_PREFETCH)] + "TARGET_AVX512PF" +{ + switch (INTVAL (operands[3])) + { + case 0: + return "vscatterpf0<ssemodesuffix>ps\t{%4|%4}"; + case 1: + return "vscatterpf1<ssemodesuffix>ps\t{%4|%4}"; + default: + gcc_unreachable (); + } +} + [(set_attr "type" "sse") + (set_attr "prefix" "evex") + (set_attr "mode" "XI")]) + +(define_insn "*avx512er_exp2<mode>" + [(set (match_operand:VF_512 0 "register_operand" "=v") + (unspec:VF_512 + [(match_operand:VF_512 1 "nonimmediate_operand" "vm")] + UNSPEC_EXP2))] + "TARGET_AVX512ER" + "vexp2<ssemodesuffix>\t{%1, %0|%0, %1}" + [(set_attr "prefix" "evex") + (set_attr "mode" "<MODE>")]) + +(define_insn "*avx512er_rcp28<mode>" + [(set (match_operand:VF_512 0 "register_operand" "=v") + (unspec:VF_512 + [(match_operand:VF_512 1 "nonimmediate_operand" "vm")] + UNSPEC_RCP28))] + "TARGET_AVX512ER" + "vrcp28<ssemodesuffix>\t{%1, %0|%0, 
%1}" + [(set_attr "prefix" "evex") + (set_attr "mode" "<MODE>")]) + +(define_insn "avx512er_rsqrt28<mode>" + [(set (match_operand:VF_512 0 "register_operand" "=v") + (unspec:VF_512 + [(match_operand:VF_512 1 "nonimmediate_operand" "vm")] + UNSPEC_RSQRT28))] + "TARGET_AVX512ER" + "vrsqrt28<ssemodesuffix>\t{%1, %0|%0, %1}" + [(set_attr "prefix" "evex") + (set_attr "mode" "<MODE>")]) + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; ;; XOP instructions @@ -10055,6 +11230,13 @@ }) (define_expand "vlshr<mode>3" + [(set (match_operand:VI48_512 0 "register_operand") + (lshiftrt:VI48_512 + (match_operand:VI48_512 1 "register_operand") + (match_operand:VI48_512 2 "nonimmediate_operand")))] + "TARGET_AVX512F") + +(define_expand "vlshr<mode>3" [(set (match_operand:VI48_256 0 "register_operand") (lshiftrt:VI48_256 (match_operand:VI48_256 1 "register_operand") @@ -10122,6 +11304,13 @@ }) (define_expand "vashl<mode>3" + [(set (match_operand:VI48_512 0 "register_operand") + (ashift:VI48_512 + (match_operand:VI48_512 1 "register_operand") + (match_operand:VI48_512 2 "nonimmediate_operand")))] + "TARGET_AVX512F") + +(define_expand "vashl<mode>3" [(set (match_operand:VI48_256 0 "register_operand") (ashift:VI48_256 (match_operand:VI48_256 1 "register_operand") @@ -10341,10 +11530,10 @@ (set_attr "mode" "TI")]) (define_insn "xop_vpermil2<mode>3" - [(set (match_operand:VF 0 "register_operand" "=x") - (unspec:VF - [(match_operand:VF 1 "register_operand" "x") - (match_operand:VF 2 "nonimmediate_operand" "%x") + [(set (match_operand:VF_128_256 0 "register_operand" "=x") + (unspec:VF_128_256 + [(match_operand:VF_128_256 1 "register_operand" "x") + (match_operand:VF_128_256 2 "nonimmediate_operand" "%x") (match_operand:<sseintvecmode> 3 "nonimmediate_operand" "xm") (match_operand:SI 4 "const_0_to_3_operand" "n")] UNSPEC_VPERMIL2))] @@ -10506,17 +11695,11 @@ (set_attr "btver2_decode" "vector") (set_attr "mode" "OI")]) -(define_mode_attr AVXTOSSEMODE - [(V4DI "V2DI") (V2DI "V2DI") - (V8SI "V4SI") (V4SI "V4SI") - (V16HI "V8HI") (V8HI "V8HI") - (V32QI "V16QI") (V16QI "V16QI")]) - (define_insn "avx2_pbroadcast<mode>" [(set (match_operand:VI 0 "register_operand" "=x") (vec_duplicate:VI (vec_select:<ssescalarmode> - (match_operand:<AVXTOSSEMODE> 1 "nonimmediate_operand" "xm") + (match_operand:<ssexmmmode> 1 "nonimmediate_operand" "xm") (parallel [(const_int 0)]))))] "TARGET_AVX2" "vpbroadcast<ssemodesuffix>\t{%1, %0|%0, %<iptr>1}" @@ -10540,37 +11723,37 @@ (set_attr "prefix" "vex") (set_attr "mode" "<sseinsnmode>")]) -(define_insn "avx2_permvar<mode>" - [(set (match_operand:VI4F_256 0 "register_operand" "=v") - (unspec:VI4F_256 - [(match_operand:VI4F_256 1 "nonimmediate_operand" "vm") - (match_operand:V8SI 2 "register_operand" "v")] +(define_insn "<avx2_avx512f>_permvar<mode>" + [(set (match_operand:VI48F_256_512 0 "register_operand" "=v") + (unspec:VI48F_256_512 + [(match_operand:VI48F_256_512 1 "nonimmediate_operand" "vm") + (match_operand:<sseintvecmode> 2 "register_operand" "v")] UNSPEC_VPERMVAR))] "TARGET_AVX2" "vperm<ssemodesuffix>\t{%1, %2, %0|%0, %2, %1}" [(set_attr "type" "sselog") (set_attr "prefix" "vex") - (set_attr "mode" "OI")]) + (set_attr "mode" "<sseinsnmode>")]) -(define_expand "avx2_perm<mode>" - [(match_operand:VI8F_256 0 "register_operand") - (match_operand:VI8F_256 1 "nonimmediate_operand") +(define_expand "<avx2_avx512f>_perm<mode>" + [(match_operand:VI8F_256_512 0 "register_operand") + (match_operand:VI8F_256_512 1 "nonimmediate_operand") (match_operand:SI 2 
"const_0_to_255_operand")] "TARGET_AVX2" { int mask = INTVAL (operands[2]); - emit_insn (gen_avx2_perm<mode>_1 (operands[0], operands[1], - GEN_INT ((mask >> 0) & 3), - GEN_INT ((mask >> 2) & 3), - GEN_INT ((mask >> 4) & 3), - GEN_INT ((mask >> 6) & 3))); + emit_insn (gen_<avx2_avx512f>_perm<mode>_1 (operands[0], operands[1], + GEN_INT ((mask >> 0) & 3), + GEN_INT ((mask >> 2) & 3), + GEN_INT ((mask >> 4) & 3), + GEN_INT ((mask >> 6) & 3))); DONE; }) -(define_insn "avx2_perm<mode>_1" - [(set (match_operand:VI8F_256 0 "register_operand" "=v") - (vec_select:VI8F_256 - (match_operand:VI8F_256 1 "nonimmediate_operand" "vm") +(define_insn "<avx2_avx512f>_perm<mode>_1" + [(set (match_operand:VI8F_256_512 0 "register_operand" "=v") + (vec_select:VI8F_256_512 + (match_operand:VI8F_256_512 1 "nonimmediate_operand" "vm") (parallel [(match_operand 2 "const_0_to_3_operand") (match_operand 3 "const_0_to_3_operand") (match_operand 4 "const_0_to_3_operand") @@ -10633,6 +11816,62 @@ (set_attr "isa" "*,avx2,noavx2") (set_attr "mode" "V8SF")]) +(define_insn "avx512f_vec_dup<mode>" + [(set (match_operand:VI48F_512 0 "register_operand" "=v") + (vec_duplicate:VI48F_512 + (vec_select:<ssescalarmode> + (match_operand:<ssexmmmode> 1 "nonimmediate_operand" "vm") + (parallel [(const_int 0)]))))] + "TARGET_AVX512F" + "v<sseintprefix>broadcast<bcstscalarsuff>\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + +(define_insn "avx512f_broadcast<mode>" + [(set (match_operand:V16FI 0 "register_operand" "=v,v") + (vec_duplicate:V16FI + (match_operand:<ssexmmmode> 1 "nonimmediate_operand" "v,m")))] + "TARGET_AVX512F" + "@ + vshuf<shuffletype>32x4\t{$0x0, %g1, %g1, %0|%0, %g1, %g1, 0x0} + vbroadcast<shuffletype>32x4\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + +(define_insn "avx512f_broadcast<mode>" + [(set (match_operand:V8FI 0 "register_operand" "=v,v") + (vec_duplicate:V8FI + (match_operand:<ssehalfvecmode> 1 "nonimmediate_operand" "v,m")))] + "TARGET_AVX512F" + "@ + vshuf<shuffletype>64x2\t{$0x44, %g1, %g1, %0|%0, %g1, %g1, 0x44} + vbroadcast<shuffletype>64x4\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + +(define_insn "avx512f_vec_dup_gpr<mode>" + [(set (match_operand:VI48_512 0 "register_operand" "=v") + (vec_duplicate:VI48_512 + (match_operand:<ssescalarmode> 1 "register_operand" "r")))] + "TARGET_AVX512F && (<MODE>mode != V8DImode || TARGET_64BIT)" + "vpbroadcast<bcstscalarsuff>\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + +(define_insn "avx512f_vec_dup_mem<mode>" + [(set (match_operand:VI48F_512 0 "register_operand" "=v") + (vec_duplicate:VI48F_512 + (match_operand:<ssescalarmode> 1 "nonimmediate_operand" "vm")))] + "TARGET_AVX512F" + "v<sseintprefix>broadcast<bcstscalarsuff>\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + (define_insn "avx2_vbroadcasti128_<mode>" [(set (match_operand:VI_256 0 "register_operand" "=x") (vec_concat:VI_256 @@ -10746,7 +11985,7 @@ elt * GET_MODE_SIZE (<ssescalarmode>mode)); }) -(define_expand "avx_vpermil<mode>" +(define_expand "<sse2_avx_avx512f>_vpermil<mode>" [(set (match_operand:VF2 0 "register_operand") (vec_select:VF2 (match_operand:VF2 1 "nonimmediate_operand") @@ -10756,19 +11995,18 @@ int mask = INTVAL (operands[2]); rtx 
perm[<ssescalarnum>]; - perm[0] = GEN_INT (mask & 1); - perm[1] = GEN_INT ((mask >> 1) & 1); - if (<MODE>mode == V4DFmode) + int i; + for (i = 0; i < <ssescalarnum>; i = i + 2) { - perm[2] = GEN_INT (((mask >> 2) & 1) + 2); - perm[3] = GEN_INT (((mask >> 3) & 1) + 2); + perm[i] = GEN_INT (((mask >> i) & 1) + i); + perm[i + 1] = GEN_INT (((mask >> (i + 1)) & 1) + i); } operands[2] = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (<ssescalarnum>, perm)); }) -(define_expand "avx_vpermil<mode>" +(define_expand "<sse2_avx_avx512f>_vpermil<mode>" [(set (match_operand:VF1 0 "register_operand") (vec_select:VF1 (match_operand:VF1 1 "nonimmediate_operand") @@ -10778,23 +12016,20 @@ int mask = INTVAL (operands[2]); rtx perm[<ssescalarnum>]; - perm[0] = GEN_INT (mask & 3); - perm[1] = GEN_INT ((mask >> 2) & 3); - perm[2] = GEN_INT ((mask >> 4) & 3); - perm[3] = GEN_INT ((mask >> 6) & 3); - if (<MODE>mode == V8SFmode) + int i; + for (i = 0; i < <ssescalarnum>; i = i + 4) { - perm[4] = GEN_INT ((mask & 3) + 4); - perm[5] = GEN_INT (((mask >> 2) & 3) + 4); - perm[6] = GEN_INT (((mask >> 4) & 3) + 4); - perm[7] = GEN_INT (((mask >> 6) & 3) + 4); + perm[i] = GEN_INT (((mask >> 0) & 3) + i); + perm[i + 1] = GEN_INT (((mask >> 2) & 3) + i); + perm[i + 2] = GEN_INT (((mask >> 4) & 3) + i); + perm[i + 3] = GEN_INT (((mask >> 6) & 3) + i); } operands[2] = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (<ssescalarnum>, perm)); }) -(define_insn "*avx_vpermilp<mode>" +(define_insn "*<sse2_avx_avx512f>_vpermilp<mode>" [(set (match_operand:VF 0 "register_operand" "=v") (vec_select:VF (match_operand:VF 1 "nonimmediate_operand" "vm") @@ -10811,9 +12046,9 @@ (set_attr "prefix_extra" "1") (set_attr "length_immediate" "1") (set_attr "prefix" "vex") - (set_attr "mode" "<MODE>")]) + (set_attr "mode" "<sseinsnmode>")]) -(define_insn "avx_vpermilvar<mode>3" +(define_insn "<sse2_avx_avx512f>_vpermilvar<mode>3" [(set (match_operand:VF 0 "register_operand" "=v") (unspec:VF [(match_operand:VF 1 "register_operand" "v") @@ -10823,9 +12058,35 @@ "vpermil<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}" [(set_attr "type" "sselog") (set_attr "prefix_extra" "1") - (set_attr "prefix" "vex") (set_attr "btver2_decode" "vector") - (set_attr "mode" "<MODE>")]) + (set_attr "prefix" "vex") + (set_attr "mode" "<sseinsnmode>")]) + +(define_insn "avx512f_vpermi2var<mode>3" + [(set (match_operand:VI48F_512 0 "register_operand" "=v") + (unspec:VI48F_512 + [(match_operand:VI48F_512 1 "register_operand" "v") + (match_operand:<sseintvecmode> 2 "register_operand" "0") + (match_operand:VI48F_512 3 "nonimmediate_operand" "vm")] + UNSPEC_VPERMI2))] + "TARGET_AVX512F" + "vpermi2<ssemodesuffix>\t{%3, %1, %0|%0, %1, %3}" + [(set_attr "type" "sselog") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + +(define_insn "avx512f_vpermt2var<mode>3" + [(set (match_operand:VI48F_512 0 "register_operand" "=v") + (unspec:VI48F_512 + [(match_operand:<sseintvecmode> 1 "register_operand" "v") + (match_operand:VI48F_512 2 "register_operand" "0") + (match_operand:VI48F_512 3 "nonimmediate_operand" "vm")] + UNSPEC_VPERMT2))] + "TARGET_AVX512F" + "vpermt2<ssemodesuffix>\t{%3, %1, %0|%0, %1, %3}" + [(set_attr "type" "sselog") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) (define_expand "avx_vperm2f128<mode>3" [(set (match_operand:AVX256MODE2P 0 "register_operand") @@ -11159,6 +12420,15 @@ DONE; }) +(define_expand "vec_init<mode>" + [(match_operand:VI48F_512 0 "register_operand") + (match_operand 1)] + "TARGET_AVX512F" +{ + ix86_expand_vector_init (false, 
operands[0], operands[1]); + DONE; +}) + (define_expand "avx2_extracti128" [(match_operand:V2DI 0 "nonimmediate_operand") (match_operand:V4DI 1 "register_operand") @@ -11208,31 +12478,36 @@ DONE; }) -(define_insn "avx2_ashrv<mode>" - [(set (match_operand:VI4_AVX2 0 "register_operand" "=v") - (ashiftrt:VI4_AVX2 - (match_operand:VI4_AVX2 1 "register_operand" "v") - (match_operand:VI4_AVX2 2 "nonimmediate_operand" "vm")))] +(define_insn "<avx2_avx512f>_ashrv<mode>" + [(set (match_operand:VI48_AVX512F 0 "register_operand" "=v") + (ashiftrt:VI48_AVX512F + (match_operand:VI48_AVX512F 1 "register_operand" "v") + (match_operand:VI48_AVX512F 2 "nonimmediate_operand" "vm")))] "TARGET_AVX2" - "vpsravd\t{%2, %1, %0|%0, %1, %2}" + "vpsrav<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}" [(set_attr "type" "sseishft") - (set_attr "prefix" "vex") + (set_attr "prefix" "maybe_evex") (set_attr "mode" "<sseinsnmode>")]) -(define_insn "avx2_<shift_insn>v<mode>" - [(set (match_operand:VI48_AVX2 0 "register_operand" "=v") - (any_lshift:VI48_AVX2 - (match_operand:VI48_AVX2 1 "register_operand" "v") - (match_operand:VI48_AVX2 2 "nonimmediate_operand" "vm")))] +(define_insn "<avx2_avx512f>_<shift_insn>v<mode>" + [(set (match_operand:VI48_AVX2_48_AVX512F 0 "register_operand" "=v") + (any_lshift:VI48_AVX2_48_AVX512F + (match_operand:VI48_AVX2_48_AVX512F 1 "register_operand" "v") + (match_operand:VI48_AVX2_48_AVX512F 2 "nonimmediate_operand" "vm")))] "TARGET_AVX2" "vp<vshift>v<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}" [(set_attr "type" "sseishft") - (set_attr "prefix" "vex") + (set_attr "prefix" "maybe_evex") (set_attr "mode" "<sseinsnmode>")]) +;; For avx_vec_concat<mode> insn pattern +(define_mode_attr concat_tg_mode + [(V32QI "t") (V16HI "t") (V8SI "t") (V4DI "t") (V8SF "t") (V4DF "t") + (V64QI "g") (V32HI "g") (V16SI "g") (V8DI "g") (V16SF "g") (V8DF "g")]) + (define_insn "avx_vec_concat<mode>" - [(set (match_operand:V_256 0 "register_operand" "=x,x") - (vec_concat:V_256 + [(set (match_operand:V_256_512 0 "register_operand" "=x,x") + (vec_concat:V_256_512 (match_operand:<ssehalfvecmode> 1 "register_operand" "x,x") (match_operand:<ssehalfvecmode> 2 "vector_move_operand" "xm,C")))] "TARGET_AVX" @@ -11240,16 +12515,24 @@ switch (which_alternative) { case 0: - return "vinsert<i128>\t{$0x1, %2, %t1, %0|%0, %t1, %2, 0x1}"; + return "vinsert<i128>\t{$0x1, %2, %<concat_tg_mode>1, %0|%0, %<concat_tg_mode>1, %2, 0x1}"; case 1: switch (get_attr_mode (insn)) { + case MODE_V16SF: + return "vmovaps\t{%1, %t0|%t0, %1}"; + case MODE_V8DF: + return "vmovapd\t{%1, %t0|%t0, %1}"; case MODE_V8SF: return "vmovaps\t{%1, %x0|%x0, %1}"; case MODE_V4DF: return "vmovapd\t{%1, %x0|%x0, %1}"; - default: + case MODE_XI: + return "vmovdqa\t{%1, %t0|%t0, %1}"; + case MODE_OI: return "vmovdqa\t{%1, %x0|%x0, %1}"; + default: + gcc_unreachable (); } default: gcc_unreachable (); @@ -11258,7 +12541,7 @@ [(set_attr "type" "sselog,ssemov") (set_attr "prefix_extra" "1,*") (set_attr "length_immediate" "1,*") - (set_attr "prefix" "vex") + (set_attr "prefix" "maybe_evex") (set_attr "mode" "<sseinsnmode>")]) (define_insn "vcvtph2ps" @@ -11345,20 +12628,22 @@ (define_mode_iterator VEC_GATHER_MODE [V2DI V2DF V4DI V4DF V4SI V4SF V8SI V8SF]) (define_mode_attr VEC_GATHER_IDXSI - [(V2DI "V4SI") (V2DF "V4SI") - (V4DI "V4SI") (V4DF "V4SI") - (V4SI "V4SI") (V4SF "V4SI") - (V8SI "V8SI") (V8SF "V8SI")]) + [(V2DI "V4SI") (V4DI "V4SI") (V8DI "V8SI") + (V2DF "V4SI") (V4DF "V4SI") (V8DF "V8SI") + (V4SI "V4SI") (V8SI "V8SI") (V16SI "V16SI") + (V4SF "V4SI") (V8SF "V8SI") (V16SF 
"V16SI")]) + (define_mode_attr VEC_GATHER_IDXDI - [(V2DI "V2DI") (V2DF "V2DI") - (V4DI "V4DI") (V4DF "V4DI") - (V4SI "V2DI") (V4SF "V2DI") - (V8SI "V4DI") (V8SF "V4DI")]) + [(V2DI "V2DI") (V4DI "V4DI") (V8DI "V8DI") + (V2DF "V2DI") (V4DF "V4DI") (V8DF "V8DI") + (V4SI "V2DI") (V8SI "V4DI") (V16SI "V8DI") + (V4SF "V2DI") (V8SF "V4DI") (V16SF "V8DI")]) + (define_mode_attr VEC_GATHER_SRCDI - [(V2DI "V2DI") (V2DF "V2DF") - (V4DI "V4DI") (V4DF "V4DF") - (V4SI "V4SI") (V4SF "V4SF") - (V8SI "V4SI") (V8SF "V4SF")]) + [(V2DI "V2DI") (V4DI "V4DI") (V8DI "V8DI") + (V2DF "V2DF") (V4DF "V4DF") (V8DF "V8DF") + (V4SI "V4SI") (V8SI "V4SI") (V16SI "V8SI") + (V4SF "V4SF") (V8SF "V4SF") (V16SF "V8SF")]) (define_expand "avx2_gathersi<mode>" [(parallel [(set (match_operand:VEC_GATHER_MODE 0 "register_operand") @@ -11532,3 +12817,241 @@ [(set_attr "type" "ssemov") (set_attr "prefix" "vex") (set_attr "mode" "<sseinsnmode>")]) + +(define_expand "avx512f_gathersi<mode>" + [(parallel [(set (match_operand:VI48F_512 0 "register_operand") + (unspec:VI48F_512 + [(match_operand:VI48F_512 1 "register_operand") + (match_operand:<avx512fmaskmode> 4 "register_operand") + (mem:<ssescalarmode> + (match_par_dup 6 + [(match_operand 2 "vsib_address_operand") + (match_operand:<VEC_GATHER_IDXSI> 3 "register_operand") + (match_operand:SI 5 "const1248_operand")]))] + UNSPEC_GATHER)) + (clobber (match_scratch:<avx512fmaskmode> 7))])] + "TARGET_AVX512F" +{ + operands[6] + = gen_rtx_UNSPEC (Pmode, gen_rtvec (3, operands[2], operands[3], + operands[5]), UNSPEC_VSIBADDR); +}) + +(define_insn "*avx512f_gathersi<mode>" + [(set (match_operand:VI48F_512 0 "register_operand" "=&v") + (unspec:VI48F_512 + [(match_operand:VI48F_512 1 "register_operand" "0") + (match_operand:<avx512fmaskmode> 7 "register_operand" "2") + (match_operator:<ssescalarmode> 6 "vsib_mem_operator" + [(unspec:P + [(match_operand:P 4 "vsib_address_operand" "p") + (match_operand:<VEC_GATHER_IDXSI> 3 "register_operand" "v") + (match_operand:SI 5 "const1248_operand" "n")] + UNSPEC_VSIBADDR)])] + UNSPEC_GATHER)) + (clobber (match_scratch:<avx512fmaskmode> 2 "=&k"))] + "TARGET_AVX512F" + "v<sseintprefix>gatherd<ssemodesuffix>\t{%6, %0%{%2%}|%0%{%2%}, %g6}" + [(set_attr "type" "ssemov") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + +(define_insn "*avx512f_gathersi<mode>_2" + [(set (match_operand:VI48F_512 0 "register_operand" "=&v") + (unspec:VI48F_512 + [(pc) + (match_operand:<avx512fmaskmode> 6 "register_operand" "1") + (match_operator:<ssescalarmode> 5 "vsib_mem_operator" + [(unspec:P + [(match_operand:P 3 "vsib_address_operand" "p") + (match_operand:<VEC_GATHER_IDXSI> 2 "register_operand" "v") + (match_operand:SI 4 "const1248_operand" "n")] + UNSPEC_VSIBADDR)])] + UNSPEC_GATHER)) + (clobber (match_scratch:<avx512fmaskmode> 1 "=&k"))] + "TARGET_AVX512F" + "v<sseintprefix>gatherd<ssemodesuffix>\t{%5, %0%{%1%}|%0%{%1%}, %g5}" + [(set_attr "type" "ssemov") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + + +(define_expand "avx512f_gatherdi<mode>" + [(parallel [(set (match_operand:VI48F_512 0 "register_operand") + (unspec:VI48F_512 + [(match_operand:<VEC_GATHER_SRCDI> 1 "register_operand") + (match_operand:QI 4 "register_operand") + (mem:<ssescalarmode> + (match_par_dup 6 + [(match_operand 2 "vsib_address_operand") + (match_operand:<VEC_GATHER_IDXDI> 3 "register_operand") + (match_operand:SI 5 "const1248_operand")]))] + UNSPEC_GATHER)) + (clobber (match_scratch:QI 7))])] + "TARGET_AVX512F" +{ + operands[6] + = gen_rtx_UNSPEC (Pmode, 
gen_rtvec (3, operands[2], operands[3], + operands[5]), UNSPEC_VSIBADDR); +}) + +(define_insn "*avx512f_gatherdi<mode>" + [(set (match_operand:VI48F_512 0 "register_operand" "=&v") + (unspec:VI48F_512 + [(match_operand:<VEC_GATHER_SRCDI> 1 "register_operand" "0") + (match_operand:QI 7 "register_operand" "2") + (match_operator:<ssescalarmode> 6 "vsib_mem_operator" + [(unspec:P + [(match_operand:P 4 "vsib_address_operand" "p") + (match_operand:<VEC_GATHER_IDXDI> 3 "register_operand" "v") + (match_operand:SI 5 "const1248_operand" "n")] + UNSPEC_VSIBADDR)])] + UNSPEC_GATHER)) + (clobber (match_scratch:QI 2 "=&k"))] + "TARGET_AVX512F" + "v<sseintprefix>gatherq<ssemodesuffix>\t{%6, %1%{%2%}|%1%{%2%}, %g6}" + [(set_attr "type" "ssemov") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + +(define_insn "*avx512f_gatherdi<mode>_2" + [(set (match_operand:VI48F_512 0 "register_operand" "=&v") + (unspec:VI48F_512 + [(pc) + (match_operand:QI 6 "register_operand" "1") + (match_operator:<ssescalarmode> 5 "vsib_mem_operator" + [(unspec:P + [(match_operand:P 3 "vsib_address_operand" "p") + (match_operand:<VEC_GATHER_IDXDI> 2 "register_operand" "v") + (match_operand:SI 4 "const1248_operand" "n")] + UNSPEC_VSIBADDR)])] + UNSPEC_GATHER)) + (clobber (match_scratch:QI 1 "=&k"))] + "TARGET_AVX512F" +{ + if (<MODE>mode != <VEC_GATHER_SRCDI>mode) + return "v<sseintprefix>gatherq<ssemodesuffix>\t{%5, %t0%{%1%}|%t0%{%1%}, %g5}"; + return "v<sseintprefix>gatherq<ssemodesuffix>\t{%5, %0%{%1%}|%0%{%1%}, %g5}"; +} + [(set_attr "type" "ssemov") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + +(define_expand "avx512f_scattersi<mode>" + [(parallel [(set (mem:VI48F_512 + (match_par_dup 5 + [(match_operand 0 "vsib_address_operand") + (match_operand:<VEC_GATHER_IDXSI> 2 "register_operand") + (match_operand:SI 4 "const1248_operand")])) + (unspec:VI48F_512 + [(match_operand:<avx512fmaskmode> 1 "register_operand") + (match_operand:VI48F_512 3 "register_operand")] + UNSPEC_SCATTER)) + (clobber (match_scratch:<avx512fmaskmode> 6))])] + "TARGET_AVX512F" +{ + operands[5] + = gen_rtx_UNSPEC (Pmode, gen_rtvec (3, operands[0], operands[2], + operands[4]), UNSPEC_VSIBADDR); +}) + +(define_insn "*avx512f_scattersi<mode>" + [(set (match_operator:VI48F_512 5 "vsib_mem_operator" + [(unspec:P + [(match_operand:P 0 "vsib_address_operand" "p") + (match_operand:<VEC_GATHER_IDXSI> 2 "register_operand" "v") + (match_operand:SI 4 "const1248_operand" "n")] + UNSPEC_VSIBADDR)]) + (unspec:VI48F_512 + [(match_operand:<avx512fmaskmode> 6 "register_operand" "1") + (match_operand:VI48F_512 3 "register_operand" "v")] + UNSPEC_SCATTER)) + (clobber (match_scratch:<avx512fmaskmode> 1 "=&k"))] + "TARGET_AVX512F" + "v<sseintprefix>scatterd<ssemodesuffix>\t{%3, %5%{%1%}|%5%{%1%}, %3}" + [(set_attr "type" "ssemov") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + +(define_expand "avx512f_scatterdi<mode>" + [(parallel [(set (mem:VI48F_512 + (match_par_dup 5 + [(match_operand 0 "vsib_address_operand") + (match_operand:V8DI 2 "register_operand") + (match_operand:SI 4 "const1248_operand")])) + (unspec:VI48F_512 + [(match_operand:QI 1 "register_operand") + (match_operand:<VEC_GATHER_SRCDI> 3 "register_operand")] + UNSPEC_SCATTER)) + (clobber (match_scratch:QI 6))])] + "TARGET_AVX512F" +{ + operands[5] + = gen_rtx_UNSPEC (Pmode, gen_rtvec (3, operands[0], operands[2], + operands[4]), UNSPEC_VSIBADDR); +}) + +(define_insn "*avx512f_scatterdi<mode>" + [(set (match_operator:VI48F_512 5 "vsib_mem_operator" + 
[(unspec:P
+	     [(match_operand:P 0 "vsib_address_operand" "p")
+	      (match_operand:V8DI 2 "register_operand" "v")
+	      (match_operand:SI 4 "const1248_operand" "n")]
+	     UNSPEC_VSIBADDR)])
+	(unspec:VI48F_512
+	  [(match_operand:QI 6 "register_operand" "1")
+	   (match_operand:<VEC_GATHER_SRCDI> 3 "register_operand" "v")]
+	  UNSPEC_SCATTER))
+   (clobber (match_scratch:QI 1 "=&k"))]
+  "TARGET_AVX512F"
+  "v<sseintprefix>scatterq<ssemodesuffix>\t{%3, %5%{%1%}|%5%{%1%}, %3}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "avx512f_getmant<mode>"
+  [(set (match_operand:VF_512 0 "register_operand" "=v")
+	(unspec:VF_512
+	  [(match_operand:VF_512 1 "nonimmediate_operand" "vm")
+	   (match_operand:SI 2 "const_0_to_15_operand")]
+	  UNSPEC_GETMANT))]
+  "TARGET_AVX512F"
+  "vgetmant<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "prefix" "evex")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "avx512f_getmant<mode>"
+  [(set (match_operand:VF_128 0 "register_operand" "=v")
+	(vec_merge:VF_128
+	  (unspec:VF_128
+	    [(match_operand:VF_128 1 "register_operand" "v")
+	     (match_operand:VF_128 2 "nonimmediate_operand" "vm")
+	     (match_operand:SI 3 "const_0_to_15_operand")]
+	    UNSPEC_GETMANT)
+	  (match_dup 1)
+	  (const_int 1)))]
+  "TARGET_AVX512F"
+  "vgetmant<ssescalarmodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "prefix" "evex")
+   (set_attr "mode" "<ssescalarmode>")])
+
+(define_insn "clz<mode>2"
+  [(set (match_operand:VI48_512 0 "register_operand" "=v")
+	(clz:VI48_512
+	  (match_operand:VI48_512 1 "nonimmediate_operand" "vm")))]
+  "TARGET_AVX512CD"
+  "vplzcnt<ssemodesuffix>\t{%1, %0|%0, %1}"
+  [(set_attr "type" "sse")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "conflict<mode>"
+  [(set (match_operand:VI48_512 0 "register_operand" "=v")
+	(unspec:VI48_512
+	  [(match_operand:VI48_512 1 "nonimmediate_operand" "vm")]
+	  UNSPEC_CONFLICT))]
+  "TARGET_AVX512CD"
+  "vpconflict<ssemodesuffix>\t{%1, %0|%0, %1}"
+  [(set_attr "type" "sse")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
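The two AVX512CD patterns above back the conflict-detection and leading-zero-count builtins. A minimal sketch of how they surface at the source level, using the corresponding intrinsics (not part of this patch; the function name is illustrative and -mavx512cd is assumed):

#include <immintrin.h>

/* vpconflictd gives each lane a bitmask of earlier lanes that hold the
   same value; vplzcntd then counts leading zero bits per lane.  */
__m512i
conflict_lzcnt_demo (__m512i indices)
{
  __m512i conflicts = _mm512_conflict_epi32 (indices);
  return _mm512_lzcnt_epi32 (conflicts);
}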
diff --git a/gcc/config/i386/t-i386 b/gcc/config/i386/t-i386
index 07624cc575e..1a76c4152f6 100644
--- a/gcc/config/i386/t-i386
+++ b/gcc/config/i386/t-i386
@@ -16,22 +16,9 @@
 # along with GCC; see the file COPYING3.  If not see
 # <http://www.gnu.org/licenses/>.
 
-i386.o: $(CONFIG_H) $(SYSTEM_H) coretypes.h dumpfile.h $(TM_H) \
-  $(RTL_H) $(TREE_H) $(TM_P_H) $(REGS_H) hard-reg-set.h \
-  $(REAL_H) insn-config.h conditions.h output.h insn-codes.h \
-  $(INSN_ATTR_H) $(FLAGS_H) $(C_COMMON_H) except.h $(FUNCTION_H) \
-  $(RECOG_H) $(EXPR_H) $(OPTABS_H) toplev.h $(BASIC_BLOCK_H) \
-  $(GGC_H) $(TARGET_H) $(TARGET_DEF_H) langhooks.h $(CGRAPH_H) \
-  $(TREE_GIMPLE_H) $(DWARF2_H) $(DF_H) tm-constrs.h $(PARAMS_H) \
-  i386-builtin-types.inc debug.h dwarf2out.h sbitmap.h $(FIBHEAP_H) \
-  $(OPTS_H) $(DIAGNOSTIC_H) $(COMMON_TARGET_H) $(CONTEXT_H) $(PASS_MANAGER_H)
-
-i386-c.o: $(srcdir)/config/i386/i386-c.c \
-  $(srcdir)/config/i386/i386-protos.h $(CONFIG_H) $(SYSTEM_H) coretypes.h \
-  $(TM_H) $(TREE_H) $(TM_P_H) $(FLAGS_H) $(C_COMMON_H) $(GGC_H) \
-  $(TARGET_H) $(TARGET_DEF_H) $(CPPLIB_H) $(C_PRAGMA_H)
-	$(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
-		$(srcdir)/config/i386/i386-c.c
+i386-c.o: $(srcdir)/config/i386/i386-c.c i386-builtin-types.inc
+	$(COMPILE) $<
+	$(POSTCOMPILE)
 
 i386-builtin-types.inc: s-i386-bt ; @true
diff --git a/gcc/config/i386/winnt.c b/gcc/config/i386/winnt.c
index 8cf6b3c527a..58e95a3790b 100644
--- a/gcc/config/i386/winnt.c
+++ b/gcc/config/i386/winnt.c
@@ -1178,10 +1178,10 @@ i386_pe_seh_unwind_emit (FILE *asm_out_file, rtx insn)
 
   for (note = REG_NOTES (insn); note ; note = XEXP (note, 1))
     {
-      pat = XEXP (note, 0);
       switch (REG_NOTE_KIND (note))
 	{
 	case REG_FRAME_RELATED_EXPR:
+	  pat = XEXP (note, 0);
 	  goto found;
 
 	case REG_CFA_DEF_CFA:
@@ -1195,6 +1195,7 @@ i386_pe_seh_unwind_emit (FILE *asm_out_file, rtx insn)
 	  gcc_unreachable ();
 
 	case REG_CFA_ADJUST_CFA:
+	  pat = XEXP (note, 0);
 	  if (pat == NULL)
 	    {
 	      pat = PATTERN (insn);
@@ -1206,6 +1207,7 @@ i386_pe_seh_unwind_emit (FILE *asm_out_file, rtx insn)
 	  break;
 
 	case REG_CFA_OFFSET:
+	  pat = XEXP (note, 0);
 	  if (pat == NULL)
 	    pat = single_set (insn);
 	  seh_cfa_offset (asm_out_file, seh, pat);
diff --git a/gcc/config/i386/x-darwin b/gcc/config/i386/x-darwin
index f0196bac41d..4967d695ce9 100644
--- a/gcc/config/i386/x-darwin
+++ b/gcc/config/i386/x-darwin
@@ -1,4 +1,3 @@
-host-i386-darwin.o : $(srcdir)/config/i386/host-i386-darwin.c \
-  $(CONFIG_H) $(SYSTEM_H) coretypes.h hosthooks.h $(HOSTHOOKS_DEF_H) \
-  config/host-darwin.h
-	$(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) $<
+host-i386-darwin.o : $(srcdir)/config/i386/host-i386-darwin.c
+	$(COMPILE) $<
+	$(POSTCOMPILE)
diff --git a/gcc/config/i386/x-i386 b/gcc/config/i386/x-i386
index 2bf8fed5db5..1f3db1d19cf 100644
--- a/gcc/config/i386/x-i386
+++ b/gcc/config/i386/x-i386
@@ -1,4 +1,3 @@
-driver-i386.o : $(srcdir)/config/i386/driver-i386.c \
-  $(srcdir)/config/i386/cpuid.h \
-  $(CONFIG_H) $(SYSTEM_H) $(TM_H) coretypes.h
-	$(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) $<
+driver-i386.o : $(srcdir)/config/i386/driver-i386.c
+	$(COMPILE) $<
+	$(POSTCOMPILE)
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index fc19df19d79..4d238af212d 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -18,213 +18,412 @@
 a copy of the GCC Runtime Library Exception along with this
 program; see the files COPYING3 and COPYING.RUNTIME respectively.
 If not, see <http://www.gnu.org/licenses/>.  */
 
-/* X86_TUNE_USE_LEAVE: Leave does not affect Nocona SPEC2000 results
-   negatively, so enabling for Generic64 seems like good code size
-   tradeoff.  We can't enable it for 32bit generic because it does not
-   work well with PPro base chips. */
+/* X86_TUNE_USE_LEAVE: Use "leave" instruction in epilogues where it fits. */
 DEF_TUNE (X86_TUNE_USE_LEAVE, "use_leave",
          m_386 | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC)
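For context on how the DEF_TUNE entries below are consumed: i386.h and i386.c include x86-tune.def twice with different definitions of DEF_TUNE, once to build the feature enum and once to build the per-CPU mask table. A rough sketch, paraphrasing the surrounding sources rather than quoting this patch:

/* Each DEF_TUNE (tune, name, selector) expands to an enum value... */
enum ix86_tune_indices {
#define DEF_TUNE(tune, name, selector) tune,
#include "x86-tune.def"
#undef DEF_TUNE
  X86_TUNE_LAST
};

/* ...and to a processor-mask initializer; the masks are then tested
   against the -mtune processor to fill ix86_tune_features[], which the
   TARGET_* tuning macros read.  */
static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
#define DEF_TUNE(tune, name, selector) selector,
#include "x86-tune.def"
#undef DEF_TUNE
};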
+
+/* X86_TUNE_PUSH_MEMORY: Enable generation of "push mem" instructions.
+   Some chips, like the 486 and Pentium, work faster with separate load
+   and push instructions. */
 DEF_TUNE (X86_TUNE_PUSH_MEMORY, "push_memory",
          m_386 | m_P4_NOCONA | m_CORE_ALL | m_K6_GEODE
          | m_AMD_MULTIPLE | m_GENERIC)
+
+/* X86_TUNE_ZERO_EXTEND_WITH_AND: Use AND instruction instead
+   of movzbl/movzwl. */
 DEF_TUNE (X86_TUNE_ZERO_EXTEND_WITH_AND, "zero_extend_with_and", m_486 | m_PENT)
+
+/* X86_TUNE_UNROLL_STRLEN: Produce (quite lame) unrolled sequence for
+   inline strlen.  This affects only -minline-all-stringops mode.  By
+   default we always dispatch to a library since our internal strlen
+   is bad. */
 DEF_TUNE (X86_TUNE_UNROLL_STRLEN, "unroll_strlen",
          m_486 | m_PENT | m_PPRO | m_ATOM | m_SLM | m_CORE_ALL | m_K6
          | m_AMD_MULTIPLE | m_GENERIC)
+
 /* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put in P4 based
    on simulation result.  But after P4 was made, no performance benefit
    was observed with branch hints.  It also increases the code size.
    As a result, icc never generates branch hints. */
 DEF_TUNE (X86_TUNE_BRANCH_PREDICTION_HINTS, "branch_prediction_hints", 0)
+
+/* X86_TUNE_DOUBLE_WITH_ADD: Use add instead of sal to double value in
+   an integer register. */
 DEF_TUNE (X86_TUNE_DOUBLE_WITH_ADD, "double_with_add", ~m_386)
+
+/* X86_TUNE_USE_SAHF: Controls use of SAHF. */
 DEF_TUNE (X86_TUNE_USE_SAHF, "use_sahf",
          m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_K6_GEODE
          | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC)
+
 /* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid
    partial dependencies. */
 DEF_TUNE (X86_TUNE_MOVX, "movx",
          m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_GEODE
          | m_AMD_MULTIPLE | m_GENERIC)
-/* X86_TUNE_PARTIAL_REG_STALL: We probably ought to watch for partial
-   register stalls on Generic32 compilation setting as well.  However
-   in current implementation the partial register stalls are not eliminated
+
+/* X86_TUNE_PARTIAL_REG_STALL: Pentium Pro, unlike later chips, handled
+   use of partial registers by renaming.  This improved performance of 16bit
+   code where upper halves of registers are not used.  It also leads to
+   a penalty whenever a 16bit store is followed by a 32bit use.  This flag
+   disables production of such sequences in common cases.
+   See also X86_TUNE_HIMODE_MATH.
+
+   In the current implementation the partial register stalls are not eliminated
    very well - they can be introduced via subregs synthesized by combine
    and can happen in caller/callee saving sequences. */
 DEF_TUNE (X86_TUNE_PARTIAL_REG_STALL, "partial_reg_stall", m_PPRO)
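The sequence X86_TUNE_PARTIAL_REG_STALL avoids is easy to reproduce from C. A hypothetical illustration (not from the patch):

/* On PPro-class chips, a 16-bit store into %ax followed by a 32-bit
   read of %eax stalls until the partial write retires, so this tuning
   makes the compiler prefer full-width operations here.  */
unsigned int
merge_low_half (unsigned int x, unsigned short y)
{
  x = (x & 0xffff0000u) | y;   /* candidate for a 16bit "movw" */
  return x + 1;                /* 32bit use of the same register */
}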
+
+/* X86_TUNE_PARTIAL_FLAG_REG_STALL: This flag disables use of flags
+   set by instructions affecting just some flags (in particular shifts).
+   This is because Core2 resolves dependencies on the whole flags register
+   and such sequences introduce a false dependency on the previous instruction
+   setting full flags.
+
+   This flag does not affect generation of INC and DEC, which is controlled
+   by X86_TUNE_USE_INCDEC.
+
+   This flag may be dropped from generic once core2-corei5 machines are
+   rare enough. */
 DEF_TUNE (X86_TUNE_PARTIAL_FLAG_REG_STALL, "partial_flag_reg_stall",
-          m_CORE_ALL | m_GENERIC)
+          m_CORE2 | m_GENERIC)
+
 /* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall
-   * on 16-bit immediate moves into memory on Core2 and Corei7. */
+   on 16-bit immediate moves into memory on Core2 and Corei7. */
 DEF_TUNE (X86_TUNE_LCP_STALL, "lcp_stall", m_CORE_ALL | m_GENERIC)
+
+/* X86_TUNE_USE_HIMODE_FIOP: Enables use of x87 instructions with 16bit
+   integer operand.
+   FIXME: Why is this disabled for modern chips?  */
 DEF_TUNE (X86_TUNE_USE_HIMODE_FIOP, "use_himode_fiop",
          m_386 | m_486 | m_K6_GEODE)
+
+/* X86_TUNE_USE_SIMODE_FIOP: Enables use of x87 instructions with 32bit
+   integer operand. */
 DEF_TUNE (X86_TUNE_USE_SIMODE_FIOP, "use_simode_fiop",
          ~(m_PENT | m_PPRO | m_CORE_ALL | m_ATOM
            | m_SLM | m_AMD_MULTIPLE | m_GENERIC))
+
+/* X86_TUNE_USE_MOV0: Use "mov $0, reg" instead of "xor reg, reg" to clear
+   an integer register. */
 DEF_TUNE (X86_TUNE_USE_MOV0, "use_mov0", m_K6)
+
+/* X86_TUNE_USE_CLTD: Controls use of the CLTD and CQTO instructions. */
 DEF_TUNE (X86_TUNE_USE_CLTD, "use_cltd", ~(m_PENT | m_ATOM | m_SLM | m_K6))
+
 /* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx. */
 DEF_TUNE (X86_TUNE_USE_XCHGB, "use_xchgb", m_PENT4)
+
+/* X86_TUNE_SPLIT_LONG_MOVES: Avoid instructions moving immediates
+   directly to memory. */
 DEF_TUNE (X86_TUNE_SPLIT_LONG_MOVES, "split_long_moves", m_PPRO)
+
+/* X86_TUNE_READ_MODIFY_WRITE: Enable use of read-modify-write instructions
+   such as "add $1, mem". */
 DEF_TUNE (X86_TUNE_READ_MODIFY_WRITE, "read_modify_write", ~m_PENT)
+
+/* X86_TUNE_READ_MODIFY: Enable use of read-modify instructions such
+   as "add mem, reg". */
 DEF_TUNE (X86_TUNE_READ_MODIFY, "read_modify", ~(m_PENT | m_PPRO))
+
+/* X86_TUNE_PROMOTE_QIMODE: When it is cheap, turn 8bit arithmetic into
+   the corresponding 32bit arithmetic. */
 DEF_TUNE (X86_TUNE_PROMOTE_QIMODE, "promote_qimode",
          m_386 | m_486 | m_PENT | m_CORE_ALL | m_ATOM | m_SLM | m_K6_GEODE
          | m_AMD_MULTIPLE | m_GENERIC)
+
+/* X86_TUNE_FAST_PREFIX: Enable demoting some 32bit or 64bit arithmetic
+   into 16bit/8bit when the resulting sequence is shorter.  For example,
+   "and $-65536, reg" becomes a 16bit store of 0. */
 DEF_TUNE (X86_TUNE_FAST_PREFIX, "fast_prefix", ~(m_386 | m_486 | m_PENT))
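The X86_TUNE_FAST_PREFIX example is easy to verify: -65536 is 0xffff0000, so the AND only clears the low 16 bits, and a 16-bit store of zero has the same effect. A hypothetical illustration (not from the patch):

/* "andl $-65536, (%reg)" clears the low word, so with fast_prefix the
   compiler may emit "movw $0, (%reg)" instead, trading the 32-bit AND
   for a shorter 16-bit (operand-size-prefixed) store.  */
void
clear_low_word (unsigned int *p)
{
  *p &= 0xffff0000u;   /* same as *p &= -65536 */
}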
+
+/* X86_TUNE_SINGLE_STRINGOP: Enable use of single string operations, such
+   as MOVS and STOS (without a REP prefix) to move/set sequences of bytes. */
 DEF_TUNE (X86_TUNE_SINGLE_STRINGOP, "single_stringop", m_386 | m_P4_NOCONA)
+
+/* X86_TUNE_QIMODE_MATH: Enable use of 8bit arithmetic. */
 DEF_TUNE (X86_TUNE_QIMODE_MATH, "qimode_math", ~0)
-/* X86_TUNE_HIMODE_MATH: On PPro this flag is meant to avoid partial
-   register stalls.  Just like X86_TUNE_PARTIAL_REG_STALL this option
-   might be considered for Generic32 if our scheme for avoiding partial
-   stalls was more effective. */
+
+/* X86_TUNE_HIMODE_MATH: Enable use of 16bit arithmetic.
+   On PPro this flag is meant to avoid partial register stalls. */
 DEF_TUNE (X86_TUNE_HIMODE_MATH, "himode_math", ~m_PPRO)
+
+/* X86_TUNE_PROMOTE_QI_REGS: This enables generic code that promotes all 8bit
+   arithmetic to 32bit via the PROMOTE_MODE macro.  This code generation scheme
+   is usually used for RISC targets. */
 DEF_TUNE (X86_TUNE_PROMOTE_QI_REGS, "promote_qi_regs", 0)
+
+/* X86_TUNE_PROMOTE_HI_REGS: Same, but for 16bit arithmetic.  Again we avoid
+   partial register stalls on PentiumPro targets. */
 DEF_TUNE (X86_TUNE_PROMOTE_HI_REGS, "promote_hi_regs", m_PPRO)
+
 /* X86_TUNE_SINGLE_POP: Enable if single pop insn is preferred over esp
    addition. */
 DEF_TUNE (X86_TUNE_SINGLE_POP, "single_pop", m_386 | m_486 | m_PENT | m_PPRO)
+
 /* X86_TUNE_DOUBLE_POP: Enable if double pop insn is preferred over esp
    addition. */
 DEF_TUNE (X86_TUNE_DOUBLE_POP, "double_pop", m_PENT)
+
 /* X86_TUNE_SINGLE_PUSH: Enable if single push insn is preferred over esp
    subtraction. */
 DEF_TUNE (X86_TUNE_SINGLE_PUSH, "single_push", m_386 | m_486 | m_PENT
          | m_K6_GEODE)
+
 /* X86_TUNE_DOUBLE_PUSH. Enable if double push insn is preferred over esp
    subtraction. */
 DEF_TUNE (X86_TUNE_DOUBLE_PUSH, "double_push", m_PENT | m_K6_GEODE)
+
 /* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
    for DFmode copies */
 DEF_TUNE (X86_TUNE_INTEGER_DFMODE_MOVES, "integer_dfmode_moves",
          ~(m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM
            | m_GEODE | m_AMD_MULTIPLE | m_GENERIC))
+
+/* X86_TUNE_PARTIAL_REG_DEPENDENCY: Enable more register renaming
+   on modern chips.  Prefer stores affecting the whole integer register
+   over partial stores.  For example, prefer MOVZBL or MOVQ over MOVB
+   to load an 8bit value. */
 DEF_TUNE (X86_TUNE_PARTIAL_REG_DEPENDENCY, "partial_reg_dependency",
          m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_AMD_MULTIPLE
          | m_GENERIC)
-/* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: In the Generic model we have a
-   conflict here in between PPro/Pentium4 based chips that thread 128bit
-   SSE registers as single units versus K8 based chips that divide SSE
-   registers to two 64bit halves.  This knob promotes all store destinations
-   to be 128bit to allow register renaming on 128bit SSE units, but usually
-   results in one extra microop on 64bit SSE units.  Experimental results
-   shows that disabling this option on P4 brings over 20% SPECfp regression,
-   while enabling it on K8 brings roughly 2.4% regression that can be partly
-   masked by careful scheduling of moves. */
+
+/* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: This knob promotes all store
+   destinations to be 128bit to allow register renaming on 128bit SSE units,
+   but usually results in one extra microop on 64bit SSE units.
+   Experimental results show that disabling this option on P4 brings over 20%
+   SPECfp regression, while enabling it on K8 brings roughly 2.4% regression
+   that can be partly masked by careful scheduling of moves. */
 DEF_TUNE (X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY, "sse_partial_reg_dependency",
          m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_AMDFAM10
          | m_BDVER | m_GENERIC)
+
+/* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL: Use movups for misaligned loads instead
+   of a sequence loading the register by parts. */
 DEF_TUNE (X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL, "sse_unaligned_load_optimal",
-          m_COREI7 | m_AMDFAM10 | m_BDVER | m_BTVER | m_SLM)
+          m_COREI7 | m_AMDFAM10 | m_BDVER | m_BTVER | m_SLM | m_GENERIC)
+
+/* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL: Use movups for misaligned stores instead
+   of a sequence storing the register by parts. */
 DEF_TUNE (X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL, "sse_unaligned_store_optimal",
-          m_COREI7 | m_BDVER | m_SLM)
+          m_COREI7 | m_BDVER | m_SLM | m_GENERIC)
+
+/* X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL: if false, unaligned loads are
+   split. */
+DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL, "256_unaligned_load_optimal",
+          ~(m_COREI7 | m_GENERIC))
+
+/* X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL: if false, unaligned stores are
+   split. */
+DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL, "256_unaligned_store_optimal",
+          ~(m_COREI7 | m_BDVER | m_GENERIC))
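When X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL is false, a 32-byte unaligned load is split into a 16-byte load plus vinsertf128. In intrinsics terms the split is roughly equivalent to the following sketch (illustrative only; the function name is made up):

#include <immintrin.h>

/* What the splitter produces instead of a single 256-bit vmovups:
   a 128-bit unaligned load of the low half, then vinsertf128 to fill
   the high half.  */
__m256
split_unaligned_load (const float *p)
{
  __m256 lo = _mm256_castps128_ps256 (_mm_loadu_ps (p));
  return _mm256_insertf128_ps (lo, _mm_loadu_ps (p + 4), 1);
}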
+
+/* X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL: Use packed single precision
+   instructions where possible, i.e. movups instead of movupd. */
 DEF_TUNE (X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL,
          "sse_packed_single_insn_optimal", m_BDVER)
+
 /* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
    are resolved on SSE register parts instead of whole registers, so we may
    maintain just lower part of scalar values in proper format leaving the
    upper part undefined. */
 DEF_TUNE (X86_TUNE_SSE_SPLIT_REGS, "sse_split_regs", m_ATHLON_K8)
-DEF_TUNE (X86_TUNE_SSE_TYPELESS_STORES, "sse_typeless_stores", m_AMD_MULTIPLE)
-DEF_TUNE (X86_TUNE_SSE_LOAD0_BY_PXOR, "sse_load0_by_pxor", m_PPRO | m_P4_NOCONA)
+
+/* X86_TUNE_SSE_TYPELESS_STORES: Always use movaps/movups for 128bit stores. */
+DEF_TUNE (X86_TUNE_SSE_TYPELESS_STORES, "sse_typeless_stores",
+          m_AMD_MULTIPLE | m_CORE_ALL | m_GENERIC)
+
+/* X86_TUNE_SSE_LOAD0_BY_PXOR: Always use pxor to load0 as opposed to
+   xorps/xorpd and other variants. */
+DEF_TUNE (X86_TUNE_SSE_LOAD0_BY_PXOR, "sse_load0_by_pxor",
+          m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BDVER | m_BTVER | m_GENERIC)
+
+/* X86_TUNE_MEMORY_MISMATCH_STALL: Avoid partial stores that are followed by
+   full sized loads. */
 DEF_TUNE (X86_TUNE_MEMORY_MISMATCH_STALL, "memory_mismatch_stall",
          m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_AMD_MULTIPLE | m_GENERIC)
+
+/* X86_TUNE_PROLOGUE_USING_MOVE: Do not use push/pop in prologues that are
+   considered on critical path. */
 DEF_TUNE (X86_TUNE_PROLOGUE_USING_MOVE, "prologue_using_move",
          m_PPRO | m_ATHLON_K8)
+
+/* X86_TUNE_EPILOGUE_USING_MOVE: Do not use push/pop in epilogues that are
+   considered on critical path. */
 DEF_TUNE (X86_TUNE_EPILOGUE_USING_MOVE, "epilogue_using_move",
          m_PPRO | m_ATHLON_K8)
+
+/* X86_TUNE_SHIFT1: Enables use of the short encoding of "sal reg" instead of
+   the longer "sal $1, reg". */
 DEF_TUNE (X86_TUNE_SHIFT1, "shift1", ~m_486)
+
+/* X86_TUNE_USE_FFREEP: Use the ffreep instruction instead of fstp. */
 DEF_TUNE (X86_TUNE_USE_FFREEP, "use_ffreep", m_AMD_MULTIPLE)
+
+/* X86_TUNE_INTER_UNIT_MOVES_TO_VEC: Enable moves from integer
+   to SSE registers.  If disabled, the moves will be done by storing
+   the value to memory and reloading. */
 DEF_TUNE (X86_TUNE_INTER_UNIT_MOVES_TO_VEC, "inter_unit_moves_to_vec",
          ~(m_AMD_MULTIPLE | m_GENERIC))
+
+/* X86_TUNE_INTER_UNIT_MOVES_FROM_VEC: Enable moves from SSE
+   to integer registers.  If disabled, the moves will be done by storing
+   the value to memory and reloading. */
 DEF_TUNE (X86_TUNE_INTER_UNIT_MOVES_FROM_VEC, "inter_unit_moves_from_vec",
          ~m_ATHLON_K8)
+
+/* X86_TUNE_INTER_UNIT_CONVERSIONS: Enable float<->integer conversions
+   to use both SSE and integer registers at the same time.
+   FIXME: revisit importance of this for generic. */
 DEF_TUNE (X86_TUNE_INTER_UNIT_CONVERSIONS, "inter_unit_conversions",
-          ~(m_AMDFAM10 | m_BDVER ))
+          ~(m_AMDFAM10 | m_BDVER))
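A conversion like the one below is where the inter-unit tuning flags above matter; illustrative only, not taken from the patch:

/* With inter_unit_moves_to_vec enabled, the integer argument can be
   transferred to the SSE unit directly (movd followed by a cvtsi2sd-style
   conversion); with it disabled, the value is stored to the stack and
   reloaded into an SSE register instead.  */
double
int_to_double (int i)
{
  return (double) i;
}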
+
 /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
    than 4 branch instructions in the 16 byte window. */
 DEF_TUNE (X86_TUNE_FOUR_JUMP_LIMIT, "four_jump_limit",
-          m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM| m_AMD_MULTIPLE
-          | m_GENERIC)
+          m_PPRO | m_P4_NOCONA | m_ATOM | m_SLM | m_ATHLON_K8 | m_AMDFAM10)
+
+/* X86_TUNE_SCHEDULE: Enable scheduling. */
 DEF_TUNE (X86_TUNE_SCHEDULE, "schedule",
          m_PENT | m_PPRO | m_CORE_ALL | m_ATOM | m_SLM | m_K6_GEODE
          | m_AMD_MULTIPLE | m_GENERIC)
+
+/* X86_TUNE_USE_BT: Enable use of BT (bit test) instructions. */
 DEF_TUNE (X86_TUNE_USE_BT, "use_bt",
          m_CORE_ALL | m_ATOM | m_SLM | m_AMD_MULTIPLE | m_GENERIC)
+
+/* X86_TUNE_USE_INCDEC: Enable use of inc/dec instructions. */
 DEF_TUNE (X86_TUNE_USE_INCDEC, "use_incdec",
          ~(m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_GENERIC))
+
+/* X86_TUNE_PAD_RETURNS: Place a NOP before every RET that is a destination
+   of a conditional jump or is directly preceded by another jump instruction.
+   This is important for AMD K8-AMDFAM10 because the branch prediction
+   architecture expects at most one jump per 2 byte window.  Failing to
+   pad returns leads to a misaligned return stack. */
 DEF_TUNE (X86_TUNE_PAD_RETURNS, "pad_returns",
-          m_CORE_ALL | m_AMD_MULTIPLE | m_GENERIC)
+          m_ATHLON_K8 | m_AMDFAM10 | m_GENERIC)
+
+/* X86_TUNE_PAD_SHORT_FUNCTION: Make every function at least 4
+   instructions long. */
 DEF_TUNE (X86_TUNE_PAD_SHORT_FUNCTION, "pad_short_function", m_ATOM)
+
+/* X86_TUNE_EXT_80387_CONSTANTS: Use fancy 80387 constants, such as PI. */
 DEF_TUNE (X86_TUNE_EXT_80387_CONSTANTS, "ext_80387_constants",
          m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_K6_GEODE
          | m_ATHLON_K8 | m_GENERIC)
+
+/* X86_TUNE_AVOID_VECTOR_DECODE: Enable splitters that avoid vector decoded
+   forms of instructions on K8 targets. */
 DEF_TUNE (X86_TUNE_AVOID_VECTOR_DECODE, "avoid_vector_decode",
-          m_CORE_ALL | m_K8 | m_GENERIC)
+          m_K8)
+
 /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have same latency for HImode
    and SImode multiply, but 386 and 486 do HImode multiply faster. */
 DEF_TUNE (X86_TUNE_PROMOTE_HIMODE_IMUL, "promote_himode_imul",
          ~(m_386 | m_486))
+
 /* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is
-   vector path on AMD machines. */
+   vector path on AMD machines.
+   FIXME: Do we need to enable this for core? */
 DEF_TUNE (X86_TUNE_SLOW_IMUL_IMM32_MEM, "slow_imul_imm32_mem",
-          m_CORE_ALL | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC)
+          m_K8 | m_AMDFAM10)
+
 /* X86_TUNE_SLOW_IMUL_IMM8: Imul of 8-bit constant is vector path on AMD
-   machines. */
+   machines.
+   FIXME: Do we need to enable this for core? */
 DEF_TUNE (X86_TUNE_SLOW_IMUL_IMM8, "slow_imul_imm8",
-          m_CORE_ALL | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC)
+          m_K8 | m_AMDFAM10)
+
 /* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR
    than a MOV. */
 DEF_TUNE (X86_TUNE_MOVE_M1_VIA_OR, "move_m1_via_or", m_PENT)
+
 /* X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR is,
    but one byte longer. */
 DEF_TUNE (X86_TUNE_NOT_UNPAIRABLE, "not_unpairable", m_PENT)
+
 /* X86_TUNE_NOT_VECTORMODE: On AMD K6, NOT is vector decoded with memory
    operand that cannot be represented using a modRM byte.  The XOR
    replacement is long decoded, so this split helps here as well. */
 DEF_TUNE (X86_TUNE_NOT_VECTORMODE, "not_vectormode", m_K6)
+
 /* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion
-   from FP to FP. */
+   from FP to FP.  This form of instructions avoids partial write to the
+   destination. */
 DEF_TUNE (X86_TUNE_USE_VECTOR_FP_CONVERTS, "use_vector_fp_converts",
-          m_CORE_ALL | m_AMDFAM10 | m_GENERIC)
+          m_AMDFAM10)
+
 /* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion
    from integer to FP. */
 DEF_TUNE (X86_TUNE_USE_VECTOR_CONVERTS, "use_vector_converts", m_AMDFAM10)
+
 /* X86_TUNE_FUSE_CMP_AND_BRANCH: Fuse a compare or test instruction
    with a subsequent conditional jump instruction into a single
-   compare-and-branch uop. */
-DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH, "fuse_cmp_and_branch", m_BDVER)
+   compare-and-branch uop.
+   FIXME: revisit for generic. */
+DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH, "fuse_cmp_and_branch",
+          m_BDVER | m_CORE_ALL)
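X86_TUNE_FUSE_CMP_AND_BRANCH concerns macro-fusion of a compare with the conditional jump that follows it. A hypothetical example of the shape that benefits (not from the patch; the exact instruction selection depends on the compiler):

/* The cmp/jcc pair controlling the loop can issue as a single
   macro-fused compare-and-branch uop on cores where this flag is set.  */
int
sum_below_limit (const int *a, int n, int limit)
{
  int i, s = 0;
  for (i = 0; i < n && a[i] < limit; i++)
    s += a[i];
  return s;
}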
+
 /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit.
    This flag will impact LEA instruction selection. */
 DEF_TUNE (X86_TUNE_OPT_AGU, "opt_agu", m_ATOM | m_SLM)
+
 /* X86_TUNE_VECTORIZE_DOUBLE: Enable double precision vector
    instructions. */
 DEF_TUNE (X86_TUNE_VECTORIZE_DOUBLE, "vectorize_double", ~m_ATOM)
+
 /* X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL: Enable software prefetching
    at -O3.  For the moment, the prefetching seems badly tuned for Intel
    chips. */
 DEF_TUNE (X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL, "software_prefetching_beneficial",
          m_K6_GEODE | m_AMD_MULTIPLE)
+
 /* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for
    the auto-vectorizer. */
 DEF_TUNE (X86_TUNE_AVX128_OPTIMAL, "avx128_optimal", m_BDVER | m_BTVER2)
+
 /* X86_TUNE_REASSOC_INT_TO_PARALLEL: Try to produce parallel computations
    during reassociation of integer computation. */
 DEF_TUNE (X86_TUNE_REASSOC_INT_TO_PARALLEL, "reassoc_int_to_parallel",
          m_ATOM)
+
 /* X86_TUNE_REASSOC_FP_TO_PARALLEL: Try to produce parallel computations
    during reassociation of fp computation. */
 DEF_TUNE (X86_TUNE_REASSOC_FP_TO_PARALLEL, "reassoc_fp_to_parallel",
-          m_ATOM | m_SLM | m_HASWELL | m_BDVER1 | m_BDVER2)
+          m_ATOM | m_SLM | m_HASWELL | m_BDVER1 | m_BDVER2 | m_GENERIC)
+
 /* X86_TUNE_GENERAL_REGS_SSE_SPILL: Try to spill general regs to SSE
    regs instead of memory. */
 DEF_TUNE (X86_TUNE_GENERAL_REGS_SSE_SPILL, "general_regs_sse_spill",
          m_CORE_ALL)
+
 /* X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE: Try to avoid memory operands for
    a conditional move. */
 DEF_TUNE (X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE, "avoid_mem_opnd_for_cmove",
          m_ATOM | m_SLM)
+
 /* X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS: Try to split memory operand for
    fp converts to destination register. */
 DEF_TUNE (X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS, "split_mem_opnd_for_fp_converts",
          m_SLM)
+
+/* X86_TUNE_ACCUMULATE_OUTGOING_ARGS: Allocate stack space for outgoing
+   arguments in the prologue/epilogue instead of separately for each call
+   by push/pop instructions.
+   This increases code size by about 5% in 32bit mode, less so in 64bit mode
+   because parameters are passed in registers.  It is a considerable
+   win for targets without a stack engine, where the lack of one prevents
+   multiple push operations from happening in parallel.
+
+   FIXME: this flag is incorrectly enabled for amdfam10, Bulldozer,
+   Bobcat and Generic.  This is because disabling it causes a large
+   regression on mgrid due to an IRA limitation leading to unnecessary
+   use of the frame pointer in 32bit mode. */
+DEF_TUNE (X86_TUNE_ACCUMULATE_OUTGOING_ARGS, "accumulate_outgoing_args",
+          m_PPRO | m_P4_NOCONA | m_ATOM | m_SLM | m_AMD_MULTIPLE | m_GENERIC)
+
+/* X86_TUNE_ALWAYS_FANCY_MATH_387: Controls use of fancy 387 operations,
+   such as fsqrt, fprem, fsin, fcos, fsincos etc.
+   Should be enabled for all targets that always have a coprocessor. */
+DEF_TUNE (X86_TUNE_ALWAYS_FANCY_MATH_387, "always_fancy_math_387",
+          ~(m_386 | m_486))
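X86_TUNE_REASSOC_FP_TO_PARALLEL above changes how linear dependence chains are rebalanced during reassociation. A sketch of the shape it affects (illustrative only; assumes optimization with -ffast-math so FP reassociation is permitted):

/* The chain ((a + b) + c) + d can be rebalanced into
   (a + b) + (c + d), exposing two independent additions to the
   FP pipelines instead of one serial chain.  */
double
sum4 (double a, double b, double c, double d)
{
  return a + b + c + d;
}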