1 files changed, 960 insertions, 57 deletions
diff --git a/gas/config/tc-i386.c b/gas/config/tc-i386.c
index a5e014af5e..d12ca22ef7 100644
--- a/gas/config/tc-i386.c
+++ b/gas/config/tc-i386.c
@@ -86,6 +86,7 @@
 #define QWORD_MNEM_SUFFIX  'q'
 #define XMMWORD_MNEM_SUFFIX  'x'
 #define YMMWORD_MNEM_SUFFIX 'y'
+#define ZMMWORD_MNEM_SUFFIX 'z'
 /* Intel Syntax.  Use a non-ascii letter since since it never appears
    in instructions.  */
 #define LONG_DOUBLE_MNEM_SUFFIX '\1'
@@ -189,11 +190,53 @@ static void handle_large_common (int small ATTRIBUTE_UNUSED);
 
 static const char *default_arch = DEFAULT_ARCH;
 
+/* This struct describes rounding control and SAE in the instruction.  */
+struct RC_Operation
+{
+  enum rc_type
+    {
+      rne = 0,
+      rd,
+      ru,
+      rz,
+      saeonly
+    } type;
+  int operand;
+};
+
+static struct RC_Operation rc_op;
+
+/* The struct describes masking, applied to OPERAND in the instruction.
+   MASK is a pointer to the corresponding mask register.  ZEROING tells
+   whether merging or zeroing mask is used.  */
+struct Mask_Operation
+{
+  const reg_entry *mask;
+  unsigned int zeroing;
+  /* The operand where this operation is associated.  */
+  int operand;
+};
+
+static struct Mask_Operation mask_op;
+
+/* The struct describes broadcasting, applied to OPERAND.  TYPE is
+   the broadcast type.  */
+struct Broadcast_Operation
+{
+  /* Type of broadcast: no broadcast, {1to8}, or {1to16}.  */
+  int type;
+
+  /* Index of broadcasted operand.  */
+  int operand;
+};
+
+static struct Broadcast_Operation broadcast_op;
+
 /* VEX prefix.  */
 typedef struct
 {
-  /* VEX prefix is either 2 byte or 3 byte.  */
-  unsigned char bytes[3];
+  /* VEX prefix is either 2 byte or 3 byte.  EVEX is 4 byte.  */
+  unsigned char bytes[4];
   unsigned int length;
   /* Destination or source register specifier.  */
   const reg_entry *register_specifier;
@@ -223,7 +266,17 @@ enum i386_error
     unsupported,
     invalid_vsib_address,
     invalid_vector_register_set,
-    unsupported_vector_index_register
+    unsupported_vector_index_register,
+    unsupported_broadcast,
+    broadcast_not_on_src_operand,
+    broadcast_needed,
+    unsupported_masking,
+    mask_not_on_destination,
+    no_default_mask,
+    unsupported_rc_sae,
+    rc_sae_operand_not_last_imm,
+    invalid_register_operand,
+    try_vector_disp8
   };
 
 struct _i386_insn
@@ -277,9 +330,22 @@ struct _i386_insn
        addressing modes of this insn are encoded.  */
     modrm_byte rm;
     rex_byte rex;
+    rex_byte vrex;
     sib_byte sib;
     vex_prefix vex;
 
+    /* Masking attributes.  */
+    struct Mask_Operation *mask;
+
+    /* Rounding control and SAE attributes.  */
+    struct RC_Operation *rounding;
+
+    /* Broadcasting attributes.  */
+    struct Broadcast_Operation *broadcast;
+
+    /* Compressed disp8*N attribute.  */
+    unsigned int memshift;
+
     /* Swap operand in encoding.  */
     unsigned int swap_operand;
 
@@ -300,15 +366,36 @@ struct _i386_insn
     /* Have BND prefix.  */
     const char *bnd_prefix;
 
+    /* Need VREX to support upper 16 registers.  */
+    int need_vrex;
+
     /* Error message.  */
     enum i386_error error;
   };
 
 typedef struct _i386_insn i386_insn;
 
+/* Link RC type with corresponding string, that'll be looked for in
+   asm.  */
+struct RC_name
+{
+  enum rc_type type;
+  const char *name;
+  unsigned int len;
+};
+
+static const struct RC_name RC_NamesTable[] =
+{
+  {  rne, STRING_COMMA_LEN ("rn-sae") },
+  {  rd,  STRING_COMMA_LEN ("rd-sae") },
+  {  ru,  STRING_COMMA_LEN ("ru-sae") },
+  {  rz,  STRING_COMMA_LEN ("rz-sae") },
+  {  saeonly,  STRING_COMMA_LEN ("sae") },
+};
+
 /* List of chars besides those in app.c:symbol_chars that can start an
    operand.  Used to prevent the scrubber eating vital white-space.  */
-const char extra_symbol_chars[] = "*%-(["
+const char extra_symbol_chars[] = "*%-([{"
 #ifdef LEX_AT
 	"@"
 #endif
@@ -512,6 +599,21 @@ static enum
     vex256
   } avxscalar;
 
+/* Encode scalar EVEX LIG instructions with specific vector length.  */
+static enum
+  {
+    evexl128 = 0,
+    evexl256,
+    evexl512
+  } evexlig;
+
+/* Encode EVEX WIG instructions with specific evex.w.  */
+static enum
+  {
+    evexw0 = 0,
+    evexw1
+  } evexwig;
+
 /* Pre-defined "_GLOBAL_OFFSET_TABLE_".  */
 static symbolS *GOT_symbol;
 
@@ -706,6 +808,14 @@ static const arch_entry cpu_arch[] =
     CPU_AVX_FLAGS, 0, 0 },
   { STRING_COMMA_LEN (".avx2"), PROCESSOR_UNKNOWN,
     CPU_AVX2_FLAGS, 0, 0 },
+  { STRING_COMMA_LEN (".avx512f"), PROCESSOR_UNKNOWN,
+    CPU_AVX512F_FLAGS, 0, 0 },
+  { STRING_COMMA_LEN (".avx512cd"), PROCESSOR_UNKNOWN,
+    CPU_AVX512CD_FLAGS, 0, 0 },
+  { STRING_COMMA_LEN (".avx512er"), PROCESSOR_UNKNOWN,
+    CPU_AVX512ER_FLAGS, 0, 0 },
+  { STRING_COMMA_LEN (".avx512pf"), PROCESSOR_UNKNOWN,
+    CPU_AVX512PF_FLAGS, 0, 0 },
   { STRING_COMMA_LEN (".noavx"), PROCESSOR_UNKNOWN,
     CPU_ANY_AVX_FLAGS, 0, 1 },
   { STRING_COMMA_LEN (".vmx"), PROCESSOR_UNKNOWN,
@@ -1543,6 +1653,8 @@ static const i386_operand_type anydisp
   = OPERAND_TYPE_ANYDISP;
 static const i386_operand_type regxmm = OPERAND_TYPE_REGXMM;
 static const i386_operand_type regymm = OPERAND_TYPE_REGYMM;
+static const i386_operand_type regzmm = OPERAND_TYPE_REGZMM;
+static const i386_operand_type regmask = OPERAND_TYPE_REGMASK;
 static const i386_operand_type imm8 = OPERAND_TYPE_IMM8;
 static const i386_operand_type imm8s = OPERAND_TYPE_IMM8S;
 static const i386_operand_type imm16 = OPERAND_TYPE_IMM16;
@@ -1554,6 +1666,7 @@ static const i386_operand_type imm16_32s = OPERAND_TYPE_IMM16_32S;
 static const i386_operand_type imm16_32_32s = OPERAND_TYPE_IMM16_32_32S;
 static const i386_operand_type vec_imm4 = OPERAND_TYPE_VEC_IMM4;
 static const i386_operand_type regbnd = OPERAND_TYPE_REGBND;
+static const i386_operand_type vec_disp8 = OPERAND_TYPE_VEC_DISP8;
 
 enum operand_type
 {
@@ -1636,7 +1749,9 @@ match_mem_size (const insn_template *t, unsigned int j)
 	       || (i.types[j].bitfield.xmmword
 		   && !t->operand_types[j].bitfield.xmmword)
 	       || (i.types[j].bitfield.ymmword
-		   && !t->operand_types[j].bitfield.ymmword)));
+		   && !t->operand_types[j].bitfield.ymmword)
+	       || (i.types[j].bitfield.zmmword
+		   && !t->operand_types[j].bitfield.zmmword)));
 }
 
 /* Return 1 if there is no size conflict on any operands for
@@ -1717,6 +1832,7 @@ operand_type_match (i386_operand_type overlap,
   temp.bitfield.tbyte = 0;
   temp.bitfield.xmmword = 0;
   temp.bitfield.ymmword = 0;
+  temp.bitfield.zmmword = 0;
   if (operand_type_all_zero (&temp))
     goto mismatch;
 
@@ -1794,7 +1910,7 @@ register_number (const reg_entry *r)
 static INLINE unsigned int
 mode_from_disp_size (i386_operand_type t)
 {
-  if (t.bitfield.disp8)
+  if (t.bitfield.disp8 || t.bitfield.vec_disp8)
     return 1;
   else if (t.bitfield.disp16
 	   || t.bitfield.disp32
@@ -1850,6 +1966,25 @@ fits_in_unsigned_long (offsetT num ATTRIBUTE_UNUSED)
 }				/* fits_in_unsigned_long() */
 
 static INLINE int
+fits_in_vec_disp8 (offsetT num)
+{
+  int shift = i.memshift;
+  unsigned int mask;
+
+  if (shift == -1)
+    abort ();
+
+  mask = (1 << shift) - 1;
+
+  /* Return 0 if NUM isn't a multiple of 1 << SHIFT.  */
+  if ((num & mask))
+    return 0;
+
+  /* Check if NUM will fit in 8 bits after shift.  */
+  return fits_in_signed_byte (num >> shift);
+}
+
+static INLINE int
 fits_in_imm4 (offsetT num)
 {
   return (num & 0xf) == num;
@@ -2440,6 +2575,8 @@ md_begin (void)
 	    register_chars[c] = mnemonic_chars[c];
 	    operand_chars[c] = c;
 	  }
+	else if (c == '{' || c == '}')
+	  operand_chars[c] = c;
 
 	if (ISALPHA (c) || ISDIGIT (c))
 	  identifier_chars[c] = c;
@@ -2532,6 +2669,7 @@ pi (char *line, i386_insn *x)
 	  || x->types[j].bitfield.regmmx
 	  || x->types[j].bitfield.regxmm
 	  || x->types[j].bitfield.regymm
+	  || x->types[j].bitfield.regzmm
 	  || x->types[j].bitfield.sreg2
 	  || x->types[j].bitfield.sreg3
 	  || x->types[j].bitfield.control
@@ -2619,6 +2757,7 @@ const type_names[] =
   { OPERAND_TYPE_DISP32, "d32" },
   { OPERAND_TYPE_DISP32S, "d32s" },
   { OPERAND_TYPE_DISP64, "d64" },
+  { OPERAND_TYPE_VEC_DISP8, "Vector d8" },
   { OPERAND_TYPE_INOUTPORTREG, "InOutPortReg" },
   { OPERAND_TYPE_SHIFTCOUNT, "ShiftCount" },
   { OPERAND_TYPE_CONTROL, "control reg" },
@@ -2633,6 +2772,8 @@ const type_names[] =
   { OPERAND_TYPE_REGMMX, "rMMX" },
   { OPERAND_TYPE_REGXMM, "rXMM" },
   { OPERAND_TYPE_REGYMM, "rYMM" },
+  { OPERAND_TYPE_REGZMM, "rZMM" },
+  { OPERAND_TYPE_REGMASK, "Mask reg" },
   { OPERAND_TYPE_ESSEG, "es" },
 };
 
@@ -2882,7 +3023,11 @@ build_vex_prefix (const insn_template *t)
 
   /* Check register specifier.  */
   if (i.vex.register_specifier)
-    register_specifier = ~register_number (i.vex.register_specifier) & 0xf;
+    {
+      register_specifier =
+	~register_number (i.vex.register_specifier) & 0xf;
+      gas_assert ((i.vex.register_specifier->reg_flags & RegVRex) == 0);
+    }
   else
     register_specifier = 0xf;
 
@@ -3016,6 +3161,175 @@ build_vex_prefix (const insn_template *t)
     }
 }
 
+/* Build the 4-byte EVEX prefix in i.vex from i.tm, i.rex and i.vrex.  */
+
+static void
+build_evex_prefix (void)
+{
+  unsigned int register_specifier;
+  unsigned int implied_prefix;
+  unsigned int m, w;
+  rex_byte vrex_used = 0;
+
+  /* Check register specifier.  */
+  if (i.vex.register_specifier)
+    {
+      gas_assert ((i.vrex & REX_X) == 0);
+
+      register_specifier = i.vex.register_specifier->reg_num;
+      if ((i.vex.register_specifier->reg_flags & RegRex))
+	register_specifier += 8;
+      /* The upper 16 registers are encoded in the fourth byte of the
+	 EVEX prefix.  */
+      if (!(i.vex.register_specifier->reg_flags & RegVRex))
+	i.vex.bytes[3] = 0x8;
+      register_specifier = ~register_specifier & 0xf;
+    }
+  else
+    {
+      register_specifier = 0xf;
+
+      /* Encode upper 16 vector index register in the fourth byte of
+	 the EVEX prefix.  */
+      if (!(i.vrex & REX_X))
+	i.vex.bytes[3] = 0x8;
+      else
+	vrex_used |= REX_X;
+    }
+
+  switch ((i.tm.base_opcode >> 8) & 0xff)
+    {
+    case 0:
+      implied_prefix = 0;
+      break;
+    case DATA_PREFIX_OPCODE:
+      implied_prefix = 1;
+      break;
+    case REPE_PREFIX_OPCODE:
+      implied_prefix = 2;
+      break;
+    case REPNE_PREFIX_OPCODE:
+      implied_prefix = 3;
+      break;
+    default:
+      abort ();
+    }
+
+  /* 4 byte EVEX prefix.  */
+  i.vex.length = 4;
+  i.vex.bytes[0] = 0x62;
+
+  /* mmmm bits.  */
+  switch (i.tm.opcode_modifier.vexopcode)
+    {
+    case VEX0F:
+      m = 1;
+      break;
+    case VEX0F38:
+      m = 2;
+      break;
+    case VEX0F3A:
+      m = 3;
+      break;
+    default:
+      abort ();
+      break;
+    }
+
+  /* The high 3 bits of the second EVEX byte are 1's complement of RXB
+     bits from REX.  */
+  i.vex.bytes[1] = (~i.rex & 0x7) << 5 | m;
+
+  /* The fifth bit of the second EVEX byte is 1's complement of the
+     REX_R bit in VREX.  */
+  if (!(i.vrex & REX_R))
+    i.vex.bytes[1] |= 0x10;
+  else
+    vrex_used |= REX_R;
+
+  if ((i.reg_operands + i.imm_operands) == i.operands)
+    {
+      /* When all operands are registers, the REX_X bit in REX is not
+	 used.  We reuse it to encode the upper 16 registers, which is
+	 indicated by the REX_B bit in VREX.  The REX_X bit is encoded
+	 as 1's complement.  */
+      if ((i.vrex & REX_B))
+	{
+	  vrex_used |= REX_B;
+	  i.vex.bytes[1] &= ~0x40;
+	}
+    }
+
+  /* Every VREX bit must have been encoded in the EVEX prefix above.  */
+  i.vrex &= ~vrex_used;
+  gas_assert (i.vrex == 0);
+
+  /* Check the REX.W bit.  */
+  w = (i.rex & REX_W) ? 1 : 0;
+  if (i.tm.opcode_modifier.vexw)
+    {
+      if (i.tm.opcode_modifier.vexw == VEXW1)
+	w = 1;
+    }
+  /* If w is not set it means we are dealing with WIG instruction.  */
+  else if (!w)
+    {
+      if (evexwig == evexw1)
+        w = 1;
+    }
+
+  /* Encode the U bit.  */
+  implied_prefix |= 0x4;
+
+  /* The third byte of the EVEX prefix.  */
+  i.vex.bytes[2] = (w << 7 | register_specifier << 3 | implied_prefix);
+
+  /* The fourth byte of the EVEX prefix.  */
+  /* The zeroing-masking bit.  */
+  if (i.mask && i.mask->zeroing)
+    i.vex.bytes[3] |= 0x80;
+
+  /* Without RC, set the broadcast bit only if broadcast is requested.  */
+  if (!i.rounding)
+    {
+      /* Encode the vector length.  */
+      unsigned int vec_length;
+
+      switch (i.tm.opcode_modifier.evex)
+	{
+	case EVEXLIG: /* LL' is ignored */
+	  vec_length = evexlig << 5;
+	  break;
+	case EVEX128:
+	  vec_length = 0 << 5;
+	  break;
+	case EVEX256:
+	  vec_length = 1 << 5;
+	  break;
+	case EVEX512:
+	  vec_length = 2 << 5;
+	  break;
+	default:
+	  abort ();
+	  break;
+	}
+      i.vex.bytes[3] |= vec_length;
+      /* Encode the broadcast bit.  */
+      if (i.broadcast)
+	i.vex.bytes[3] |= 0x10;
+    }
+  else
+    {
+      if (i.rounding->type != saeonly)
+	i.vex.bytes[3] |= 0x10 | (i.rounding->type << 5);
+      else
+	i.vex.bytes[3] |= 0x10;
+    }
+
+  if (i.mask && i.mask->mask)
+    i.vex.bytes[3] |= i.mask->mask->reg_num;
+}
+
 static void
 process_immext (void)
 {
@@ -3047,9 +3361,10 @@ process_immext (void)
      AVX instructions also use this encoding, for some of
      3 argument instructions.  */
 
-  gas_assert (i.imm_operands == 0
+  gas_assert (i.imm_operands <= 1
 	      && (i.operands <= 2
-		  || (i.tm.opcode_modifier.vex
+		  || ((i.tm.opcode_modifier.vex
+		       || i.tm.opcode_modifier.evex)
 		      && i.operands <= 4)));
 
   exp = &im_expressions[i.imm_operands++];
@@ -3296,6 +3611,9 @@ md_assemble (char *line)
   if (i.tm.opcode_modifier.vex)
     build_vex_prefix (t);
 
+  if (i.tm.opcode_modifier.evex)
+    build_evex_prefix ();
+
   /* Handle conversion of 'int $3' --> special int3 insn.  XOP or FMA4
      instructions may define INT_OPCODE as well, so avoid this corner
      case for those instructions that use MODRM.  */
@@ -3735,6 +4053,28 @@ swap_2_operands (int xchg1, int xchg2)
   temp_reloc = i.reloc[xchg2];
   i.reloc[xchg2] = i.reloc[xchg1];
   i.reloc[xchg1] = temp_reloc;
+
+  if (i.mask)
+    {
+      if (i.mask->operand == xchg1)
+	i.mask->operand = xchg2;
+      else if (i.mask->operand == xchg2)
+	i.mask->operand = xchg1;
+    }
+  if (i.broadcast)
+    {
+      if (i.broadcast->operand == xchg1)
+	i.broadcast->operand = xchg2;
+      else if (i.broadcast->operand == xchg2)
+	i.broadcast->operand = xchg1;
+    }
+  if (i.rounding)
+    {
+      if (i.rounding->operand == xchg1)
+	i.rounding->operand = xchg2;
+      else if (i.rounding->operand == xchg2)
+	i.rounding->operand = xchg1;
+    }
 }
 
 static void
@@ -3990,11 +4330,14 @@ optimize_disp (void)
 static int
 check_VecOperands (const insn_template *t)
 {
+  unsigned int op;
+
   /* Without VSIB byte, we can't have a vector register for index.  */
   if (!t->opcode_modifier.vecsib
       && i.index_reg
       && (i.index_reg->reg_type.bitfield.regxmm
-	  || i.index_reg->reg_type.bitfield.regymm))
+	  || i.index_reg->reg_type.bitfield.regymm
+	  || i.index_reg->reg_type.bitfield.regzmm))
     {
       i.error = unsupported_vector_index_register;
       return 1;
@@ -4008,31 +4351,188 @@ check_VecOperands (const insn_template *t)
 	  || !((t->opcode_modifier.vecsib == VecSIB128
 		&& i.index_reg->reg_type.bitfield.regxmm)
 	       || (t->opcode_modifier.vecsib == VecSIB256
-		   && i.index_reg->reg_type.bitfield.regymm)))
+		   && i.index_reg->reg_type.bitfield.regymm)
+	       || (t->opcode_modifier.vecsib == VecSIB512
+		   && i.index_reg->reg_type.bitfield.regzmm)))
       {
 	i.error = invalid_vsib_address;
 	return 1;
       }
 
-      gas_assert (i.reg_operands == 2);
-      gas_assert (i.types[0].bitfield.regxmm
-		  || i.types[0].bitfield.regymm);
-      gas_assert (i.types[2].bitfield.regxmm
-		  || i.types[2].bitfield.regymm);
+      gas_assert (i.reg_operands == 2 || i.mask);
+      if (i.reg_operands == 2 && !i.mask)
+	{
+	  gas_assert (i.types[0].bitfield.regxmm
+		      || i.types[0].bitfield.regymm
+		      || i.types[0].bitfield.regzmm);
+	  gas_assert (i.types[2].bitfield.regxmm
+		      || i.types[2].bitfield.regymm
+		      || i.types[2].bitfield.regzmm);
+	  if (operand_check == check_none)
+	    return 0;
+	  if (register_number (i.op[0].regs)
+	      != register_number (i.index_reg)
+	      && register_number (i.op[2].regs)
+		 != register_number (i.index_reg)
+	      && register_number (i.op[0].regs)
+		 != register_number (i.op[2].regs))
+	    return 0;
+	  if (operand_check == check_error)
+	    {
+	      i.error = invalid_vector_register_set;
+	      return 1;
+	    }
+	  as_warn (_("mask, index, and destination registers should be distinct"));
+	}
+    }
+
+  /* Check if broadcast is supported by the instruction and is applied
+     to the memory operand.  */
+  if (i.broadcast)
+    {
+      int broadcasted_opnd_size;
+
+      /* Check if specified broadcast is supported in this instruction,
+	 and it's applied to memory operand of DWORD or QWORD type,
+	 depending on VecESize.  */
+      if (i.broadcast->type != t->opcode_modifier.broadcast
+	  || !i.types[i.broadcast->operand].bitfield.mem
+	  || (t->opcode_modifier.vecesize == 0
+	      && !i.types[i.broadcast->operand].bitfield.dword
+	      && !i.types[i.broadcast->operand].bitfield.unspecified)
+	  || (t->opcode_modifier.vecesize == 1
+	      && !i.types[i.broadcast->operand].bitfield.qword
+	      && !i.types[i.broadcast->operand].bitfield.unspecified))
+	goto bad_broadcast;
+
+      broadcasted_opnd_size = t->opcode_modifier.vecesize ? 64 : 32;
+      if (i.broadcast->type == BROADCAST_1TO16)
+	broadcasted_opnd_size <<= 4; /* Broadcast 1to16.  */
+      else if (i.broadcast->type == BROADCAST_1TO8)
+	broadcasted_opnd_size <<= 3; /* Broadcast 1to8.  */
+      else
+	goto bad_broadcast;
 
-      if (operand_check == check_none)
-	return 0;
-      if (register_number (i.op[0].regs) != register_number (i.index_reg)
-	  && register_number (i.op[2].regs) != register_number (i.index_reg)
-	  && register_number (i.op[0].regs) != register_number (i.op[2].regs))
-	return 0;
-      if (operand_check == check_error)
+      if ((broadcasted_opnd_size == 256
+	   && !t->operand_types[i.broadcast->operand].bitfield.ymmword)
+	  || (broadcasted_opnd_size == 512
+	      && !t->operand_types[i.broadcast->operand].bitfield.zmmword))
+	{
+	bad_broadcast:
+	  i.error = unsupported_broadcast;
+	  return 1;
+	}
+    }
+  /* If broadcast is supported in this instruction, we need to check if
+     operand of one-element size isn't specified without broadcast.  */
+  else if (t->opcode_modifier.broadcast && i.mem_operands)
+    {
+      /* Find memory operand.  */
+      for (op = 0; op < i.operands; op++)
+	if (operand_type_check (i.types[op], anymem))
+	  break;
+      gas_assert (op < i.operands);
+      /* Check size of the memory operand.  */
+      if ((t->opcode_modifier.vecesize == 0
+	   && i.types[op].bitfield.dword)
+	  || (t->opcode_modifier.vecesize == 1
+	      && i.types[op].bitfield.qword))
+	{
+	  i.error = broadcast_needed;
+	  return 1;
+	}
+    }
+
+  /* Check if requested masking is supported.  */
+  if (i.mask
+      && (!t->opcode_modifier.masking
+	  || (i.mask->zeroing
+	      && t->opcode_modifier.masking == MERGING_MASKING)))
+    {
+      i.error = unsupported_masking;
+      return 1;
+    }
+
+  /* Check if masking is applied to dest operand.  */
+  if (i.mask && (i.mask->operand != (int) (i.operands - 1)))
+    {
+      i.error = mask_not_on_destination;
+      return 1;
+    }
+
+  /* Check if default mask is allowed.  */
+  if (t->opcode_modifier.nodefmask
+      && (!i.mask || i.mask->mask->reg_num == 0))
+    {
+      i.error = no_default_mask;
+      return 1;
+    }
+
+  /* Check RC/SAE.  */
+  if (i.rounding)
+    {
+      if ((i.rounding->type != saeonly
+	   && !t->opcode_modifier.staticrounding)
+	  || (i.rounding->type == saeonly
+	      && (t->opcode_modifier.staticrounding
+		  || !t->opcode_modifier.sae)))
 	{
-	  i.error = invalid_vector_register_set;
+	  i.error = unsupported_rc_sae;
 	  return 1;
 	}
-      as_warn (_("mask, index, and destination registers should be distinct"));
+      /* If the instruction has several immediate operands and one of
+	 them is rounding, the rounding operand should be the last
+	 immediate operand.  */
+      if (i.imm_operands > 1
+	  && i.rounding->operand != (int) (i.imm_operands - 1))
+	{
+	  i.error = rc_sae_operand_not_last_imm;
+	  return 1;
+	}
+    }
+
+  /* Check vector Disp8 operand.  */
+  if (t->opcode_modifier.disp8memshift)
+    {
+      if (i.broadcast)
+	i.memshift = t->opcode_modifier.vecesize ? 3 : 2;
+      else
+	i.memshift = t->opcode_modifier.disp8memshift;
+
+      for (op = 0; op < i.operands; op++)
+	if (operand_type_check (i.types[op], disp)
+	    && i.op[op].disps->X_op == O_constant)
+	  {
+	    offsetT value = i.op[op].disps->X_add_number;
+	    int vec_disp8_ok = fits_in_vec_disp8 (value);
+	    if (t->operand_types [op].bitfield.vec_disp8)
+	      {
+		if (vec_disp8_ok)
+		  i.types[op].bitfield.vec_disp8 = 1;
+		else
+		  {
+		    /* Vector insn can only have Vec_Disp8/Disp32 in
+		       32/64bit modes, and Vec_Disp8/Disp16 in 16bit
+		       mode.  */
+		    i.types[op].bitfield.disp8 = 0;
+		    if (flag_code != CODE_16BIT)
+		      i.types[op].bitfield.disp16 = 0;
+		  }
+	      }
+	    else if (flag_code != CODE_16BIT)
+	      {
+		/* One form of this instruction supports vector Disp8.
+		   Try vector Disp8 if we need to use Disp32.  */
+		if (vec_disp8_ok && !fits_in_signed_byte (value))
+		  {
+		    i.error = try_vector_disp8;
+		    return 1;
+		  }
+	      }
+	  }
     }
+  else
+    i.memshift = -1;
 
   return 0;
 }
@@ -4043,6 +4543,13 @@ check_VecOperands (const insn_template *t)
 static int
 VEX_check_operands (const insn_template *t)
 {
+  /* VREX is only valid with EVEX prefix.  */
+  if (i.need_vrex && !t->opcode_modifier.evex)
+    {
+      i.error = invalid_register_operand;
+      return 1;
+    }
+
   if (!t->opcode_modifier.vex)
     return 0;
 
@@ -4159,10 +4666,12 @@ match_template (void)
 	      : intel_float_operand (t->name) != 2)
 	  && ((!operand_types[0].bitfield.regmmx
 	       && !operand_types[0].bitfield.regxmm
-	       && !operand_types[0].bitfield.regymm)
+	       && !operand_types[0].bitfield.regymm
+	       && !operand_types[0].bitfield.regzmm)
 	      || (!operand_types[t->operands > 1].bitfield.regmmx
 		  && !!operand_types[t->operands > 1].bitfield.regxmm
-		  && !!operand_types[t->operands > 1].bitfield.regymm))
+		  && !!operand_types[t->operands > 1].bitfield.regymm
+		  && !!operand_types[t->operands > 1].bitfield.regzmm))
 	  && (t->base_opcode != 0x0fc7
 	      || t->extension_opcode != 1 /* cmpxchg8b */))
 	continue;
@@ -4433,6 +4942,36 @@ check_reverse:
 	case unsupported_vector_index_register:
 	  err_msg = _("unsupported vector index register");
 	  break;
+	case unsupported_broadcast:
+	  err_msg = _("unsupported broadcast");
+	  break;
+	case broadcast_not_on_src_operand:
+	  err_msg = _("broadcast not on source memory operand");
+	  break;
+	case broadcast_needed:
+	  err_msg = _("broadcast is needed for operand of such type");
+	  break;
+	case unsupported_masking:
+	  err_msg = _("unsupported masking");
+	  break;
+	case mask_not_on_destination:
+	  err_msg = _("mask not on destination operand");
+	  break;
+	case no_default_mask:
+	  err_msg = _("default mask isn't allowed");
+	  break;
+	case unsupported_rc_sae:
+	  err_msg = _("unsupported static rounding/sae");
+	  break;
+	case rc_sae_operand_not_last_imm:
+	  if (intel_syntax)
+	    err_msg = _("RC/SAE operand must precede immediate operands");
+	  else
+	    err_msg = _("RC/SAE operand must follow immediate operands");
+	  break;
+	case invalid_register_operand:
+	  err_msg = _("invalid register operand");
+	  break;
 	}
       as_bad (_("%s for `%s'"), err_msg,
 	      current_templates->start->name);
@@ -4625,9 +5164,10 @@ process_suffix (void)
 	    return 0;
 	}
       else if (i.suffix == XMMWORD_MNEM_SUFFIX
-	       || i.suffix == YMMWORD_MNEM_SUFFIX)
+	       || i.suffix == YMMWORD_MNEM_SUFFIX
+	       || i.suffix == ZMMWORD_MNEM_SUFFIX)
 	{
-	  /* Skip if the instruction has x/y suffix.  match_template
+	  /* Skip if the instruction has x/y/z suffix.  match_template
 	     should check if it is a valid suffix.  */
 	}
       else if (intel_syntax && i.tm.opcode_modifier.ignoresize)
@@ -4715,7 +5255,8 @@ process_suffix (void)
   if (i.suffix
       && i.suffix != BYTE_MNEM_SUFFIX
       && i.suffix != XMMWORD_MNEM_SUFFIX
-      && i.suffix != YMMWORD_MNEM_SUFFIX)
+      && i.suffix != YMMWORD_MNEM_SUFFIX
+      && i.suffix != ZMMWORD_MNEM_SUFFIX)
     {
       /* It's not a byte, select word/dword operation.  */
       if (i.tm.opcode_modifier.w)
@@ -4835,6 +5376,7 @@ check_byte_reg (void)
 	  || i.types[op].bitfield.regmmx
 	  || i.types[op].bitfield.regxmm
 	  || i.types[op].bitfield.regymm
+	  || i.types[op].bitfield.regzmm
 	  || i.types[op].bitfield.sreg2
 	  || i.types[op].bitfield.sreg3
 	  || i.types[op].bitfield.control
@@ -5206,10 +5748,11 @@ duplicate:
     {
       unsigned int j;
 
-      /* The first operand is implicit and must be xmm0/ymm0.  */
+      /* The first operand is implicit and must be xmm0/ymm0/zmm0.  */
       gas_assert (i.reg_operands
 		  && (operand_type_equal (&i.types[0], &regxmm)
-		      || operand_type_equal (&i.types[0], &regymm)));
+		      || operand_type_equal (&i.types[0], &regymm)
+		      || operand_type_equal (&i.types[0], &regzmm)));
       if (register_number (i.op[0].regs) != 0)
 	return bad_implicit_operand (i.types[0].bitfield.regxmm);
 
@@ -5363,8 +5906,8 @@ build_modrm_byte (void)
       /* There are 2 kinds of instructions:
          1. 5 operands: 4 register operands or 3 register operands
          plus 1 memory operand plus one Vec_Imm4 operand, VexXDS, and
-         VexW0 or VexW1.  The destination must be either XMM or YMM
-         register.
+         VexW0 or VexW1.  The destination must be either XMM, YMM or
+	 ZMM register.
          2. 4 operands: 4 register operands or 3 register operands
          plus 1 memory operand, VexXDS, and VexImmExt  */
       gas_assert ((i.reg_operands == 4
@@ -5376,7 +5919,8 @@ build_modrm_byte (void)
                           && (i.tm.opcode_modifier.vexw == VEXW0
                               || i.tm.opcode_modifier.vexw == VEXW1)
                           && (operand_type_equal (&i.tm.operand_types[dest], &regxmm)
-                              || operand_type_equal (&i.tm.operand_types[dest], &regymm)))));
+                              || operand_type_equal (&i.tm.operand_types[dest], &regymm)
+                              || operand_type_equal (&i.tm.operand_types[dest], &regzmm)))));
 
       if (i.imm_operands == 0)
         {
@@ -5411,10 +5955,13 @@ build_modrm_byte (void)
           gas_assert (operand_type_equal (&i.tm.operand_types[reg_slot],
 					  &regxmm)
                       || operand_type_equal (&i.tm.operand_types[reg_slot],
-                                             &regymm));
+                                             &regymm)
+                      || operand_type_equal (&i.tm.operand_types[reg_slot],
+                                             &regzmm));
           exp->X_op = O_constant;
           exp->X_add_number = register_number (i.op[reg_slot].regs) << 4;
-        }
+	  gas_assert ((i.op[reg_slot].regs->reg_flags & RegVRex) == 0);
+	}
       else
         {
           unsigned int imm_slot;
@@ -5455,14 +6002,19 @@ build_modrm_byte (void)
           gas_assert (operand_type_equal (&i.tm.operand_types[reg_slot],
 					  &regxmm)
 		      || operand_type_equal (&i.tm.operand_types[reg_slot],
-					     &regymm));
+					     &regymm)
+		      || operand_type_equal (&i.tm.operand_types[reg_slot],
+					     &regzmm));
           i.op[imm_slot].imms->X_add_number
               |= register_number (i.op[reg_slot].regs) << 4;
+	  gas_assert ((i.op[reg_slot].regs->reg_flags & RegVRex) == 0);
         }
 
       gas_assert (operand_type_equal (&i.tm.operand_types[nds], &regxmm)
                   || operand_type_equal (&i.tm.operand_types[nds],
-                                         &regymm));
+                                         &regymm)
+                  || operand_type_equal (&i.tm.operand_types[nds],
+                                         &regzmm));
       i.vex.register_specifier = i.op[nds].regs;
     }
   else
@@ -5515,7 +6067,8 @@ build_modrm_byte (void)
 		      || (i.tm.opcode_modifier.vexvvvv == VEXXDS
 			  && i.imm_operands == 1
 			  && (i.types[0].bitfield.imm8
-			      || i.types[i.operands - 1].bitfield.imm8)));
+			      || i.types[i.operands - 1].bitfield.imm8
+			      || i.rounding)));
 	  if (i.imm_operands == 2)
 	    source = 2;
 	  else
@@ -5527,6 +6080,23 @@ build_modrm_byte (void)
 	    }
 	  break;
 	case 5:
+	  if (i.tm.opcode_modifier.evex)
+	    {
+	      /* For EVEX instructions, when there are 5 operands, the
+		 first one must be immediate operand.  If the second one
+		 is immediate operand, the source operand is the 3rd
+		 one.  If the last one is immediate operand, the source
+		 operand is the 2nd one.  */
+	      gas_assert (i.imm_operands == 2
+			  && i.tm.opcode_modifier.sae
+			  && operand_type_check (i.types[0], imm));
+	      if (operand_type_check (i.types[1], imm))
+		source = 2;
+	      else if (operand_type_check (i.types[4], imm))
+		source = 1;
+	      else
+		abort ();
+	    }
 	  break;
 	default:
 	  abort ();
@@ -5536,13 +6106,18 @@ build_modrm_byte (void)
 	{
 	  dest = source + 1;
 
+	  /* RC/SAE operand could be between DEST and SRC.  That happens
+	     when one operand is GPR and the other one is XMM/YMM/ZMM
+	     register.  */
+	  if (i.rounding && i.rounding->operand == (int) dest)
+	    dest++;
+
 	  if (i.tm.opcode_modifier.vexvvvv == VEXXDS)
 	    {
-	      /* For instructions with VexNDS, the register-only
-		 source operand must be 32/64bit integer, XMM or
-		 YMM register.  It is encoded in VEX prefix.  We
-		 need to clear RegMem bit before calling
-		 operand_type_equal.  */
+	      /* For instructions with VexNDS, the register-only source
+		 operand must be 32/64bit integer, XMM, YMM or ZMM
+		 register.  It is encoded in VEX prefix.  We need to
+		 clear RegMem bit before calling operand_type_equal.  */
 
 	      i386_operand_type op;
 	      unsigned int vvvv;
@@ -5564,7 +6139,9 @@ build_modrm_byte (void)
 		  || (op.bitfield.reg32 != 1
 		      && !op.bitfield.reg64 != 1
 		      && !operand_type_equal (&op, &regxmm)
-		      && !operand_type_equal (&op, &regymm)))
+		      && !operand_type_equal (&op, &regymm)
+		      && !operand_type_equal (&op, &regzmm)
+		      && !operand_type_equal (&op, &regmask)))
 		abort ();
 	      i.vex.register_specifier = i.op[vvvv].regs;
 	      dest++;
@@ -5585,8 +6162,12 @@ build_modrm_byte (void)
 	  i.rm.regmem = i.op[source].regs->reg_num;
 	  if ((i.op[dest].regs->reg_flags & RegRex) != 0)
 	    i.rex |= REX_R;
+	  if ((i.op[dest].regs->reg_flags & RegVRex) != 0)
+	    i.vrex |= REX_R;
 	  if ((i.op[source].regs->reg_flags & RegRex) != 0)
 	    i.rex |= REX_B;
+	  if ((i.op[source].regs->reg_flags & RegVRex) != 0)
+	    i.vrex |= REX_B;
 	}
       else
 	{
@@ -5594,8 +6175,12 @@ build_modrm_byte (void)
 	  i.rm.regmem = i.op[dest].regs->reg_num;
 	  if ((i.op[dest].regs->reg_flags & RegRex) != 0)
 	    i.rex |= REX_B;
+	  if ((i.op[dest].regs->reg_flags & RegVRex) != 0)
+	    i.vrex |= REX_B;
 	  if ((i.op[source].regs->reg_flags & RegRex) != 0)
 	    i.rex |= REX_R;
+	  if ((i.op[source].regs->reg_flags & RegVRex) != 0)
+	    i.vrex |= REX_R;
 	}
       if (flag_code != CODE_64BIT && (i.rex & (REX_R | REX_B)))
 	{
@@ -5631,6 +6216,8 @@ build_modrm_byte (void)
 		{
 		  i.sib.base = NO_BASE_REGISTER;
 		  i.sib.scale = i.log2_scale_factor;
+		  /* No Vec_Disp8 if there is no base.  */
+		  i.types[op].bitfield.vec_disp8 = 0;
 		  i.types[op].bitfield.disp8 = 0;
 		  i.types[op].bitfield.disp16 = 0;
 		  i.types[op].bitfield.disp64 = 0;
@@ -5649,6 +6236,8 @@ build_modrm_byte (void)
 	      i.sib.index = i.index_reg->reg_num;
 	      if ((i.index_reg->reg_flags & RegRex) != 0)
 		i.rex |= REX_X;
+	      if ((i.index_reg->reg_flags & RegVRex) != 0)
+		i.vrex |= REX_X;
 	    }
 
 	  default_seg = &ds;
@@ -5703,6 +6292,8 @@ build_modrm_byte (void)
 		  i.sib.base = NO_BASE_REGISTER;
 		  i.sib.scale = i.log2_scale_factor;
 		  i.rm.regmem = ESCAPE_TO_TWO_BYTE_ADDRESSING;
+		  /* No Vec_Disp8 if there is no base.  */
+		  i.types[op].bitfield.vec_disp8 = 0;
 		  i.types[op].bitfield.disp8 = 0;
 		  i.types[op].bitfield.disp16 = 0;
 		  i.types[op].bitfield.disp64 = 0;
@@ -5732,6 +6323,7 @@ build_modrm_byte (void)
 	      i.types[op].bitfield.disp32 = 0;
 	      i.types[op].bitfield.disp32s = 1;
 	      i.types[op].bitfield.disp64 = 0;
+	      i.types[op].bitfield.vec_disp8 = 0;
 	      i.flags[op] |= Operand_PCrel;
 	      if (! i.disp_operands)
 		fake_zero_displacement = 1;
@@ -5755,7 +6347,10 @@ build_modrm_byte (void)
 		      if (operand_type_check (i.types[op], disp) == 0)
 			{
 			  /* fake (%bp) into 0(%bp)  */
-			  i.types[op].bitfield.disp8 = 1;
+			  if (i.tm.operand_types[op].bitfield.vec_disp8)
+			    i.types[op].bitfield.vec_disp8 = 1;
+			  else
+			    i.types[op].bitfield.disp8 = 1;
 			  fake_zero_displacement = 1;
 			}
 		    }
@@ -5775,6 +6370,8 @@ build_modrm_byte (void)
 		  i386_operand_type temp;
 		  operand_type_set (&temp, 0);
 		  temp.bitfield.disp8 = i.types[op].bitfield.disp8;
+		  temp.bitfield.vec_disp8
+		    = i.types[op].bitfield.vec_disp8;
 		  i.types[op] = temp;
 		  if (i.prefix[ADDR_PREFIX] == 0)
 		    i.types[op].bitfield.disp32s = 1;
@@ -5796,7 +6393,10 @@ build_modrm_byte (void)
 	      if (i.base_reg->reg_num == 5 && i.disp_operands == 0)
 		{
 		  fake_zero_displacement = 1;
-		  i.types[op].bitfield.disp8 = 1;
+		  if (i.tm.operand_types [op].bitfield.vec_disp8)
+		    i.types[op].bitfield.vec_disp8 = 1;
+		  else
+		    i.types[op].bitfield.disp8 = 1;
 		}
 	      i.sib.scale = i.log2_scale_factor;
 	      if (i.index_reg == 0)
@@ -5924,6 +6524,8 @@ build_modrm_byte (void)
 		|| i.types[op].bitfield.regxmm
 		|| i.types[op].bitfield.regymm
 		|| i.types[op].bitfield.regbnd
+		|| i.types[op].bitfield.regzmm
+		|| i.types[op].bitfield.regmask
 		|| i.types[op].bitfield.sreg2
 		|| i.types[op].bitfield.sreg3
 		|| i.types[op].bitfield.control
@@ -5990,7 +6592,9 @@ build_modrm_byte (void)
 	      if (type->bitfield.reg32 != 1
 		  && type->bitfield.reg64 != 1
 		  && !operand_type_equal (type, &regxmm)
-		  && !operand_type_equal (type, &regymm))
+		  && !operand_type_equal (type, &regymm)
+		  && !operand_type_equal (type, &regzmm)
+		  && !operand_type_equal (type, &regmask))
 		abort ();
 
 	      i.vex.register_specifier = i.op[vex_reg].regs;
@@ -6006,12 +6610,16 @@ build_modrm_byte (void)
 		  i.rm.regmem = i.op[op].regs->reg_num;
 		  if ((i.op[op].regs->reg_flags & RegRex) != 0)
 		    i.rex |= REX_B;
+		  if ((i.op[op].regs->reg_flags & RegVRex) != 0)
+		    i.vrex |= REX_B;
 		}
 	      else
 		{
 		  i.rm.reg = i.op[op].regs->reg_num;
 		  if ((i.op[op].regs->reg_flags & RegRex) != 0)
 		    i.rex |= REX_R;
+		  if ((i.op[op].regs->reg_flags & RegVRex) != 0)
+		    i.vrex |= REX_R;
 		}
 	    }
 
@@ -6290,9 +6898,9 @@ output_insn (void)
       unsigned int j;
       unsigned int prefix;
 
-      /* Since the VEX prefix contains the implicit prefix, we don't
-	 need the explicit prefix.  */
-      if (!i.tm.opcode_modifier.vex)
+      /* Since the VEX/EVEX prefix contains the implicit prefix, we
+	 don't need the explicit prefix.  */
+      if (!i.tm.opcode_modifier.vex && !i.tm.opcode_modifier.evex)
 	{
 	  switch (i.tm.opcode_length)
 	    {
@@ -6349,6 +6957,11 @@ check_prefix:
 		  abort ();
 		}
 
+	  /* For EVEX instructions i.vrex should become 0 after
+	     build_evex_prefix.  For VEX instructions upper 16 registers
+	     aren't available, so VREX should be 0.  */
+	  if (i.vrex)
+	    abort ();
 	  /* Now the VEX prefix.  */
 	  p = frag_more (i.vex.length);
 	  for (j = 0; j < i.vex.length; j++)
@@ -6364,6 +6977,11 @@ check_prefix:
 	{
 	  switch (i.tm.opcode_length)
 	    {
+	    case 4:
+	      p = frag_more (4);
+	      *p++ = (i.tm.base_opcode >> 24) & 0xff;
+	      *p++ = (i.tm.base_opcode >> 16) & 0xff;
+	      break;
 	    case 3:
 	      p = frag_more (3);
 	      *p++ = (i.tm.base_opcode >> 16) & 0xff;
@@ -6420,7 +7038,11 @@ static int
 disp_size (unsigned int n)
 {
   int size = 4;
-  if (i.types[n].bitfield.disp64)
+
+  /* Vec_Disp8 has to be 8bit.  */
+  if (i.types[n].bitfield.vec_disp8)
+    size = 1;
+  else if (i.types[n].bitfield.disp64)
     size = 8;
   else if (i.types[n].bitfield.disp8)
     size = 1;
@@ -6452,15 +7074,17 @@ output_disp (fragS *insn_start_frag, offsetT insn_start_off)
 
   for (n = 0; n < i.operands; n++)
     {
-      if (operand_type_check (i.types[n], disp))
+      if (i.types[n].bitfield.vec_disp8
+	  || operand_type_check (i.types[n], disp))
 	{
 	  if (i.op[n].disps->X_op == O_constant)
 	    {
 	      int size = disp_size (n);
-	      offsetT val;
+	      offsetT val = i.op[n].disps->X_add_number;
 
-	      val = offset_in_range (i.op[n].disps->X_add_number,
-				     size);
+	      if (i.types[n].bitfield.vec_disp8)
+		val >>= i.memshift;
+	      val = offset_in_range (val, size);
 	      p = frag_more (size);
 	      md_number_to_chars (p, val, size);
 	    }
@@ -6553,6 +7177,10 @@ output_imm (fragS *insn_start_frag, offsetT insn_start_off)
 
   for (n = 0; n < i.operands; n++)
     {
+      /* Skip SAE/RC Imm operand in EVEX.  They are already handled.  */
+      if (i.rounding && (int) n == i.rounding->operand)
+	continue;
+
       if (operand_type_check (i.types[n], imm))
 	{
 	  if (i.op[n].imms->X_op == O_constant)
@@ -7052,6 +7680,150 @@ pe_directive_secrel (int dummy ATTRIBUTE_UNUSED)
 }
 #endif
 
+/* Parse the vector operations attached to an operand: a broadcast
+   ("{1to8}" or "{1to16}"), a write mask register and the
+   zeroing-masking flag ("{z}"), each enclosed in braces.  OP_STRING
+   points to the first one; parsing stops at OP_END, or at the end of
+   the string when OP_END is NULL.  The result is recorded in
+   broadcast_op / mask_op, made visible through i.broadcast / i.mask
+   and tagged with this_operand.  Return the position just past the
+   last operation, or NULL after reporting an error with as_bad.  */
+
+static char *
+check_VecOperations (char *op_string, char *op_end)
+{
+  const reg_entry *mask;
+  const char *saved;
+  char *end_op;
+
+  while (*op_string
+	 && (op_end == NULL || op_string < op_end))
+    {
+      /* Keep the start of this operation for diagnostics.  */
+      saved = op_string;
+      if (*op_string == '{')
+	{
+	  op_string++;
+
+	  /* Check broadcasts.  */
+	  if (strncmp (op_string, "1to", 3) == 0)
+	    {
+	      int bcst_type;
+
+	      if (i.broadcast)
+		goto duplicated_vec_op;
+
+	      op_string += 3;
+	      if (*op_string == '8')
+		bcst_type = BROADCAST_1TO8;
+	      else if (*op_string == '1'
+		       && *(op_string+1) == '6')
+		{
+		  bcst_type = BROADCAST_1TO16;
+		  op_string++;
+		}
+	      else
+		{
+		  as_bad (_("Unsupported broadcast: `%s'"), saved);
+		  return NULL;
+		}
+	      op_string++;
+
+	      broadcast_op.type = bcst_type;
+	      broadcast_op.operand = this_operand;
+	      i.broadcast = &broadcast_op;
+	    }
+	  /* Check masking operation.  */
+	  else if ((mask = parse_register (op_string, &end_op)) != NULL)
+	    {
+	      /* Only a mask register other than k0 can be used for
+		 write mask; reject every other register here instead
+		 of silently taking its number as a mask number.  */
+	      if (!mask->reg_type.bitfield.regmask || mask->reg_num == 0)
+		{
+		  as_bad (_("`%s' can't be used for write mask"),
+			  op_string);
+		  return NULL;
+		}
+
+	      if (!i.mask)
+		{
+		  mask_op.mask = mask;
+		  mask_op.zeroing = 0;
+		  mask_op.operand = this_operand;
+		  i.mask = &mask_op;
+		}
+	      else
+		{
+		  if (i.mask->mask)
+		    goto duplicated_vec_op;
+
+		  i.mask->mask = mask;
+
+		  /* Only "{z}" is allowed here.  No need to check
+		     zeroing mask explicitly.  */
+		  if (i.mask->operand != this_operand)
+		    {
+		      as_bad (_("invalid write mask `%s'"), saved);
+		      return NULL;
+		    }
+		}
+
+	      op_string = end_op;
+	    }
+	  /* Check zeroing-flag for masking operation.  */
+	  else if (*op_string == 'z')
+	    {
+	      if (!i.mask)
+		{
+		  mask_op.mask = NULL;
+		  mask_op.zeroing = 1;
+		  mask_op.operand = this_operand;
+		  i.mask = &mask_op;
+		}
+	      else
+		{
+		  if (i.mask->zeroing)
+		    {
+		    duplicated_vec_op:
+		      as_bad (_("duplicated `%s'"), saved);
+		      return NULL;
+		    }
+
+		  i.mask->zeroing = 1;
+
+		  /* Only "{%k}" is allowed here.  No need to check mask
+		     register explicitly.  */
+		  if (i.mask->operand != this_operand)
+		    {
+		      as_bad (_("invalid zeroing-masking `%s'"),
+			      saved);
+		      return NULL;
+		    }
+		}
+
+	      op_string++;
+	    }
+	  else
+	    goto unknown_vec_op;
+
+	  if (*op_string != '}')
+	    {
+	      as_bad (_("missing `}' in `%s'"), saved);
+	      return NULL;
+	    }
+	  op_string++;
+	  continue;
+	}
+    unknown_vec_op:
+      /* We don't know this one.  */
+      as_bad (_("unknown vector operation: `%s'"), saved);
+      return NULL;
+    }
+
+  return op_string;
+}
+
 static int
 i386_immediate (char *imm_start)
 {
@@ -7086,6 +7848,16 @@ i386_immediate (char *imm_start)
   exp_seg = expression (exp);
 
   SKIP_WHITESPACE ();
+
+  /* Handle vector operations.  */
+  if (*input_line_pointer == '{')
+    {
+      input_line_pointer = check_VecOperations (input_line_pointer,
+						NULL);
+      if (input_line_pointer == NULL)
+	return 0;
+    }
+
   if (*input_line_pointer)
     as_bad (_("junk `%s' after expression"), input_line_pointer);
 
@@ -7578,6 +8350,7 @@ bad_address:
 	      || (i.index_reg
 		  && !i.index_reg->reg_type.bitfield.regxmm
 		  && !i.index_reg->reg_type.bitfield.regymm
+		  && !i.index_reg->reg_type.bitfield.regzmm
 		  && ((addr_mode == CODE_64BIT
 		       ? !(i.index_reg->reg_type.bitfield.reg64
 			   || i.index_reg->reg_num == RegRiz)
@@ -7605,6 +8378,73 @@ bad_address:
   return 1;
 }
 
+/* Parse a rounding control or SAE pseudo-immediate.  IMM_START must be
+   '{', one of the names in RC_NamesTable, '}' and nothing else.  On
+   success record the matched type and this_operand in rc_op, point
+   i.rounding at it, attach a constant zero Imm8 expression to the
+   operand and return 1.  Return 0 silently when IMM_START doesn't
+   begin with '{' and a known name; return 0 after as_bad for a
+   duplicate, a missing '}' or trailing junk.  */
+
+static int
+RC_SAE_immediate (const char *imm_start)
+{
+  const char *cursor = imm_start;
+  expressionS *zero_imm;
+  unsigned int idx;
+
+  if (*cursor != '{')
+    return 0;
+  cursor++;
+
+  /* Find the first table entry whose name prefixes the text.  */
+  for (idx = 0; idx < ARRAY_SIZE (RC_NamesTable); idx++)
+    if (strncmp (cursor, RC_NamesTable[idx].name,
+		 RC_NamesTable[idx].len) == 0)
+      break;
+
+  /* No known name after the brace: not an RC/SAE operand.  */
+  if (idx == ARRAY_SIZE (RC_NamesTable))
+    return 0;
+
+  /* A rounding operation has already been recorded.  */
+  if (i.rounding)
+    {
+      as_bad (_("duplicated `%s'"), imm_start);
+      return 0;
+    }
+
+  rc_op.type = RC_NamesTable[idx].type;
+  rc_op.operand = this_operand;
+  i.rounding = &rc_op;
+  cursor += RC_NamesTable[idx].len;
+
+  if (*cursor != '}')
+    {
+      as_bad (_("Missing '}': '%s'"), imm_start);
+      return 0;
+    }
+  cursor++;
+
+  /* RC/SAE immediate string should contain nothing more.  */
+  if (*cursor != '\0')
+    {
+      as_bad (_("Junk after '}': '%s'"), imm_start);
+      return 0;
+    }
+
+  /* Give the operand a constant zero expression typed as Imm8.  */
+  zero_imm = &im_expressions[i.imm_operands++];
+  zero_imm->X_op = O_constant;
+  zero_imm->X_add_number = 0;
+  zero_imm->X_add_symbol = (symbolS *) 0;
+  zero_imm->X_op_symbol = (symbolS *) 0;
+  i.op[this_operand].imms = zero_imm;
+
+  i.types[this_operand].bitfield.imm8 = 1;
+  return 1;
+}
+
 /* Parse OPERAND_STRING into the i386_insn structure I.  Returns zero
    on error.  */
 
@@ -7687,6 +8521,15 @@ i386_att_operand (char *operand_string)
 	    }
 	  goto do_memory_reference;
 	}
+
+      /* Handle vector operations.  */
+      if (*op_string == '{')
+	{
+	  op_string = check_VecOperations (op_string, NULL);
+	  if (op_string == NULL)
+	    return 0;
+	}
+
       if (*op_string)
 	{
 	  as_bad (_("junk `%s' after register"), op_string);
@@ -7716,6 +8559,11 @@ i386_att_operand (char *operand_string)
       if (!i386_immediate (op_string))
 	return 0;
     }
+  else if (RC_SAE_immediate (operand_string))
+    {
+      /* If it is a RC or SAE immediate, do nothing.  */
+      ;
+    }
   else if (is_digit_char (*op_string)
 	   || is_identifier_char (*op_string)
 	   || *op_string == '(')
@@ -7726,6 +8574,7 @@ i386_att_operand (char *operand_string)
       /* Start and end of displacement string expression (if found).  */
       char *displacement_string_start;
       char *displacement_string_end;
+      char *vop_start;
 
     do_memory_reference:
       if ((i.mem_operands == 1
@@ -7743,6 +8592,15 @@ i386_att_operand (char *operand_string)
 	 after the '('.  */
       base_string = op_string + strlen (op_string);
 
+      /* Handle vector operations.  */
+      vop_start = strchr (op_string, '{');
+      if (vop_start && vop_start < base_string)
+	{
+	  if (check_VecOperations (vop_start, base_string) == NULL)
+	    return 0;
+	  base_string = vop_start;
+	}
+
       --base_string;
       if (is_space_char (*base_string))
 	--base_string;
@@ -8446,11 +9304,26 @@ parse_real_register (char *reg_string, char **end_op)
   if (r->reg_type.bitfield.regymm && !cpu_arch_flags.bitfield.cpuavx)
     return (const reg_entry *) NULL;
 
+  if ((r->reg_type.bitfield.regzmm || r->reg_type.bitfield.regmask)
+       && !cpu_arch_flags.bitfield.cpuavx512f)
+    return (const reg_entry *) NULL;
+
   /* Don't allow fake index register unless allow_index_reg isn't 0. */
   if (!allow_index_reg
       && (r->reg_num == RegEiz || r->reg_num == RegRiz))
     return (const reg_entry *) NULL;
 
+  /* The upper 16 vector registers are only available with VREX in
+     64-bit mode.  */
+  if ((r->reg_flags & RegVRex))
+    {
+      if (!cpu_arch_flags.bitfield.cpuvrex
+	  || flag_code != CODE_64BIT)
+	return (const reg_entry *) NULL;
+
+      i.need_vrex = 1;
+    }
+
   if (((r->reg_flags & (RegRex64 | RegRex))
        || r->reg_type.bitfield.reg64)
       && (!cpu_arch_flags.bitfield.cpulm
@@ -8583,6 +9456,8 @@ const char *md_shortopts = "qn";
 #define OPTION_MAVXSCALAR (OPTION_MD_BASE + 13)
 #define OPTION_X32 (OPTION_MD_BASE + 14)
 #define OPTION_MADD_BND_PREFIX (OPTION_MD_BASE + 15)
+#define OPTION_MEVEXLIG (OPTION_MD_BASE + 16)
+#define OPTION_MEVEXWIG (OPTION_MD_BASE + 17)
 
 struct option md_longopts[] =
 {
@@ -8607,6 +9482,8 @@ struct option md_longopts[] =
   {"moperand-check", required_argument, NULL, OPTION_MOPERAND_CHECK},
   {"mavxscalar", required_argument, NULL, OPTION_MAVXSCALAR},
   {"madd-bnd-prefix", no_argument, NULL, OPTION_MADD_BND_PREFIX},
+  {"mevexlig", required_argument, NULL, OPTION_MEVEXLIG},
+  {"mevexwig", required_argument, NULL, OPTION_MEVEXWIG},
   {NULL, no_argument, NULL, 0}
 };
 size_t md_longopts_size = sizeof (md_longopts);
@@ -8868,6 +9745,26 @@ md_parse_option (int c, char *arg)
       add_bnd_prefix = 1;
       break;
 
+    case OPTION_MEVEXLIG:
+      if (strcmp (arg, "128") == 0)
+	evexlig = evexl128;
+      else if (strcmp (arg, "256") == 0)
+	evexlig = evexl256;
+      else  if (strcmp (arg, "512") == 0)
+	evexlig = evexl512;
+      else
+	as_fatal (_("invalid -mevexlig= option: `%s'"), arg);
+      break;
+
+    case OPTION_MEVEXWIG:
+      if (strcmp (arg, "0") == 0)
+	evexwig = evexw0;
+      else if (strcmp (arg, "1") == 0)
+	evexwig = evexw1;
+      else
+	as_fatal (_("invalid -mevexwig= option: `%s'"), arg);
+      break;
+
     default:
       return 0;
     }
@@ -9003,6 +9900,12 @@ md_show_usage (FILE *stream)
   -mavxscalar=[128|256]   encode scalar AVX instructions with specific vector\n\
                            length\n"));
   fprintf (stream, _("\
+  -mevexlig=[128|256|512] encode scalar EVEX instructions with specific vector\n\
+                           length\n"));
+  fprintf (stream, _("\
+  -mevexwig=[0|1]         encode EVEX instructions with specific EVEX.W value\n\
+                           for EVEX.W bit ignored instructions\n"));
+  fprintf (stream, _("\
   -mmnemonic=[att|intel]  use AT&T/Intel mnemonic\n"));
   fprintf (stream, _("\
   -msyntax=[att|intel]    use AT&T/Intel syntax\n"));