Diffstat (limited to 'deps/v8/src/codegen/x64')
-rw-r--r-- | deps/v8/src/codegen/x64/assembler-x64-inl.h | 2
-rw-r--r-- | deps/v8/src/codegen/x64/assembler-x64.cc | 37
-rw-r--r-- | deps/v8/src/codegen/x64/assembler-x64.h | 9
-rw-r--r-- | deps/v8/src/codegen/x64/interface-descriptors-x64.cc | 20
-rw-r--r-- | deps/v8/src/codegen/x64/macro-assembler-x64.cc | 518
-rw-r--r-- | deps/v8/src/codegen/x64/macro-assembler-x64.h | 66
-rw-r--r-- | deps/v8/src/codegen/x64/sse-instr.h | 7
7 files changed, 603 insertions, 56 deletions
diff --git a/deps/v8/src/codegen/x64/assembler-x64-inl.h b/deps/v8/src/codegen/x64/assembler-x64-inl.h index 26e558b4f9..836566a1ac 100644 --- a/deps/v8/src/codegen/x64/assembler-x64-inl.h +++ b/deps/v8/src/codegen/x64/assembler-x64-inl.h @@ -19,7 +19,7 @@ bool CpuFeatures::SupportsOptimizer() { return true; } bool CpuFeatures::SupportsWasmSimd128() { if (IsSupported(SSE4_1)) return true; - if (FLAG_wasm_simd_ssse3_codegen) return true; + if (FLAG_wasm_simd_ssse3_codegen && IsSupported(SSSE3)) return true; return false; } diff --git a/deps/v8/src/codegen/x64/assembler-x64.cc b/deps/v8/src/codegen/x64/assembler-x64.cc index e5baf0aa04..18330a9126 100644 --- a/deps/v8/src/codegen/x64/assembler-x64.cc +++ b/deps/v8/src/codegen/x64/assembler-x64.cc @@ -108,6 +108,12 @@ void CpuFeatures::ProbeImpl(bool cross_compile) { } else if (strcmp(FLAG_mcpu, "atom") == 0) { supported_ |= 1u << ATOM; } + + // Set a static value on whether Simd is supported. + // This variable is only used for certain archs to query SupportWasmSimd128() + // at runtime in builtins using an extern ref. Other callers should use + // CpuFeatures::SupportWasmSimd128(). + CpuFeatures::supports_wasm_simd_128_ = CpuFeatures::SupportsWasmSimd128(); } void CpuFeatures::PrintTarget() {} @@ -1188,6 +1194,16 @@ void Assembler::cpuid() { emit(0xA2); } +void Assembler::prefetch(Operand src, int level) { + DCHECK(is_uint2(level)); + EnsureSpace ensure_space(this); + emit(0x0F); + emit(0x18); + // Emit hint number in Reg position of RegR/M. + XMMRegister code = XMMRegister::from_code(level); + emit_sse_operand(code, src); +} + void Assembler::cqo() { EnsureSpace ensure_space(this); emit_rex_64(); @@ -2919,6 +2935,15 @@ void Assembler::movaps(XMMRegister dst, XMMRegister src) { } } +void Assembler::movaps(XMMRegister dst, Operand src) { + DCHECK(!IsEnabled(AVX)); + EnsureSpace ensure_space(this); + emit_optional_rex_32(dst, src); + emit(0x0F); + emit(0x28); + emit_sse_operand(dst, src); +} + void Assembler::shufps(XMMRegister dst, XMMRegister src, byte imm8) { DCHECK(is_uint8(imm8)); EnsureSpace ensure_space(this); @@ -3088,6 +3113,10 @@ void Assembler::cmppd(XMMRegister dst, Operand src, int8_t cmp) { emit(cmp); } +void Assembler::cvtdq2pd(XMMRegister dst, XMMRegister src) { + sse2_instr(dst, src, 0xF3, 0x0F, 0xE6); +} + void Assembler::cvttss2si(Register dst, Operand src) { DCHECK(!IsEnabled(AVX)); EnsureSpace ensure_space(this); @@ -3503,6 +3532,14 @@ void Assembler::vmovq(Register dst, XMMRegister src) { emit_sse_operand(src, dst); } +void Assembler::vmovdqa(XMMRegister dst, Operand src) { + DCHECK(IsEnabled(AVX)); + EnsureSpace ensure_space(this); + emit_vex_prefix(dst, xmm0, src, kL128, k66, k0F, kWIG); + emit(0x6F); + emit_sse_operand(dst, src); +} + void Assembler::vmovdqa(XMMRegister dst, XMMRegister src) { DCHECK(IsEnabled(AVX)); EnsureSpace ensure_space(this); diff --git a/deps/v8/src/codegen/x64/assembler-x64.h b/deps/v8/src/codegen/x64/assembler-x64.h index a26e98d8a5..c1dc4a3db1 100644 --- a/deps/v8/src/codegen/x64/assembler-x64.h +++ b/deps/v8/src/codegen/x64/assembler-x64.h @@ -786,6 +786,7 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase { void ret(int imm16); void ud2(); void setcc(Condition cc, Register reg); + void prefetch(Operand src, int level); void pblendw(XMMRegister dst, Operand src, uint8_t mask); void pblendw(XMMRegister dst, XMMRegister src, uint8_t mask); @@ -920,6 +921,7 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase { void ucomiss(XMMRegister dst, XMMRegister src); void 
ucomiss(XMMRegister dst, Operand src); void movaps(XMMRegister dst, XMMRegister src); + void movaps(XMMRegister dst, Operand src); // Don't use this unless it's important to keep the // top half of the destination register unchanged. @@ -1205,6 +1207,8 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase { void movupd(XMMRegister dst, Operand src); void movupd(Operand dst, XMMRegister src); + void cvtdq2pd(XMMRegister dst, XMMRegister src); + void cvttsd2si(Register dst, Operand src); void cvttsd2si(Register dst, XMMRegister src); void cvttss2siq(Register dst, XMMRegister src); @@ -1330,6 +1334,7 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase { } void vmovsd(XMMRegister dst, Operand src) { vsd(0x10, dst, xmm0, src); } void vmovsd(Operand dst, XMMRegister src) { vsd(0x11, src, xmm0, dst); } + void vmovdqa(XMMRegister dst, Operand src); void vmovdqa(XMMRegister dst, XMMRegister src); void vmovdqu(XMMRegister dst, Operand src); void vmovdqu(Operand dst, XMMRegister src); @@ -1399,6 +1404,9 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase { void vmovhlps(XMMRegister dst, XMMRegister src1, XMMRegister src2) { vinstr(0x12, dst, src1, src2, kNone, k0F, kWIG); } + void vcvtdq2pd(XMMRegister dst, XMMRegister src) { + vinstr(0xe6, dst, xmm0, src, kF3, k0F, kWIG); + } void vcvtss2sd(XMMRegister dst, XMMRegister src1, XMMRegister src2) { vinstr(0x5a, dst, src1, src2, kF3, k0F, kWIG); } @@ -1513,6 +1521,7 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase { } void vmovaps(XMMRegister dst, XMMRegister src) { vps(0x28, dst, xmm0, src); } + void vmovaps(XMMRegister dst, Operand src) { vps(0x28, dst, xmm0, src); } void vmovups(XMMRegister dst, XMMRegister src) { vps(0x10, dst, xmm0, src); } void vmovups(XMMRegister dst, Operand src) { vps(0x10, dst, xmm0, src); } void vmovups(Operand dst, XMMRegister src) { vps(0x11, src, xmm0, dst); } diff --git a/deps/v8/src/codegen/x64/interface-descriptors-x64.cc b/deps/v8/src/codegen/x64/interface-descriptors-x64.cc index 5b35b5817f..4029b56d2b 100644 --- a/deps/v8/src/codegen/x64/interface-descriptors-x64.cc +++ b/deps/v8/src/codegen/x64/interface-descriptors-x64.cc @@ -93,6 +93,11 @@ const Register ApiGetterDescriptor::CallbackRegister() { return rbx; } const Register GrowArrayElementsDescriptor::ObjectRegister() { return rax; } const Register GrowArrayElementsDescriptor::KeyRegister() { return rbx; } +const Register BaselineLeaveFrameDescriptor::ParamsSizeRegister() { + return rbx; +} +const Register BaselineLeaveFrameDescriptor::WeightRegister() { return rcx; } + void TypeofDescriptor::InitializePlatformSpecific( CallInterfaceDescriptorData* data) { Register registers[] = {rbx}; @@ -216,20 +221,21 @@ void CompareDescriptor::InitializePlatformSpecific( data->InitializePlatformSpecific(arraysize(registers), registers); } +void Compare_BaselineDescriptor::InitializePlatformSpecific( + CallInterfaceDescriptorData* data) { + Register registers[] = {rdx, rax, rbx}; + data->InitializePlatformSpecific(arraysize(registers), registers); +} + void BinaryOpDescriptor::InitializePlatformSpecific( CallInterfaceDescriptorData* data) { Register registers[] = {rdx, rax}; data->InitializePlatformSpecific(arraysize(registers), registers); } -void ArgumentsAdaptorDescriptor::InitializePlatformSpecific( +void BinaryOp_BaselineDescriptor::InitializePlatformSpecific( CallInterfaceDescriptorData* data) { - Register registers[] = { - rdi, // JSFunction - rdx, // the new target - rax, // actual number of arguments - rbx, // expected number of 
arguments - }; + Register registers[] = {rdx, rax, rbx}; data->InitializePlatformSpecific(arraysize(registers), registers); } diff --git a/deps/v8/src/codegen/x64/macro-assembler-x64.cc b/deps/v8/src/codegen/x64/macro-assembler-x64.cc index e696e8b66e..b91e8319ac 100644 --- a/deps/v8/src/codegen/x64/macro-assembler-x64.cc +++ b/deps/v8/src/codegen/x64/macro-assembler-x64.cc @@ -2,6 +2,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. +#include <cstdint> #if V8_TARGET_ARCH_X64 #include "src/base/bits.h" @@ -203,6 +204,15 @@ void TurboAssembler::LoadTaggedPointerField(Register destination, } } +void TurboAssembler::LoadTaggedSignedField(Register destination, + Operand field_operand) { + if (COMPRESS_POINTERS_BOOL) { + DecompressTaggedSigned(destination, field_operand); + } else { + mov_tagged(destination, field_operand); + } +} + void TurboAssembler::LoadAnyTaggedField(Register destination, Operand field_operand) { if (COMPRESS_POINTERS_BOOL) { @@ -256,6 +266,16 @@ void TurboAssembler::StoreTaggedField(Operand dst_field_operand, } } +void TurboAssembler::StoreTaggedSignedField(Operand dst_field_operand, + Smi value) { + if (SmiValuesAre32Bits()) { + movl(Operand(dst_field_operand, kSmiShift / kBitsPerByte), + Immediate(value.value())); + } else { + StoreTaggedField(dst_field_operand, Immediate(value)); + } +} + void TurboAssembler::DecompressTaggedSigned(Register destination, Operand field_operand) { RecordComment("[ DecompressTaggedSigned"); @@ -694,6 +714,16 @@ int TurboAssembler::PopCallerSaved(SaveFPRegsMode fp_mode, Register exclusion1, return bytes; } +void TurboAssembler::Movdqa(XMMRegister dst, Operand src) { + // See comments in Movdqa(XMMRegister, XMMRegister). + if (CpuFeatures::IsSupported(AVX)) { + CpuFeatureScope avx_scope(this, AVX); + vmovdqa(dst, src); + } else { + movaps(dst, src); + } +} + void TurboAssembler::Movdqa(XMMRegister dst, XMMRegister src) { if (CpuFeatures::IsSupported(AVX)) { CpuFeatureScope avx_scope(this, AVX); @@ -1078,17 +1108,7 @@ void TurboAssembler::Set(Operand dst, intptr_t x) { // Smi tagging, untagging and tag detection. 
Register TurboAssembler::GetSmiConstant(Smi source) { - STATIC_ASSERT(kSmiTag == 0); - int value = source.value(); - if (value == 0) { - xorl(kScratchRegister, kScratchRegister); - return kScratchRegister; - } - if (SmiValuesAre32Bits()) { - Move(kScratchRegister, source); - } else { - movl(kScratchRegister, Immediate(source)); - } + Move(kScratchRegister, source); return kScratchRegister; } @@ -1097,8 +1117,17 @@ void TurboAssembler::Move(Register dst, Smi source) { int value = source.value(); if (value == 0) { xorl(dst, dst); - } else { + } else if (SmiValuesAre32Bits() || value < 0) { Move(dst, source.ptr(), RelocInfo::NONE); + } else { + uint32_t uvalue = static_cast<uint32_t>(source.ptr()); + if (uvalue <= 0xFF) { + // Emit shorter instructions for small Smis + xorl(dst, dst); + movb(dst, Immediate(uvalue)); + } else { + movl(dst, Immediate(uvalue)); + } } } @@ -1340,6 +1369,9 @@ void TurboAssembler::Move(Register dst, Register src) { } } +void TurboAssembler::Move(Register dst, Operand src) { movq(dst, src); } +void TurboAssembler::Move(Register dst, Immediate src) { movl(dst, src); } + void TurboAssembler::Move(XMMRegister dst, XMMRegister src) { if (dst != src) { Movaps(dst, src); @@ -1594,6 +1626,7 @@ void TurboAssembler::Jump(Handle<Code> code_object, RelocInfo::Mode rmode, Address entry = d.InstructionStartOfBuiltin(builtin_index); Move(kScratchRegister, entry, RelocInfo::OFF_HEAP_TARGET); jmp(kScratchRegister); + if (FLAG_code_comments) RecordComment("]"); bind(&skip); return; } @@ -1676,6 +1709,18 @@ void TurboAssembler::CallBuiltin(int builtin_index) { Address entry = d.InstructionStartOfBuiltin(builtin_index); Move(kScratchRegister, entry, RelocInfo::OFF_HEAP_TARGET); call(kScratchRegister); + if (FLAG_code_comments) RecordComment("]"); +} + +void TurboAssembler::TailCallBuiltin(int builtin_index) { + DCHECK(Builtins::IsBuiltinId(builtin_index)); + RecordCommentForOffHeapTrampoline(builtin_index); + CHECK_NE(builtin_index, Builtins::kNoBuiltinId); + EmbeddedData d = EmbeddedData::FromBlob(); + Address entry = d.InstructionStartOfBuiltin(builtin_index); + Move(kScratchRegister, entry, RelocInfo::OFF_HEAP_TARGET); + jmp(kScratchRegister); + if (FLAG_code_comments) RecordComment("]"); } void TurboAssembler::LoadCodeObjectEntry(Register destination, @@ -1726,9 +1771,17 @@ void TurboAssembler::CallCodeObject(Register code_object) { call(code_object); } -void TurboAssembler::JumpCodeObject(Register code_object) { +void TurboAssembler::JumpCodeObject(Register code_object, JumpMode jump_mode) { LoadCodeObjectEntry(code_object, code_object); - jmp(code_object); + switch (jump_mode) { + case JumpMode::kJump: + jmp(code_object); + return; + case JumpMode::kPushAndReturn: + pushq(code_object); + Ret(); + return; + } } void TurboAssembler::RetpolineCall(Register reg) { @@ -1770,29 +1823,69 @@ void TurboAssembler::RetpolineJump(Register reg) { ret(0); } +void TurboAssembler::Pmaddwd(XMMRegister dst, XMMRegister src1, Operand src2) { + if (CpuFeatures::IsSupported(AVX)) { + CpuFeatureScope avx_scope(this, AVX); + vpmaddwd(dst, src1, src2); + } else { + if (dst != src1) { + movaps(dst, src1); + } + pmaddwd(dst, src2); + } +} + void TurboAssembler::Pmaddwd(XMMRegister dst, XMMRegister src1, XMMRegister src2) { if (CpuFeatures::IsSupported(AVX)) { CpuFeatureScope avx_scope(this, AVX); vpmaddwd(dst, src1, src2); } else { - DCHECK_EQ(dst, src1); + if (dst != src1) { + movaps(dst, src1); + } pmaddwd(dst, src2); } } void TurboAssembler::Pmaddubsw(XMMRegister dst, XMMRegister src1, + Operand src2) 
{ + if (CpuFeatures::IsSupported(AVX)) { + CpuFeatureScope avx_scope(this, AVX); + vpmaddubsw(dst, src1, src2); + } else { + CpuFeatureScope ssse3_scope(this, SSSE3); + if (dst != src1) { + movaps(dst, src1); + } + pmaddubsw(dst, src2); + } +} + +void TurboAssembler::Pmaddubsw(XMMRegister dst, XMMRegister src1, XMMRegister src2) { if (CpuFeatures::IsSupported(AVX)) { CpuFeatureScope avx_scope(this, AVX); vpmaddubsw(dst, src1, src2); } else { CpuFeatureScope ssse3_scope(this, SSSE3); - DCHECK_EQ(dst, src1); + if (dst != src1) { + movaps(dst, src1); + } pmaddubsw(dst, src2); } } +void TurboAssembler::Unpcklps(XMMRegister dst, XMMRegister src1, Operand src2) { + if (CpuFeatures::IsSupported(AVX)) { + CpuFeatureScope avx_scope(this, AVX); + vunpcklps(dst, src1, src2); + } else { + DCHECK_EQ(dst, src1); + unpcklps(dst, src2); + } +} + void TurboAssembler::Shufps(XMMRegister dst, XMMRegister src1, XMMRegister src2, byte imm8) { if (CpuFeatures::IsSupported(AVX)) { @@ -2039,10 +2132,12 @@ void TurboAssembler::Pmulhrsw(XMMRegister dst, XMMRegister src1, void TurboAssembler::I32x4SConvertI16x8High(XMMRegister dst, XMMRegister src) { if (CpuFeatures::IsSupported(AVX)) { CpuFeatureScope avx_scope(this, AVX); - // Copy top half (64-bit) of src into both halves of dst. - vpunpckhqdq(dst, src, src); - vpmovsxwd(dst, dst); + // src = |a|b|c|d|e|f|g|h| (high) + // dst = |e|e|f|f|g|g|h|h| + vpunpckhwd(dst, src, src); + vpsrad(dst, dst, 16); } else { + CpuFeatureScope sse_scope(this, SSE4_1); if (dst == src) { // 2 bytes shorter than pshufd, but has depdency on dst. movhlps(dst, src); @@ -2065,6 +2160,7 @@ void TurboAssembler::I32x4UConvertI16x8High(XMMRegister dst, XMMRegister src) { vpxor(scratch, scratch, scratch); vpunpckhwd(dst, src, scratch); } else { + CpuFeatureScope sse_scope(this, SSE4_1); if (dst == src) { // xorps can be executed on more ports than pshufd. xorps(kScratchDoubleReg, kScratchDoubleReg); @@ -2080,10 +2176,12 @@ void TurboAssembler::I32x4UConvertI16x8High(XMMRegister dst, XMMRegister src) { void TurboAssembler::I16x8SConvertI8x16High(XMMRegister dst, XMMRegister src) { if (CpuFeatures::IsSupported(AVX)) { CpuFeatureScope avx_scope(this, AVX); - // Copy top half (64-bit) of src into both halves of dst. - vpunpckhqdq(dst, src, src); - vpmovsxbw(dst, dst); + // src = |a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p| (high) + // dst = |i|i|j|j|k|k|l|l|m|m|n|n|o|o|p|p| + vpunpckhbw(dst, src, src); + vpsraw(dst, dst, 8); } else { + CpuFeatureScope sse_scope(this, SSE4_1); if (dst == src) { // 2 bytes shorter than pshufd, but has depdency on dst. movhlps(dst, src); @@ -2111,6 +2209,7 @@ void TurboAssembler::I16x8UConvertI8x16High(XMMRegister dst, XMMRegister src) { xorps(kScratchDoubleReg, kScratchDoubleReg); punpckhbw(dst, kScratchDoubleReg); } else { + CpuFeatureScope sse_scope(this, SSE4_1); // No dependency on dst. 
pshufd(dst, src, 0xEE); pmovzxbw(dst, dst); @@ -2118,6 +2217,30 @@ void TurboAssembler::I16x8UConvertI8x16High(XMMRegister dst, XMMRegister src) { } } +void TurboAssembler::I64x2SConvertI32x4High(XMMRegister dst, XMMRegister src) { + if (CpuFeatures::IsSupported(AVX)) { + CpuFeatureScope avx_scope(this, AVX); + vpunpckhqdq(dst, src, src); + vpmovsxdq(dst, dst); + } else { + CpuFeatureScope sse_scope(this, SSE4_1); + pshufd(dst, src, 0xEE); + pmovsxdq(dst, dst); + } +} + +void TurboAssembler::I64x2UConvertI32x4High(XMMRegister dst, XMMRegister src) { + if (CpuFeatures::IsSupported(AVX)) { + CpuFeatureScope avx_scope(this, AVX); + vpxor(kScratchDoubleReg, kScratchDoubleReg, kScratchDoubleReg); + vpunpckhdq(dst, src, kScratchDoubleReg); + } else { + CpuFeatureScope sse_scope(this, SSE4_1); + pshufd(dst, src, 0xEE); + pmovzxdq(dst, dst); + } +} + // 1. Unpack src0, src0 into even-number elements of scratch. // 2. Unpack src1, src1 into even-number elements of dst. // 3. Multiply 1. with 2. @@ -2189,6 +2312,313 @@ void TurboAssembler::I16x8ExtMul(XMMRegister dst, XMMRegister src1, } } +void TurboAssembler::I16x8Q15MulRSatS(XMMRegister dst, XMMRegister src1, + XMMRegister src2) { + // k = i16x8.splat(0x8000) + Pcmpeqd(kScratchDoubleReg, kScratchDoubleReg); + Psllw(kScratchDoubleReg, byte{15}); + + Pmulhrsw(dst, src1, src2); + Pcmpeqw(kScratchDoubleReg, dst); + Pxor(dst, kScratchDoubleReg); +} + +void TurboAssembler::S128Store32Lane(Operand dst, XMMRegister src, + uint8_t laneidx) { + if (laneidx == 0) { + Movss(dst, src); + } else { + DCHECK_GE(3, laneidx); + Extractps(dst, src, laneidx); + } +} + +void TurboAssembler::S128Store64Lane(Operand dst, XMMRegister src, + uint8_t laneidx) { + if (laneidx == 0) { + Movlps(dst, src); + } else { + DCHECK_EQ(1, laneidx); + Movhps(dst, src); + } +} + +void TurboAssembler::I8x16Popcnt(XMMRegister dst, XMMRegister src, + XMMRegister tmp) { + DCHECK_NE(dst, tmp); + DCHECK_NE(src, tmp); + if (CpuFeatures::IsSupported(AVX)) { + CpuFeatureScope avx_scope(this, AVX); + vmovdqa(tmp, ExternalReferenceAsOperand( + ExternalReference::address_of_wasm_i8x16_splat_0x0f())); + vpandn(kScratchDoubleReg, tmp, src); + vpand(dst, tmp, src); + vmovdqa(tmp, ExternalReferenceAsOperand( + ExternalReference::address_of_wasm_i8x16_popcnt_mask())); + vpsrlw(kScratchDoubleReg, kScratchDoubleReg, 4); + vpshufb(dst, tmp, dst); + vpshufb(kScratchDoubleReg, tmp, kScratchDoubleReg); + vpaddb(dst, dst, kScratchDoubleReg); + } else if (CpuFeatures::IsSupported(ATOM)) { + // Pre-Goldmont low-power Intel microarchitectures have very slow + // PSHUFB instruction, thus use PSHUFB-free divide-and-conquer + // algorithm on these processors. ATOM CPU feature captures exactly + // the right set of processors. 
+ xorps(tmp, tmp); + pavgb(tmp, src); + if (dst != src) { + movaps(dst, src); + } + andps(tmp, ExternalReferenceAsOperand( + ExternalReference::address_of_wasm_i8x16_splat_0x55())); + psubb(dst, tmp); + Operand splat_0x33 = ExternalReferenceAsOperand( + ExternalReference::address_of_wasm_i8x16_splat_0x33()); + movaps(tmp, dst); + andps(dst, splat_0x33); + psrlw(tmp, 2); + andps(tmp, splat_0x33); + paddb(dst, tmp); + movaps(tmp, dst); + psrlw(dst, 4); + paddb(dst, tmp); + andps(dst, ExternalReferenceAsOperand( + ExternalReference::address_of_wasm_i8x16_splat_0x0f())); + } else { + movaps(tmp, ExternalReferenceAsOperand( + ExternalReference::address_of_wasm_i8x16_splat_0x0f())); + Operand mask = ExternalReferenceAsOperand( + ExternalReference::address_of_wasm_i8x16_popcnt_mask()); + Move(kScratchDoubleReg, tmp); + andps(tmp, src); + andnps(kScratchDoubleReg, src); + psrlw(kScratchDoubleReg, 4); + movaps(dst, mask); + pshufb(dst, tmp); + movaps(tmp, mask); + pshufb(tmp, kScratchDoubleReg); + paddb(dst, tmp); + } +} + +void TurboAssembler::F64x2ConvertLowI32x4U(XMMRegister dst, XMMRegister src) { + // dst = [ src_low, 0x43300000, src_high, 0x4330000 ]; + // 0x43300000'00000000 is a special double where the significand bits + // precisely represents all uint32 numbers. + Unpcklps(dst, src, + ExternalReferenceAsOperand( + ExternalReference:: + address_of_wasm_f64x2_convert_low_i32x4_u_int_mask())); + Subpd(dst, ExternalReferenceAsOperand( + ExternalReference::address_of_wasm_double_2_power_52())); +} + +void TurboAssembler::I32x4TruncSatF64x2SZero(XMMRegister dst, XMMRegister src) { + if (CpuFeatures::IsSupported(AVX)) { + CpuFeatureScope avx_scope(this, AVX); + XMMRegister original_dst = dst; + // Make sure we don't overwrite src. + if (dst == src) { + DCHECK_NE(src, kScratchDoubleReg); + dst = kScratchDoubleReg; + } + // dst = 0 if src == NaN, else all ones. + vcmpeqpd(dst, src, src); + // dst = 0 if src == NaN, else INT32_MAX as double. + vandpd(dst, dst, + ExternalReferenceAsOperand( + ExternalReference::address_of_wasm_int32_max_as_double())); + // dst = 0 if src == NaN, src is saturated to INT32_MAX as double. + vminpd(dst, src, dst); + // Values > INT32_MAX already saturated, values < INT32_MIN raises an + // exception, which is masked and returns 0x80000000. + vcvttpd2dq(dst, dst); + if (original_dst != dst) { + Move(original_dst, dst); + } + } else { + if (dst != src) { + Move(dst, src); + } + Move(kScratchDoubleReg, dst); + cmpeqpd(kScratchDoubleReg, dst); + andps(kScratchDoubleReg, + ExternalReferenceAsOperand( + ExternalReference::address_of_wasm_int32_max_as_double())); + minpd(dst, kScratchDoubleReg); + cvttpd2dq(dst, dst); + } +} + +void TurboAssembler::I32x4TruncSatF64x2UZero(XMMRegister dst, XMMRegister src) { + if (CpuFeatures::IsSupported(AVX)) { + CpuFeatureScope avx_scope(this, AVX); + vxorpd(kScratchDoubleReg, kScratchDoubleReg, kScratchDoubleReg); + // Saturate to 0. + vmaxpd(dst, src, kScratchDoubleReg); + // Saturate to UINT32_MAX. + vminpd(dst, dst, + ExternalReferenceAsOperand( + ExternalReference::address_of_wasm_uint32_max_as_double())); + // Truncate. + vroundpd(dst, dst, kRoundToZero); + // Add to special double where significant bits == uint32. + vaddpd(dst, dst, + ExternalReferenceAsOperand( + ExternalReference::address_of_wasm_double_2_power_52())); + // Extract low 32 bits of each double's significand, zero top lanes. 
+ // dst = [dst[0], dst[2], 0, 0] + vshufps(dst, dst, kScratchDoubleReg, 0x88); + } else { + CpuFeatureScope scope(this, SSE4_1); + if (dst != src) { + Move(dst, src); + } + xorps(kScratchDoubleReg, kScratchDoubleReg); + maxpd(dst, kScratchDoubleReg); + minpd(dst, ExternalReferenceAsOperand( + ExternalReference::address_of_wasm_uint32_max_as_double())); + roundpd(dst, dst, kRoundToZero); + addpd(dst, ExternalReferenceAsOperand( + ExternalReference::address_of_wasm_double_2_power_52())); + shufps(dst, kScratchDoubleReg, 0x88); + } +} + +void TurboAssembler::I64x2Abs(XMMRegister dst, XMMRegister src) { + if (CpuFeatures::IsSupported(AVX)) { + XMMRegister tmp = dst == src ? kScratchDoubleReg : dst; + CpuFeatureScope avx_scope(this, AVX); + vpxor(tmp, tmp, tmp); + vpsubq(tmp, tmp, src); + vblendvpd(dst, src, tmp, src); + } else { + CpuFeatureScope sse_scope(this, SSE3); + movshdup(kScratchDoubleReg, src); + if (dst != src) { + movaps(dst, src); + } + psrad(kScratchDoubleReg, 31); + xorps(dst, kScratchDoubleReg); + psubq(dst, kScratchDoubleReg); + } +} + +void TurboAssembler::I64x2GtS(XMMRegister dst, XMMRegister src0, + XMMRegister src1) { + if (CpuFeatures::IsSupported(AVX)) { + CpuFeatureScope avx_scope(this, AVX); + vpcmpgtq(dst, src0, src1); + } else if (CpuFeatures::IsSupported(SSE4_2)) { + CpuFeatureScope sse_scope(this, SSE4_2); + DCHECK_EQ(dst, src0); + pcmpgtq(dst, src1); + } else { + DCHECK_NE(dst, src0); + DCHECK_NE(dst, src1); + movdqa(dst, src1); + movdqa(kScratchDoubleReg, src0); + psubq(dst, src0); + pcmpeqd(kScratchDoubleReg, src1); + pand(dst, kScratchDoubleReg); + movdqa(kScratchDoubleReg, src0); + pcmpgtd(kScratchDoubleReg, src1); + por(dst, kScratchDoubleReg); + pshufd(dst, dst, 0xF5); + } +} + +void TurboAssembler::I64x2GeS(XMMRegister dst, XMMRegister src0, + XMMRegister src1) { + if (CpuFeatures::IsSupported(AVX)) { + CpuFeatureScope avx_scope(this, AVX); + vpcmpgtq(dst, src1, src0); + vpcmpeqd(kScratchDoubleReg, kScratchDoubleReg, kScratchDoubleReg); + vpxor(dst, dst, kScratchDoubleReg); + } else if (CpuFeatures::IsSupported(SSE4_2)) { + CpuFeatureScope sse_scope(this, SSE4_2); + DCHECK_NE(dst, src0); + if (dst != src1) { + movdqa(dst, src1); + } + pcmpgtq(dst, src0); + pcmpeqd(kScratchDoubleReg, kScratchDoubleReg); + pxor(dst, kScratchDoubleReg); + } else { + DCHECK_NE(dst, src0); + DCHECK_NE(dst, src1); + movdqa(dst, src0); + movdqa(kScratchDoubleReg, src1); + psubq(dst, src1); + pcmpeqd(kScratchDoubleReg, src0); + pand(dst, kScratchDoubleReg); + movdqa(kScratchDoubleReg, src1); + pcmpgtd(kScratchDoubleReg, src0); + por(dst, kScratchDoubleReg); + pshufd(dst, dst, 0xF5); + pcmpeqd(kScratchDoubleReg, kScratchDoubleReg); + pxor(dst, kScratchDoubleReg); + } +} + +void TurboAssembler::I16x8ExtAddPairwiseI8x16S(XMMRegister dst, + XMMRegister src) { + // pmaddubsw treats the first operand as unsigned, so the external reference + // to be passed to it as the first operand. 
+ Operand op = ExternalReferenceAsOperand( + ExternalReference::address_of_wasm_i8x16_splat_0x01()); + if (dst == src) { + if (CpuFeatures::IsSupported(AVX)) { + CpuFeatureScope avx_scope(this, AVX); + vmovdqa(kScratchDoubleReg, op); + vpmaddubsw(dst, kScratchDoubleReg, src); + } else { + CpuFeatureScope sse_scope(this, SSSE3); + movaps(kScratchDoubleReg, op); + pmaddubsw(kScratchDoubleReg, src); + movaps(dst, kScratchDoubleReg); + } + } else { + Movdqa(dst, op); + Pmaddubsw(dst, dst, src); + } +} + +void TurboAssembler::I32x4ExtAddPairwiseI16x8U(XMMRegister dst, + XMMRegister src) { + // src = |a|b|c|d|e|f|g|h| + // kScratchDoubleReg = i32x4.splat(0x0000FFFF) + Pcmpeqd(kScratchDoubleReg, kScratchDoubleReg); + Psrld(kScratchDoubleReg, byte{16}); + // kScratchDoubleReg =|0|b|0|d|0|f|0|h| + Pand(kScratchDoubleReg, src); + // dst = |0|a|0|c|0|e|0|g| + Psrld(dst, src, byte{16}); + // dst = |a+b|c+d|e+f|g+h| + Paddd(dst, kScratchDoubleReg); +} + +void TurboAssembler::I8x16Swizzle(XMMRegister dst, XMMRegister src, + XMMRegister mask) { + // Out-of-range indices should return 0, add 112 so that any value > 15 + // saturates to 128 (top bit set), so pshufb will zero that lane. + Operand op = ExternalReferenceAsOperand( + ExternalReference::address_of_wasm_i8x16_swizzle_mask()); + if (CpuFeatures::IsSupported(AVX)) { + CpuFeatureScope avx_scope(this, AVX); + vpaddusb(kScratchDoubleReg, mask, op); + vpshufb(dst, src, kScratchDoubleReg); + } else { + CpuFeatureScope sse_scope(this, SSSE3); + movdqa(kScratchDoubleReg, op); + if (dst != src) { + movaps(dst, src); + } + paddusb(kScratchDoubleReg, mask); + pshufb(dst, kScratchDoubleReg); + } +} + void TurboAssembler::Abspd(XMMRegister dst) { Andps(dst, ExternalReferenceAsOperand( ExternalReference::address_of_double_abs_constant())); @@ -2432,6 +2862,15 @@ void MacroAssembler::CmpInstanceType(Register map, InstanceType type) { cmpw(FieldOperand(map, Map::kInstanceTypeOffset), Immediate(type)); } +void MacroAssembler::CmpInstanceTypeRange(Register map, + InstanceType lower_limit, + InstanceType higher_limit) { + DCHECK_LT(lower_limit, higher_limit); + movzxwl(kScratchRegister, FieldOperand(map, Map::kInstanceTypeOffset)); + leal(kScratchRegister, Operand(kScratchRegister, 0u - lower_limit)); + cmpl(kScratchRegister, Immediate(higher_limit - lower_limit)); +} + void MacroAssembler::AssertNotSmi(Register object) { if (emit_debug_code()) { Condition is_smi = CheckSmi(object); @@ -2480,9 +2919,10 @@ void MacroAssembler::AssertFunction(Register object) { testb(object, Immediate(kSmiTagMask)); Check(not_equal, AbortReason::kOperandIsASmiAndNotAFunction); Push(object); - CmpObjectType(object, JS_FUNCTION_TYPE, object); + LoadMap(object, object); + CmpInstanceTypeRange(object, FIRST_JS_FUNCTION_TYPE, LAST_JS_FUNCTION_TYPE); Pop(object); - Check(equal, AbortReason::kOperandIsNotAFunction); + Check(below_equal, AbortReason::kOperandIsNotAFunction); } } @@ -2753,7 +3193,6 @@ void MacroAssembler::InvokePrologue(Register expected_parameter_count, Label* done, InvokeFlag flag) { if (expected_parameter_count != actual_parameter_count) { Label regular_invoke; -#ifdef V8_NO_ARGUMENTS_ADAPTOR // If the expected parameter count is equal to the adaptor sentinel, no need // to push undefined value as arguments. cmpl(expected_parameter_count, Immediate(kDontAdaptArgumentsSentinel)); @@ -2811,22 +3250,6 @@ void MacroAssembler::InvokePrologue(Register expected_parameter_count, CallRuntime(Runtime::kThrowStackOverflow); int3(); // This should be unreachable. 
} -#else - // Both expected and actual are in (different) registers. This - // is the case when we invoke functions using call and apply. - cmpq(expected_parameter_count, actual_parameter_count); - j(equal, ®ular_invoke, Label::kNear); - DCHECK_EQ(actual_parameter_count, rax); - DCHECK_EQ(expected_parameter_count, rbx); - Handle<Code> adaptor = BUILTIN_CODE(isolate(), ArgumentsAdaptorTrampoline); - if (flag == CALL_FUNCTION) { - Call(adaptor, RelocInfo::CODE_TARGET); - jmp(done, Label::kNear); - } else { - Jump(adaptor, RelocInfo::CODE_TARGET); - } -#endif - bind(®ular_invoke); } else { Move(rax, actual_parameter_count); @@ -2881,11 +3304,16 @@ void TurboAssembler::Prologue() { void TurboAssembler::EnterFrame(StackFrame::Type type) { pushq(rbp); movq(rbp, rsp); - Push(Immediate(StackFrame::TypeToMarker(type))); + if (!StackFrame::IsJavaScript(type)) { + Push(Immediate(StackFrame::TypeToMarker(type))); + } } void TurboAssembler::LeaveFrame(StackFrame::Type type) { - if (emit_debug_code()) { + // TODO(v8:11429): Consider passing BASELINE instead, and checking for + // IsJSFrame or similar. Could then unify with manual frame leaves in the + // interpreter too. + if (emit_debug_code() && !StackFrame::IsJavaScript(type)) { cmpq(Operand(rbp, CommonFrameConstants::kContextOrFrameTypeOffset), Immediate(StackFrame::TypeToMarker(type))); Check(equal, AbortReason::kStackFrameTypesMustMatch); @@ -2917,11 +3345,13 @@ void TurboAssembler::AllocateStackSpace(Register bytes_scratch) { } void TurboAssembler::AllocateStackSpace(int bytes) { + DCHECK_GE(bytes, 0); while (bytes > kStackPageSize) { subq(rsp, Immediate(kStackPageSize)); movb(Operand(rsp, 0), Immediate(0)); bytes -= kStackPageSize; } + if (bytes == 0) return; subq(rsp, Immediate(bytes)); } #endif @@ -3223,7 +3653,7 @@ void TurboAssembler::ComputeCodeStartAddress(Register dst) { } void TurboAssembler::ResetSpeculationPoisonRegister() { - // TODO(tebbi): Perhaps, we want to put an lfence here. + // TODO(turbofan): Perhaps, we want to put an lfence here. 
Set(kSpeculationPoisonRegister, -1); } diff --git a/deps/v8/src/codegen/x64/macro-assembler-x64.h b/deps/v8/src/codegen/x64/macro-assembler-x64.h index df87c07638..be0b07c17f 100644 --- a/deps/v8/src/codegen/x64/macro-assembler-x64.h +++ b/deps/v8/src/codegen/x64/macro-assembler-x64.h @@ -14,6 +14,7 @@ #include "src/codegen/x64/assembler-x64.h" #include "src/common/globals.h" #include "src/objects/contexts.h" +#include "src/objects/tagged-index.h" namespace v8 { namespace internal { @@ -184,6 +185,7 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase { AVX_OP(Sqrtps, sqrtps) AVX_OP(Sqrtpd, sqrtpd) AVX_OP(Cvttps2dq, cvttps2dq) + AVX_OP(Cvttpd2dq, cvttpd2dq) AVX_OP(Ucomiss, ucomiss) AVX_OP(Ucomisd, ucomisd) AVX_OP(Pand, pand) @@ -227,6 +229,9 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase { AVX_OP(Maxps, maxps) AVX_OP(Maxpd, maxpd) AVX_OP(Cvtdq2ps, cvtdq2ps) + AVX_OP(Cvtdq2pd, cvtdq2pd) + AVX_OP(Cvtpd2ps, cvtpd2ps) + AVX_OP(Cvtps2pd, cvtps2pd) AVX_OP(Rcpps, rcpps) AVX_OP(Rsqrtps, rsqrtps) AVX_OP(Addps, addps) @@ -320,6 +325,9 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase { void Push(Operand src); void Push(Immediate value); void Push(Smi smi); + void Push(TaggedIndex index) { + Push(Immediate(static_cast<uint32_t>(index.ptr()))); + } void Push(Handle<HeapObject> source); enum class PushArrayOrder { kNormal, kReverse }; @@ -354,6 +362,7 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase { Label::Distance condition_met_distance = Label::kFar); void Movapd(XMMRegister dst, XMMRegister src); + void Movdqa(XMMRegister dst, Operand src); void Movdqa(XMMRegister dst, XMMRegister src); template <typename Dst, typename Src> @@ -438,6 +447,14 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase { movq(dst, constant); } + void Move(Register dst, TaggedIndex source) { + movl(dst, Immediate(static_cast<uint32_t>(source.ptr()))); + } + + void Move(Operand dst, TaggedIndex source) { + movl(dst, Immediate(static_cast<uint32_t>(source.ptr()))); + } + void Move(Register dst, ExternalReference ext); void Move(XMMRegister dst, uint32_t src); @@ -450,6 +467,9 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase { void Move(Register target, Register source); void Move(XMMRegister target, XMMRegister source); + void Move(Register target, Operand source); + void Move(Register target, Immediate source); + void Move(Register dst, Handle<HeapObject> source, RelocInfo::Mode rmode = RelocInfo::FULL_EMBEDDED_OBJECT); void Move(Operand dst, Handle<HeapObject> source, @@ -505,10 +525,12 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase { Operand EntryFromBuiltinIndexAsOperand(Register builtin_index); void CallBuiltinByIndex(Register builtin_index) override; void CallBuiltin(int builtin_index); + void TailCallBuiltin(int builtin_index); void LoadCodeObjectEntry(Register destination, Register code_object) override; void CallCodeObject(Register code_object) override; - void JumpCodeObject(Register code_object) override; + void JumpCodeObject(Register code_object, + JumpMode jump_mode = JumpMode::kJump) override; void RetpolineCall(Register reg); void RetpolineCall(Address destination, RelocInfo::Mode rmode); @@ -528,10 +550,13 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase { void Trap() override; void DebugBreak() override; - // Supports both AVX (dst != src1) and SSE (checks that dst == src1). + // Will move src1 to dst if dst != src1. 
+ void Pmaddwd(XMMRegister dst, XMMRegister src1, Operand src2); void Pmaddwd(XMMRegister dst, XMMRegister src1, XMMRegister src2); + void Pmaddubsw(XMMRegister dst, XMMRegister src1, Operand src2); void Pmaddubsw(XMMRegister dst, XMMRegister src1, XMMRegister src2); + void Unpcklps(XMMRegister dst, XMMRegister src1, Operand src2); // Shufps that will mov src1 into dst if AVX is not supported. void Shufps(XMMRegister dst, XMMRegister src1, XMMRegister src2, byte imm8); @@ -577,6 +602,8 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase { void I16x8UConvertI8x16High(XMMRegister dst, XMMRegister src); void I32x4SConvertI16x8High(XMMRegister dst, XMMRegister src); void I32x4UConvertI16x8High(XMMRegister dst, XMMRegister src); + void I64x2SConvertI32x4High(XMMRegister dst, XMMRegister src); + void I64x2UConvertI32x4High(XMMRegister dst, XMMRegister src); // Requires dst == mask when AVX is not supported. void S128Select(XMMRegister dst, XMMRegister mask, XMMRegister src1, @@ -590,6 +617,26 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase { void I16x8ExtMul(XMMRegister dst, XMMRegister src1, XMMRegister src2, bool low, bool is_signed); + void I16x8Q15MulRSatS(XMMRegister dst, XMMRegister src1, XMMRegister src2); + + void S128Store32Lane(Operand dst, XMMRegister src, uint8_t laneidx); + void S128Store64Lane(Operand dst, XMMRegister src, uint8_t laneidx); + + void I8x16Popcnt(XMMRegister dst, XMMRegister src, XMMRegister tmp); + + void F64x2ConvertLowI32x4U(XMMRegister dst, XMMRegister src); + void I32x4TruncSatF64x2SZero(XMMRegister dst, XMMRegister src); + void I32x4TruncSatF64x2UZero(XMMRegister dst, XMMRegister src); + + void I64x2Abs(XMMRegister dst, XMMRegister src); + void I64x2GtS(XMMRegister dst, XMMRegister src0, XMMRegister src1); + void I64x2GeS(XMMRegister dst, XMMRegister src0, XMMRegister src1); + + void I16x8ExtAddPairwiseI8x16S(XMMRegister dst, XMMRegister src); + void I32x4ExtAddPairwiseI16x8U(XMMRegister dst, XMMRegister src); + + void I8x16Swizzle(XMMRegister dst, XMMRegister src, XMMRegister mask); + void Abspd(XMMRegister dst); void Negpd(XMMRegister dst); @@ -639,7 +686,11 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase { void AllocateStackSpace(int bytes); #else void AllocateStackSpace(Register bytes) { subq(rsp, bytes); } - void AllocateStackSpace(int bytes) { subq(rsp, Immediate(bytes)); } + void AllocateStackSpace(int bytes) { + DCHECK_GE(bytes, 0); + if (bytes == 0) return; + subq(rsp, Immediate(bytes)); + } #endif // Removes current frame and its arguments from the stack preserving the @@ -716,6 +767,10 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase { // compression is enabled. void LoadTaggedPointerField(Register destination, Operand field_operand); + // Loads a field containing a Smi and decompresses it if pointer compression + // is enabled. + void LoadTaggedSignedField(Register destination, Operand field_operand); + // Loads a field containing any tagged value and decompresses it if necessary. void LoadAnyTaggedField(Register destination, Operand field_operand); @@ -736,6 +791,7 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase { // location. void StoreTaggedField(Operand dst_field_operand, Immediate immediate); void StoreTaggedField(Operand dst_field_operand, Register value); + void StoreTaggedSignedField(Operand dst_field_operand, Smi value); // The following macros work even when pointer compression is not enabled. 
void DecompressTaggedSigned(Register destination, Operand field_operand); @@ -982,6 +1038,10 @@ class V8_EXPORT_PRIVATE MacroAssembler : public TurboAssembler { // Always use unsigned comparisons: above and below, not less and greater. void CmpInstanceType(Register map, InstanceType type); + // Compare instance type ranges for a map (low and high inclusive) + // Always use unsigned comparisons: below_equal for a positive result. + void CmpInstanceTypeRange(Register map, InstanceType low, InstanceType high); + template <typename Field> void DecodeField(Register reg) { static const int shift = Field::kShift; diff --git a/deps/v8/src/codegen/x64/sse-instr.h b/deps/v8/src/codegen/x64/sse-instr.h index 717a79df07..452cc0f690 100644 --- a/deps/v8/src/codegen/x64/sse-instr.h +++ b/deps/v8/src/codegen/x64/sse-instr.h @@ -10,10 +10,12 @@ V(sqrtps, 0F, 51) \ V(rsqrtps, 0F, 52) \ V(rcpps, 0F, 53) \ + V(cvtps2pd, 0F, 5A) \ V(cvtdq2ps, 0F, 5B) // SSE instructions whose AVX version has three operands. #define SSE_BINOP_INSTRUCTION_LIST(V) \ + V(unpcklps, 0F, 14) \ V(andps, 0F, 54) \ V(andnps, 0F, 55) \ V(orps, 0F, 56) \ @@ -108,7 +110,9 @@ #define SSE2_UNOP_INSTRUCTION_LIST(V) \ V(ucomisd, 66, 0F, 2E) \ V(sqrtpd, 66, 0F, 51) \ - V(cvtps2dq, 66, 0F, 5B) + V(cvtpd2ps, 66, 0F, 5A) \ + V(cvtps2dq, 66, 0F, 5B) \ + V(cvttpd2dq, 66, 0F, E6) // SSE2 shift instructions with an immediate operand. The last element is the // extension to the opcode. @@ -183,6 +187,7 @@ // These require AVX2, and we only define the VEX-128 versions. #define AVX2_BROADCAST_LIST(V) \ + V(vpbroadcastd, 66, 0F, 38, 58) \ V(vpbroadcastb, 66, 0F, 38, 78) \ V(vpbroadcastw, 66, 0F, 38, 79) |
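
Editor's note: the new MacroAssembler::CmpInstanceTypeRange above folds a two-ended type check into one unsigned comparison: leal subtracts the lower bound, cmpl compares against the range width, and callers branch on below_equal (as AssertFunction now does). A minimal scalar sketch of that idiom, using illustrative names rather than V8's API:

#include <cassert>
#include <cstdint>

// Models leal(scratch, Operand(scratch, 0u - lower)) followed by
// cmpl(scratch, Immediate(higher - lower)) and a below_equal branch.
// Values below `lower` wrap around to large unsigned numbers, so a single
// unsigned comparison rejects both ends of the range.
bool InstanceTypeInRange(uint32_t type, uint32_t lower, uint32_t higher) {
  uint32_t biased = type - lower;   // wraps for type < lower
  return biased <= higher - lower;  // unsigned "below_equal"
}

int main() {
  assert(InstanceTypeInRange(5, 3, 7));   // inside the range
  assert(!InstanceTypeInRange(2, 3, 7));  // below: 2 - 3 wraps to 0xFFFFFFFF
  assert(!InstanceTypeInRange(8, 3, 7));  // above the range
  return 0;
}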
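Editor's note: F64x2ConvertLowI32x4U above relies on the 2^52 bias trick its comment describes: pairing each uint32 lane with the high word 0x43300000 yields the double 2^52 + x, and subtracting 2^52 recovers x exactly. A hedged scalar model of one lane; the layout of the wasm_f64x2_convert_low_i32x4_u_int_mask constant is assumed here to supply that high word.

#include <cassert>
#include <cstdint>
#include <cstring>

// Models Unpcklps(dst, src, int_mask) followed by Subpd(dst, 2^52) for a
// single lane: the interleaved bits form a valid double equal to 2^52 + x.
double Uint32ToDouble(uint32_t x) {
  uint64_t bits = (uint64_t{0x43300000} << 32) | x;  // high word from the mask
  double biased;
  std::memcpy(&biased, &bits, sizeof(biased));       // reinterpret as double
  return biased - 4503599627370496.0;                // subtract 2^52
}

int main() {
  assert(Uint32ToDouble(0) == 0.0);
  assert(Uint32ToDouble(0xFFFFFFFFu) == 4294967295.0);
  return 0;
}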
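Editor's note: the AVX and generic SSE paths of I8x16Popcnt above split every byte into its low and high nibble and use pshufb as a 16-entry table lookup, then add the two lookups. A scalar sketch of that scheme; the contents of the wasm_i8x16_popcnt_mask constant are assumed here to be the per-nibble bit counts.

#include <cassert>
#include <cstdint>

// Models vpand/vpandn with splat_0x0f, vpsrlw by 4, two vpshufb lookups and
// vpaddb. kNibblePopcnt plays the role of the popcnt_mask table (assumed
// layout: popcount of each 4-bit index).
uint8_t BytePopcnt(uint8_t x) {
  static const uint8_t kNibblePopcnt[16] = {0, 1, 1, 2, 1, 2, 2, 3,
                                            1, 2, 2, 3, 2, 3, 3, 4};
  uint8_t lo = x & 0x0F;         // pand with splat_0x0f
  uint8_t hi = (x & 0xF0) >> 4;  // pandn with splat_0x0f, then shift down
  return kNibblePopcnt[lo] + kNibblePopcnt[hi];  // two lookups, then add
}

int main() {
  for (int i = 0; i < 256; i++) {
    int ref = 0;
    for (int b = i; b != 0; b >>= 1) ref += b & 1;  // reference popcount
    assert(BytePopcnt(static_cast<uint8_t>(i)) == ref);
  }
  return 0;
}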
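Editor's note: I16x8Q15MulRSatS above uses pmulhrsw and then patches its single overflow case: INT16_MIN * INT16_MIN rounds to 0x8000, which the Pcmpeqw/Pxor pair flips to INT16_MAX. A scalar sketch of one lane, assuming arithmetic right shift and two's-complement narrowing, as all mainstream compilers provide.

#include <cassert>
#include <cstdint>

// Models pmulhrsw on one lane, ((a * b >> 14) + 1) >> 1 kept to 16 bits,
// followed by the 0x8000 fix-up from the hunk above. Only
// INT16_MIN * INT16_MIN can produce the wrapped value 0x8000.
int16_t Q15MulRoundSat(int16_t a, int16_t b) {
  int32_t rounded = ((int32_t{a} * b >> 14) + 1) >> 1;  // pmulhrsw
  int16_t wrapped = static_cast<int16_t>(rounded);      // keep low 16 bits
  return wrapped == INT16_MIN ? INT16_MAX : wrapped;    // pcmpeqw + pxor
}

int main() {
  assert(Q15MulRoundSat(16384, 16384) == 8192);               // 0.5 * 0.5
  assert(Q15MulRoundSat(INT16_MIN, INT16_MIN) == INT16_MAX);  // saturated
  assert(Q15MulRoundSat(INT16_MIN, INT16_MAX) == -32767);
  return 0;
}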