Diffstat (limited to 'deps/v8/src/codegen/shared-ia32-x64')
-rw-r--r--  deps/v8/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.cc | 195
-rw-r--r--  deps/v8/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h  | 221
2 files changed, 372 insertions(+), 44 deletions(-)
diff --git a/deps/v8/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.cc b/deps/v8/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.cc index 366d1afac9..3a73ae09f8 100644 --- a/deps/v8/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.cc +++ b/deps/v8/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.cc @@ -29,6 +29,174 @@ void SharedTurboAssembler::Movapd(XMMRegister dst, XMMRegister src) { } } +void SharedTurboAssembler::Shufps(XMMRegister dst, XMMRegister src1, + XMMRegister src2, uint8_t imm8) { + if (CpuFeatures::IsSupported(AVX)) { + CpuFeatureScope avx_scope(this, AVX); + vshufps(dst, src1, src2, imm8); + } else { + if (dst != src1) { + movaps(dst, src1); + } + shufps(dst, src2, imm8); + } +} + +void SharedTurboAssembler::F64x2ExtractLane(DoubleRegister dst, XMMRegister src, + uint8_t lane) { + if (lane == 0) { + if (dst != src) { + Movaps(dst, src); + } + } else { + DCHECK_EQ(1, lane); + if (CpuFeatures::IsSupported(AVX)) { + CpuFeatureScope avx_scope(this, AVX); + // Pass src as operand to avoid false-dependency on dst. + vmovhlps(dst, src, src); + } else { + movhlps(dst, src); + } + } +} + +void SharedTurboAssembler::F64x2ReplaceLane(XMMRegister dst, XMMRegister src, + DoubleRegister rep, uint8_t lane) { + if (CpuFeatures::IsSupported(AVX)) { + CpuFeatureScope scope(this, AVX); + if (lane == 0) { + vpblendw(dst, src, rep, 0b00001111); + } else { + vmovlhps(dst, src, rep); + } + } else { + CpuFeatureScope scope(this, SSE4_1); + if (dst != src) { + DCHECK_NE(dst, rep); // Ensure rep is not overwritten. + movaps(dst, src); + } + if (lane == 0) { + pblendw(dst, rep, 0b00001111); + } else { + movlhps(dst, rep); + } + } +} + +void SharedTurboAssembler::F64x2Min(XMMRegister dst, XMMRegister lhs, + XMMRegister rhs, XMMRegister scratch) { + if (CpuFeatures::IsSupported(AVX)) { + CpuFeatureScope scope(this, AVX); + // The minpd instruction doesn't propagate NaNs and +0's in its first + // operand. Perform minpd in both orders, merge the resuls, and adjust. + vminpd(scratch, lhs, rhs); + vminpd(dst, rhs, lhs); + // propagate -0's and NaNs, which may be non-canonical. + vorpd(scratch, scratch, dst); + // Canonicalize NaNs by quieting and clearing the payload. + vcmpunordpd(dst, dst, scratch); + vorpd(scratch, scratch, dst); + vpsrlq(dst, dst, byte{13}); + vandnpd(dst, dst, scratch); + } else { + // Compare lhs with rhs, and rhs with lhs, and have the results in scratch + // and dst. If dst overlaps with lhs or rhs, we can save a move. + if (dst == lhs || dst == rhs) { + XMMRegister src = dst == lhs ? rhs : lhs; + movaps(scratch, src); + minpd(scratch, dst); + minpd(dst, src); + } else { + movaps(scratch, lhs); + movaps(dst, rhs); + minpd(scratch, rhs); + minpd(dst, lhs); + } + orpd(scratch, dst); + cmpunordpd(dst, scratch); + orpd(scratch, dst); + psrlq(dst, byte{13}); + andnpd(dst, scratch); + } +} + +void SharedTurboAssembler::F64x2Max(XMMRegister dst, XMMRegister lhs, + XMMRegister rhs, XMMRegister scratch) { + if (CpuFeatures::IsSupported(AVX)) { + CpuFeatureScope scope(this, AVX); + // The maxpd instruction doesn't propagate NaNs and +0's in its first + // operand. Perform maxpd in both orders, merge the resuls, and adjust. + vmaxpd(scratch, lhs, rhs); + vmaxpd(dst, rhs, lhs); + // Find discrepancies. + vxorpd(dst, dst, scratch); + // Propagate NaNs, which may be non-canonical. + vorpd(scratch, scratch, dst); + // Propagate sign discrepancy and (subtle) quiet NaNs. + vsubpd(scratch, scratch, dst); + // Canonicalize NaNs by clearing the payload. 
Sign is non-deterministic. + vcmpunordpd(dst, dst, scratch); + vpsrlq(dst, dst, byte{13}); + vandnpd(dst, dst, scratch); + } else { + if (dst == lhs || dst == rhs) { + XMMRegister src = dst == lhs ? rhs : lhs; + movaps(scratch, src); + maxpd(scratch, dst); + maxpd(dst, src); + } else { + movaps(scratch, lhs); + movaps(dst, rhs); + maxpd(scratch, rhs); + maxpd(dst, lhs); + } + xorpd(dst, scratch); + orpd(scratch, dst); + subpd(scratch, dst); + cmpunordpd(dst, scratch); + psrlq(dst, byte{13}); + andnpd(dst, scratch); + } +} + +void SharedTurboAssembler::F32x4Splat(XMMRegister dst, DoubleRegister src) { + if (CpuFeatures::IsSupported(AVX2)) { + CpuFeatureScope avx2_scope(this, AVX2); + vbroadcastss(dst, src); + } else if (CpuFeatures::IsSupported(AVX)) { + CpuFeatureScope avx_scope(this, AVX); + vshufps(dst, src, src, 0); + } else { + if (dst == src) { + // 1 byte shorter than pshufd. + shufps(dst, src, 0); + } else { + pshufd(dst, src, 0); + } + } +} + +void SharedTurboAssembler::F32x4ExtractLane(FloatRegister dst, XMMRegister src, + uint8_t lane) { + DCHECK_LT(lane, 4); + // These instructions are shorter than insertps, but will leave junk in + // the top lanes of dst. + if (lane == 0) { + if (dst != src) { + Movaps(dst, src); + } + } else if (lane == 1) { + Movshdup(dst, src); + } else if (lane == 2 && dst == src) { + // Check dst == src to avoid false dependency on dst. + Movhlps(dst, src); + } else if (dst == src) { + Shufps(dst, src, src, lane); + } else { + Pshufd(dst, src, lane); + } +} + void SharedTurboAssembler::S128Store32Lane(Operand dst, XMMRegister src, uint8_t laneidx) { if (laneidx == 0) { @@ -233,6 +401,22 @@ void SharedTurboAssembler::I32x4UConvertI16x8High(XMMRegister dst, } } +void SharedTurboAssembler::I64x2Neg(XMMRegister dst, XMMRegister src, + XMMRegister scratch) { + if (CpuFeatures::IsSupported(AVX)) { + CpuFeatureScope scope(this, AVX); + vpxor(scratch, scratch, scratch); + vpsubq(dst, scratch, src); + } else { + if (dst == src) { + movaps(scratch, src); + std::swap(src, scratch); + } + pxor(dst, dst); + psubq(dst, src); + } +} + void SharedTurboAssembler::I64x2Abs(XMMRegister dst, XMMRegister src, XMMRegister scratch) { if (CpuFeatures::IsSupported(AVX)) { @@ -379,6 +563,17 @@ void SharedTurboAssembler::I64x2UConvertI32x4High(XMMRegister dst, } } +void SharedTurboAssembler::S128Not(XMMRegister dst, XMMRegister src, + XMMRegister scratch) { + if (dst == src) { + Pcmpeqd(scratch, scratch); + Pxor(dst, scratch); + } else { + Pcmpeqd(dst, dst); + Pxor(dst, src); + } +} + void SharedTurboAssembler::S128Select(XMMRegister dst, XMMRegister mask, XMMRegister src1, XMMRegister src2, XMMRegister scratch) { diff --git a/deps/v8/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h b/deps/v8/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h index e2778e472d..6be9444c65 100644 --- a/deps/v8/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h +++ b/deps/v8/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h @@ -39,121 +39,252 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase { } } - template <typename Dst, typename... Args> + // Shufps that will mov src1 into dst if AVX is not supported. + void Shufps(XMMRegister dst, XMMRegister src1, XMMRegister src2, + uint8_t imm8); + + // Helper struct to implement functions that check for AVX support and + // dispatch to the appropriate AVX/SSE instruction. + template <typename Dst, typename Arg, typename... 
Args> struct AvxHelper { Assembler* assm; base::Optional<CpuFeature> feature = base::nullopt; // Call a method where the AVX version expects the dst argument to be // duplicated. - template <void (Assembler::*avx)(Dst, Dst, Args...), + // E.g. Andps(x, y) -> vandps(x, x, y) + // -> andps(x, y) + template <void (Assembler::*avx)(Dst, Dst, Arg, Args...), + void (Assembler::*no_avx)(Dst, Arg, Args...)> + void emit(Dst dst, Arg arg, Args... args) { + if (CpuFeatures::IsSupported(AVX)) { + CpuFeatureScope scope(assm, AVX); + (assm->*avx)(dst, dst, arg, args...); + } else if (feature.has_value()) { + DCHECK(CpuFeatures::IsSupported(*feature)); + CpuFeatureScope scope(assm, *feature); + (assm->*no_avx)(dst, arg, args...); + } else { + (assm->*no_avx)(dst, arg, args...); + } + } + + // Call a method in the AVX form (one more operand), but if unsupported will + // check that dst == first src. + // E.g. Andps(x, y, z) -> vandps(x, y, z) + // -> andps(x, z) and check that x == y + template <void (Assembler::*avx)(Dst, Arg, Args...), void (Assembler::*no_avx)(Dst, Args...)> - void emit(Dst dst, Args... args) { + void emit(Dst dst, Arg arg, Args... args) { if (CpuFeatures::IsSupported(AVX)) { CpuFeatureScope scope(assm, AVX); - (assm->*avx)(dst, dst, args...); + (assm->*avx)(dst, arg, args...); } else if (feature.has_value()) { + DCHECK_EQ(dst, arg); DCHECK(CpuFeatures::IsSupported(*feature)); CpuFeatureScope scope(assm, *feature); (assm->*no_avx)(dst, args...); } else { + DCHECK_EQ(dst, arg); (assm->*no_avx)(dst, args...); } } // Call a method where the AVX version expects no duplicated dst argument. - template <void (Assembler::*avx)(Dst, Args...), - void (Assembler::*no_avx)(Dst, Args...)> - void emit(Dst dst, Args... args) { + // E.g. Movddup(x, y) -> vmovddup(x, y) + // -> movddup(x, y) + template <void (Assembler::*avx)(Dst, Arg, Args...), + void (Assembler::*no_avx)(Dst, Arg, Args...)> + void emit(Dst dst, Arg arg, Args... args) { if (CpuFeatures::IsSupported(AVX)) { CpuFeatureScope scope(assm, AVX); - (assm->*avx)(dst, args...); + (assm->*avx)(dst, arg, args...); } else if (feature.has_value()) { DCHECK(CpuFeatures::IsSupported(*feature)); CpuFeatureScope scope(assm, *feature); - (assm->*no_avx)(dst, args...); + (assm->*no_avx)(dst, arg, args...); } else { - (assm->*no_avx)(dst, args...); + (assm->*no_avx)(dst, arg, args...); } } }; -#define AVX_OP(macro_name, name) \ - template <typename Dst, typename... Args> \ - void macro_name(Dst dst, Args... args) { \ - AvxHelper<Dst, Args...>{this} \ - .template emit<&Assembler::v##name, &Assembler::name>(dst, args...); \ +#define AVX_OP(macro_name, name) \ + template <typename Dst, typename Arg, typename... Args> \ + void macro_name(Dst dst, Arg arg, Args... args) { \ + AvxHelper<Dst, Arg, Args...>{this} \ + .template emit<&Assembler::v##name, &Assembler::name>(dst, arg, \ + args...); \ } -#define AVX_OP_SSE3(macro_name, name) \ - template <typename Dst, typename... Args> \ - void macro_name(Dst dst, Args... args) { \ - AvxHelper<Dst, Args...>{this, base::Optional<CpuFeature>(SSE3)} \ - .template emit<&Assembler::v##name, &Assembler::name>(dst, args...); \ +#define AVX_OP_SSE3(macro_name, name) \ + template <typename Dst, typename Arg, typename... Args> \ + void macro_name(Dst dst, Arg arg, Args... args) { \ + AvxHelper<Dst, Arg, Args...>{this, base::Optional<CpuFeature>(SSE3)} \ + .template emit<&Assembler::v##name, &Assembler::name>(dst, arg, \ + args...); \ } -#define AVX_OP_SSSE3(macro_name, name) \ - template <typename Dst, typename... 
Args> \ - void macro_name(Dst dst, Args... args) { \ - AvxHelper<Dst, Args...>{this, base::Optional<CpuFeature>(SSSE3)} \ - .template emit<&Assembler::v##name, &Assembler::name>(dst, args...); \ +#define AVX_OP_SSSE3(macro_name, name) \ + template <typename Dst, typename Arg, typename... Args> \ + void macro_name(Dst dst, Arg arg, Args... args) { \ + AvxHelper<Dst, Arg, Args...>{this, base::Optional<CpuFeature>(SSSE3)} \ + .template emit<&Assembler::v##name, &Assembler::name>(dst, arg, \ + args...); \ } -#define AVX_OP_SSE4_1(macro_name, name) \ - template <typename Dst, typename... Args> \ - void macro_name(Dst dst, Args... args) { \ - AvxHelper<Dst, Args...>{this, base::Optional<CpuFeature>(SSE4_1)} \ - .template emit<&Assembler::v##name, &Assembler::name>(dst, args...); \ +#define AVX_OP_SSE4_1(macro_name, name) \ + template <typename Dst, typename Arg, typename... Args> \ + void macro_name(Dst dst, Arg arg, Args... args) { \ + AvxHelper<Dst, Arg, Args...>{this, base::Optional<CpuFeature>(SSE4_1)} \ + .template emit<&Assembler::v##name, &Assembler::name>(dst, arg, \ + args...); \ } -#define AVX_OP_SSE4_2(macro_name, name) \ - template <typename Dst, typename... Args> \ - void macro_name(Dst dst, Args... args) { \ - AvxHelper<Dst, Args...>{this, base::Optional<CpuFeature>(SSE4_2)} \ - .template emit<&Assembler::v##name, &Assembler::name>(dst, args...); \ +#define AVX_OP_SSE4_2(macro_name, name) \ + template <typename Dst, typename Arg, typename... Args> \ + void macro_name(Dst dst, Arg arg, Args... args) { \ + AvxHelper<Dst, Arg, Args...>{this, base::Optional<CpuFeature>(SSE4_2)} \ + .template emit<&Assembler::v##name, &Assembler::name>(dst, arg, \ + args...); \ } + // Keep this list sorted by required extension, then instruction name. + AVX_OP(Addpd, addpd) + AVX_OP(Addps, addps) + AVX_OP(Andnpd, andnpd) + AVX_OP(Andnps, andnps) + AVX_OP(Andpd, andpd) + AVX_OP(Andps, andps) + AVX_OP(Cmpeqpd, cmpeqpd) + AVX_OP(Cmplepd, cmplepd) + AVX_OP(Cmpleps, cmpleps) + AVX_OP(Cmpltpd, cmpltpd) + AVX_OP(Cmpneqpd, cmpneqpd) + AVX_OP(Cmpunordpd, cmpunordpd) + AVX_OP(Cmpunordps, cmpunordps) AVX_OP(Cvtdq2pd, cvtdq2pd) AVX_OP(Cvtdq2ps, cvtdq2ps) - AVX_OP(Cvtps2pd, cvtps2pd) AVX_OP(Cvtpd2ps, cvtpd2ps) + AVX_OP(Cvtps2pd, cvtps2pd) AVX_OP(Cvttps2dq, cvttps2dq) + AVX_OP(Divpd, divpd) + AVX_OP(Divps, divps) + AVX_OP(Maxpd, maxpd) + AVX_OP(Maxps, maxps) + AVX_OP(Minpd, minpd) + AVX_OP(Minps, minps) AVX_OP(Movaps, movaps) AVX_OP(Movd, movd) + AVX_OP(Movhlps, movhlps) AVX_OP(Movhps, movhps) AVX_OP(Movlps, movlps) AVX_OP(Movmskpd, movmskpd) AVX_OP(Movmskps, movmskps) - AVX_OP(Movss, movss) AVX_OP(Movsd, movsd) + AVX_OP(Movss, movss) AVX_OP(Movupd, movupd) AVX_OP(Movups, movups) + AVX_OP(Mulpd, mulpd) + AVX_OP(Mulps, mulps) + AVX_OP(Orpd, orpd) + AVX_OP(Orps, orps) + AVX_OP(Packssdw, packssdw) + AVX_OP(Packsswb, packsswb) + AVX_OP(Packuswb, packuswb) + AVX_OP(Paddb, paddb) + AVX_OP(Paddd, paddd) + AVX_OP(Paddq, paddq) + AVX_OP(Paddsb, paddsb) + AVX_OP(Paddusb, paddusb) + AVX_OP(Paddusw, paddusw) + AVX_OP(Paddw, paddw) + AVX_OP(Pand, pand) + AVX_OP(Pavgb, pavgb) + AVX_OP(Pavgw, pavgw) + AVX_OP(Pcmpgtb, pcmpgtb) + AVX_OP(Pcmpeqd, pcmpeqd) + AVX_OP(Pmaxub, pmaxub) + AVX_OP(Pminub, pminub) AVX_OP(Pmovmskb, pmovmskb) AVX_OP(Pmullw, pmullw) - AVX_OP(Pshuflw, pshuflw) - AVX_OP(Pshufhw, pshufhw) + AVX_OP(Pmuludq, pmuludq) + AVX_OP(Por, por) AVX_OP(Pshufd, pshufd) + AVX_OP(Pshufhw, pshufhw) + AVX_OP(Pshuflw, pshuflw) + AVX_OP(Pslld, pslld) + AVX_OP(Psllq, psllq) + AVX_OP(Psllw, psllw) + AVX_OP(Psrad, psrad) + AVX_OP(Psraw, 
psraw) + AVX_OP(Psrld, psrld) + AVX_OP(Psrlq, psrlq) + AVX_OP(Psrlw, psrlw) + AVX_OP(Psubb, psubb) + AVX_OP(Psubd, psubd) + AVX_OP(Psubq, psubq) + AVX_OP(Psubsb, psubsb) + AVX_OP(Psubusb, psubusb) + AVX_OP(Psubw, psubw) + AVX_OP(Punpckhbw, punpckhbw) + AVX_OP(Punpckhdq, punpckhdq) + AVX_OP(Punpckhqdq, punpckhqdq) + AVX_OP(Punpckhwd, punpckhwd) + AVX_OP(Punpcklbw, punpcklbw) + AVX_OP(Punpckldq, punpckldq) + AVX_OP(Punpcklqdq, punpcklqdq) + AVX_OP(Punpcklwd, punpcklwd) + AVX_OP(Pxor, pxor) AVX_OP(Rcpps, rcpps) AVX_OP(Rsqrtps, rsqrtps) - AVX_OP(Sqrtps, sqrtps) AVX_OP(Sqrtpd, sqrtpd) + AVX_OP(Sqrtps, sqrtps) + AVX_OP(Sqrtsd, sqrtsd) + AVX_OP(Sqrtss, sqrtss) + AVX_OP(Subpd, subpd) + AVX_OP(Subps, subps) + AVX_OP(Unpcklps, unpcklps) + AVX_OP(Xorpd, xorpd) + AVX_OP(Xorps, xorps) + + AVX_OP_SSE3(Haddps, haddps) AVX_OP_SSE3(Movddup, movddup) AVX_OP_SSE3(Movshdup, movshdup) + AVX_OP_SSSE3(Pabsb, pabsb) - AVX_OP_SSSE3(Pabsw, pabsw) AVX_OP_SSSE3(Pabsd, pabsd) + AVX_OP_SSSE3(Pabsw, pabsw) + AVX_OP_SSSE3(Palignr, palignr) + AVX_OP_SSSE3(Psignb, psignb) + AVX_OP_SSSE3(Psignd, psignd) + AVX_OP_SSSE3(Psignw, psignw) + AVX_OP_SSE4_1(Extractps, extractps) + AVX_OP_SSE4_1(Pblendw, pblendw) AVX_OP_SSE4_1(Pextrb, pextrb) AVX_OP_SSE4_1(Pextrw, pextrw) + AVX_OP_SSE4_1(Pmaxsb, pmaxsb) + AVX_OP_SSE4_1(Pmaxsd, pmaxsd) + AVX_OP_SSE4_1(Pminsb, pminsb) AVX_OP_SSE4_1(Pmovsxbw, pmovsxbw) - AVX_OP_SSE4_1(Pmovsxwd, pmovsxwd) AVX_OP_SSE4_1(Pmovsxdq, pmovsxdq) + AVX_OP_SSE4_1(Pmovsxwd, pmovsxwd) AVX_OP_SSE4_1(Pmovzxbw, pmovzxbw) - AVX_OP_SSE4_1(Pmovzxwd, pmovzxwd) AVX_OP_SSE4_1(Pmovzxdq, pmovzxdq) + AVX_OP_SSE4_1(Pmovzxwd, pmovzxwd) AVX_OP_SSE4_1(Ptest, ptest) - AVX_OP_SSE4_1(Roundps, roundps) AVX_OP_SSE4_1(Roundpd, roundpd) + AVX_OP_SSE4_1(Roundps, roundps) + void F64x2ExtractLane(DoubleRegister dst, XMMRegister src, uint8_t lane); + void F64x2ReplaceLane(XMMRegister dst, XMMRegister src, DoubleRegister rep, + uint8_t lane); + void F64x2Min(XMMRegister dst, XMMRegister lhs, XMMRegister rhs, + XMMRegister scratch); + void F64x2Max(XMMRegister dst, XMMRegister lhs, XMMRegister rhs, + XMMRegister scratch); + void F32x4Splat(XMMRegister dst, DoubleRegister src); + void F32x4ExtractLane(FloatRegister dst, XMMRegister src, uint8_t lane); void S128Store32Lane(Operand dst, XMMRegister src, uint8_t laneidx); void I16x8ExtMulLow(XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister scrat, bool is_signed); @@ -170,6 +301,7 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase { void I32x4SConvertI16x8High(XMMRegister dst, XMMRegister src); void I32x4UConvertI16x8High(XMMRegister dst, XMMRegister src, XMMRegister scratch); + void I64x2Neg(XMMRegister dst, XMMRegister src, XMMRegister scratch); void I64x2Abs(XMMRegister dst, XMMRegister src, XMMRegister scratch); void I64x2GtS(XMMRegister dst, XMMRegister src0, XMMRegister src1, XMMRegister scratch); @@ -180,6 +312,7 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase { void I64x2SConvertI32x4High(XMMRegister dst, XMMRegister src); void I64x2UConvertI32x4High(XMMRegister dst, XMMRegister src, XMMRegister scratch); + void S128Not(XMMRegister dst, XMMRegister src, XMMRegister scratch); // Requires dst == mask when AVX is not supported. void S128Select(XMMRegister dst, XMMRegister mask, XMMRegister src1, XMMRegister src2, XMMRegister scratch); |
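The AvxHelper rework in the header above threads an explicit first-source argument (Arg) through the dispatch, so a single macro name can expand either to the three-operand AVX encoding (e.g. vandps dst, dst, src) or to the two-operand SSE encoding (andps dst, src), and it adds an AVX-shaped overload that DCHECKs dst == first source when AVX is unavailable. Below is a minimal standalone sketch of that dispatch idea, showing only the duplicated-dst case; Assembler, XMMRegister and CpuFeatures here are illustrative stand-ins, not the V8 classes.

// Standalone sketch of the AVX/SSE dispatch pattern; all types are
// illustrative stand-ins, not the V8 originals.
#include <cstdio>

struct XMMRegister { int code; };

struct Assembler {
  // AVX form: three operands, dst = src1 AND src2.
  void vandps(XMMRegister dst, XMMRegister src1, XMMRegister src2) {
    std::printf("vandps xmm%d, xmm%d, xmm%d\n", dst.code, src1.code, src2.code);
  }
  // SSE form: two operands, dst = dst AND src.
  void andps(XMMRegister dst, XMMRegister src) {
    std::printf("andps xmm%d, xmm%d\n", dst.code, src.code);
  }
};

struct CpuFeatures {
  static bool avx;
  static bool IsSupported() { return avx; }
};
bool CpuFeatures::avx = true;

// The AVX member expects dst duplicated as its first source; the SSE member
// does not. The caller writes Emit(dst, src) either way.
template <void (Assembler::*avx_op)(XMMRegister, XMMRegister, XMMRegister),
          void (Assembler::*sse_op)(XMMRegister, XMMRegister)>
void Emit(Assembler* assm, XMMRegister dst, XMMRegister src) {
  if (CpuFeatures::IsSupported()) {
    (assm->*avx_op)(dst, dst, src);  // vandps dst, dst, src
  } else {
    (assm->*sse_op)(dst, src);       // andps dst, src
  }
}

int main() {
  Assembler assm;
  XMMRegister x0{0}, x1{1};
  Emit<&Assembler::vandps, &Assembler::andps>(&assm, x0, x1);
  CpuFeatures::avx = false;
  Emit<&Assembler::vandps, &Assembler::andps>(&assm, x0, x1);
}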
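F64x2Min and F64x2Max above compensate for minpd/maxpd keeping their second operand when the inputs are NaN or -0/+0: the operation is run in both orders, the results are merged, and NaN lanes are rewritten to a canonical quiet NaN. A rough model of the non-AVX F64x2Min sequence using SSE2 intrinsics rather than the assembler (the function name and the test values in main are illustrative):

// Models the SSE path of F64x2Min with intrinsics; SSE2 only.
#include <emmintrin.h>
#include <cstdio>
#include <limits>

static __m128d F64x2MinModel(__m128d lhs, __m128d rhs) {
  // minpd keeps its second operand when the inputs are unordered or -0/+0,
  // so take the min in both orders and merge.
  __m128d a = _mm_min_pd(lhs, rhs);            // minpd(lhs, rhs)
  __m128d b = _mm_min_pd(rhs, lhs);            // minpd(rhs, lhs)
  a = _mm_or_pd(a, b);                         // propagate -0 signs and NaNs
  __m128d unord = _mm_cmpunord_pd(b, a);       // all-ones in NaN lanes
  a = _mm_or_pd(a, unord);                     // force NaN lanes to all-ones
  // Keep sign/exponent/quiet bit, clear the 51 payload bits: canonical NaN.
  __m128d mask = _mm_castsi128_pd(_mm_srli_epi64(_mm_castpd_si128(unord), 13));
  return _mm_andnot_pd(mask, a);               // ~mask & a
}

int main() {
  const double qnan = std::numeric_limits<double>::quiet_NaN();
  double out[2];
  _mm_storeu_pd(out, F64x2MinModel(_mm_set_pd(0.0, 1.0),      // {1.0, 0.0}
                                   _mm_set_pd(-0.0, qnan)));  // {NaN, -0.0}
  std::printf("%f %f\n", out[0], out[1]);  // nan, then -0.000000
}

The psrlq-by-13 / andnpd pair keeps the top 13 bits (sign, 11 exponent bits, quiet bit) of the unordered mask and clears the remaining 51 payload bits, which is why NaN lanes come out canonical with an unspecified sign.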
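F32x4Splat picks vbroadcastss under AVX2, vshufps under AVX, and on plain SSE uses shufps when dst == src (one byte shorter than pshufd, at the cost of a dependency on dst) or pshufd otherwise. The SSE shufps fallback as a small intrinsics sketch (helper name illustrative):

#include <emmintrin.h>
#include <cstdio>

static __m128 F32x4SplatModel(__m128 src) {
  // shufps dst, src, 0 with dst == src: broadcast lane 0 to all four lanes.
  return _mm_shuffle_ps(src, src, 0);
}

int main() {
  float out[4];
  _mm_storeu_ps(out, F32x4SplatModel(_mm_setr_ps(3.5f, 1.f, 2.f, 4.f)));
  std::printf("%.1f %.1f %.1f %.1f\n", out[0], out[1], out[2], out[3]);  // all 3.5
}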
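F32x4ExtractLane avoids insertps/extractps and instead picks the shortest shuffle per lane, accepting junk in the upper lanes of dst since only the low float matters. The same per-lane selection written with intrinsics (SSE3 assumed for _mm_movehdup_ps; names illustrative):

// Per-lane extraction choices; build with SSE3 support (e.g. -msse3).
#include <pmmintrin.h>
#include <cstdio>

static float F32x4ExtractLaneModel(__m128 v, int lane) {
  switch (lane) {
    case 0:  return _mm_cvtss_f32(v);                        // already in lane 0
    case 1:  return _mm_cvtss_f32(_mm_movehdup_ps(v));       // movshdup
    case 2:  return _mm_cvtss_f32(_mm_movehl_ps(v, v));      // movhlps
    default: return _mm_cvtss_f32(_mm_shuffle_ps(v, v, 3));  // shufps, lane 3
  }
}

int main() {
  __m128 v = _mm_setr_ps(10.f, 11.f, 12.f, 13.f);
  for (int lane = 0; lane < 4; ++lane)
    std::printf("lane %d = %.1f\n", lane, F32x4ExtractLaneModel(v, lane));
}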
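I64x2Neg computes 0 - src with psubq because SSE/AVX has no packed 64-bit negate; the SSE path in the diff first copies src into scratch when it aliases dst, so zeroing dst with pxor does not destroy the input. Intrinsics model (aliasing is a non-issue for value-based intrinsics; name illustrative):

#include <emmintrin.h>
#include <cstdio>

static __m128i I64x2NegModel(__m128i src) {
  __m128i zero = _mm_setzero_si128();  // pxor dst, dst
  return _mm_sub_epi64(zero, src);     // psubq: dst = 0 - src
}

int main() {
  long long out[2];
  _mm_storeu_si128(reinterpret_cast<__m128i*>(out),
                   I64x2NegModel(_mm_set_epi64x(-7, 42)));  // {42, -7}
  std::printf("%lld %lld\n", out[0], out[1]);  // -42 7
}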
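S128Not likewise has no direct instruction: it materialises an all-ones mask with pcmpeqd (a register compared with itself always matches) and xors the source with it, with the operand order in the diff chosen so both the dst == src and dst != src cases avoid an extra move. The core trick with intrinsics (name illustrative):

#include <emmintrin.h>
#include <cstdio>

static __m128i S128NotModel(__m128i src) {
  __m128i ones = _mm_cmpeq_epi32(src, src);  // pcmpeqd reg, reg -> all ones
  return _mm_xor_si128(src, ones);           // pxor -> bitwise NOT
}

int main() {
  unsigned out[4];
  _mm_storeu_si128(reinterpret_cast<__m128i*>(out),
                   S128NotModel(_mm_set1_epi32(0x0f0f0f0f)));
  std::printf("%08x\n", out[0]);  // f0f0f0f0
}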