| // Copyright 2021 the V8 project authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #ifndef V8_CODEGEN_SHARED_IA32_X64_MACRO_ASSEMBLER_SHARED_IA32_X64_H_ |
| #define V8_CODEGEN_SHARED_IA32_X64_MACRO_ASSEMBLER_SHARED_IA32_X64_H_ |
| |
| #include "src/base/macros.h" |
| #include "src/codegen/cpu-features.h" |
| #include "src/codegen/external-reference.h" |
| #include "src/codegen/turbo-assembler.h" |
| |
| #if V8_TARGET_ARCH_IA32 |
| #include "src/codegen/ia32/register-ia32.h" |
| #elif V8_TARGET_ARCH_X64 |
| #include "src/codegen/x64/register-x64.h" |
| #else |
| #error Unsupported target architecture. |
| #endif |
| |
| namespace v8 { |
| namespace internal { |
| class Assembler; |
| |
| // For WebAssembly we care about the full floating point register. If we are |
| // not running Wasm, we can get away with saving only half of each of those |
| // registers. |
| #if V8_ENABLE_WEBASSEMBLY |
| constexpr int kStackSavedSavedFPSize = 2 * kDoubleSize; |
| #else |
| constexpr int kStackSavedSavedFPSize = kDoubleSize; |
| #endif // V8_ENABLE_WEBASSEMBLY |
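| // In other words, each saved FP register occupies a 128-bit stack slot when |
| // Wasm is enabled and a 64-bit slot otherwise; for example, a frame that |
| // spills N FP registers reserves N * kStackSavedSavedFPSize bytes. |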
| |
| // Base class for SharedTurboAssemblerBase. This class contains macro-assembler |
| // functions that can be shared across ia32 and x64 without any template |
| // machinery, i.e. they do not require the CRTP pattern that |
| // SharedTurboAssemblerBase exposes. This allows us to keep the bulk of the |
| // definitions in a separate source file, rather than putting everything |
| // inside this header. |
| class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase { |
| public: |
| using TurboAssemblerBase::TurboAssemblerBase; |
| |
| void Move(Register dst, uint32_t src); |
| // Move if registers are not identical. |
| void Move(Register dst, Register src); |
| void Add(Register dst, Immediate src); |
| void And(Register dst, Immediate src); |
| |
| // Will move src1 to dst if AVX is not supported. |
| void Movhps(XMMRegister dst, XMMRegister src1, Operand src2); |
| void Movlps(XMMRegister dst, XMMRegister src1, Operand src2); |
| |
| void Pblendvb(XMMRegister dst, XMMRegister src1, XMMRegister src2, |
| XMMRegister mask); |
| |
| template <typename Op> |
| void Pinsrb(XMMRegister dst, XMMRegister src1, Op src2, uint8_t imm8, |
| uint32_t* load_pc_offset = nullptr) { |
| PinsrHelper(this, &Assembler::vpinsrb, &Assembler::pinsrb, dst, src1, src2, |
| imm8, load_pc_offset, {SSE4_1}); |
| } |
| |
| template <typename Op> |
| void Pinsrw(XMMRegister dst, XMMRegister src1, Op src2, uint8_t imm8, |
| uint32_t* load_pc_offset = nullptr) { |
| PinsrHelper(this, &Assembler::vpinsrw, &Assembler::pinsrw, dst, src1, src2, |
| imm8, load_pc_offset); |
| } |
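| |
| // For example (the dispatch is performed by PinsrHelper below): |
| //   Pinsrb(x, y, op, 3)  ->  vpinsrb(x, y, op, 3)            with AVX |
| //                        ->  movaps(x, y); pinsrb(x, op, 3)  with SSE4_1 |
| // (the movaps is skipped when x == y). |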
| |
| // Supports both SSE and AVX. On SSE, moves src to dst if they are not equal. |
| template <typename Op> |
| void Pshufb(XMMRegister dst, XMMRegister src, Op mask) { |
| if (CpuFeatures::IsSupported(AVX)) { |
| CpuFeatureScope avx_scope(this, AVX); |
| vpshufb(dst, src, mask); |
| } else { |
| // Make sure these are different so that we won't overwrite mask. |
| DCHECK_NE(mask, dst); |
| if (dst != src) { |
| movaps(dst, src); |
| } |
| CpuFeatureScope sse_scope(this, SSSE3); |
| pshufb(dst, mask); |
| } |
| } |
| |
| template <typename Op> |
| void Pshufb(XMMRegister dst, Op mask) { |
| Pshufb(dst, dst, mask); |
| } |
| |
| // Shufps that will move src1 into dst if AVX is not supported. |
| void Shufps(XMMRegister dst, XMMRegister src1, XMMRegister src2, |
| uint8_t imm8); |
| |
| // Helper struct to implement functions that check for AVX support and |
| // dispatch to the appropriate AVX/SSE instruction. |
| template <typename Dst, typename Arg, typename... Args> |
| struct AvxHelper { |
| Assembler* assm; |
| base::Optional<CpuFeature> feature = base::nullopt; |
| // Call a method where the AVX version expects the dst argument to be |
| // duplicated. |
| // E.g. Andps(x, y) -> vandps(x, x, y) |
| // -> andps(x, y) |
| template <void (Assembler::*avx)(Dst, Dst, Arg, Args...), |
| void (Assembler::*no_avx)(Dst, Arg, Args...)> |
| void emit(Dst dst, Arg arg, Args... args) { |
| if (CpuFeatures::IsSupported(AVX)) { |
| CpuFeatureScope scope(assm, AVX); |
| (assm->*avx)(dst, dst, arg, args...); |
| } else if (feature.has_value()) { |
| DCHECK(CpuFeatures::IsSupported(*feature)); |
| CpuFeatureScope scope(assm, *feature); |
| (assm->*no_avx)(dst, arg, args...); |
| } else { |
| (assm->*no_avx)(dst, arg, args...); |
| } |
| } |
| |
| // Call a method in the AVX form (one extra operand); if AVX is unsupported, |
| // check that dst == first src. |
| // E.g. Andps(x, y, z) -> vandps(x, y, z) |
| // -> andps(x, z) and check that x == y |
| template <void (Assembler::*avx)(Dst, Arg, Args...), |
| void (Assembler::*no_avx)(Dst, Args...)> |
| void emit(Dst dst, Arg arg, Args... args) { |
| if (CpuFeatures::IsSupported(AVX)) { |
| CpuFeatureScope scope(assm, AVX); |
| (assm->*avx)(dst, arg, args...); |
| } else if (feature.has_value()) { |
| DCHECK_EQ(dst, arg); |
| DCHECK(CpuFeatures::IsSupported(*feature)); |
| CpuFeatureScope scope(assm, *feature); |
| (assm->*no_avx)(dst, args...); |
| } else { |
| DCHECK_EQ(dst, arg); |
| (assm->*no_avx)(dst, args...); |
| } |
| } |
| |
| // Call a method where the AVX version expects no duplicated dst argument. |
| // E.g. Movddup(x, y) -> vmovddup(x, y) |
| // -> movddup(x, y) |
| template <void (Assembler::*avx)(Dst, Arg, Args...), |
| void (Assembler::*no_avx)(Dst, Arg, Args...)> |
| void emit(Dst dst, Arg arg, Args... args) { |
| if (CpuFeatures::IsSupported(AVX)) { |
| CpuFeatureScope scope(assm, AVX); |
| (assm->*avx)(dst, arg, args...); |
| } else if (feature.has_value()) { |
| DCHECK(CpuFeatures::IsSupported(*feature)); |
| CpuFeatureScope scope(assm, *feature); |
| (assm->*no_avx)(dst, arg, args...); |
| } else { |
| (assm->*no_avx)(dst, arg, args...); |
| } |
| } |
| }; |
| |
| #define AVX_OP(macro_name, name) \ |
| template <typename Dst, typename Arg, typename... Args> \ |
| void macro_name(Dst dst, Arg arg, Args... args) { \ |
| AvxHelper<Dst, Arg, Args...>{this} \ |
| .template emit<&Assembler::v##name, &Assembler::name>(dst, arg, \ |
| args...); \ |
| } |
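| |
| // As an illustration, AVX_OP(Addps, addps) defines a macro-assembler |
| // function Addps such that |
| //   Addps(x, y)  ->  vaddps(x, x, y)  when AVX is supported |
| //                ->  addps(x, y)      otherwise |
| // (this goes through the first AvxHelper::emit overload above). |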
| |
| // Define a macro which uses |avx_name| when AVX is supported, and |sse_name| |
| // when AVX is not supported. This is useful for bit-wise instructions like |
| // andpd/andps, where the behavior is exactly the same, but the *ps |
| // version is 1 byte shorter, and on SSE-only processors there is no |
| // performance difference since those processors don't differentiate integer |
| // and floating-point domains. |
| // Note: we require |avx_name| to be the AVX instruction without the "v" |
| // prefix. If we required the full AVX instruction name and the caller |
| // accidentally passed in an SSE instruction, the code would compile without |
| // any issues and silently generate the SSE instruction. By appending "v" |
| // here, we ensure that we will generate an AVX instruction. |
| #define AVX_OP_WITH_DIFF_SSE_INSTR(macro_name, avx_name, sse_name) \ |
| template <typename Dst, typename Arg, typename... Args> \ |
| void macro_name(Dst dst, Arg arg, Args... args) { \ |
| AvxHelper<Dst, Arg, Args...>{this} \ |
| .template emit<&Assembler::v##avx_name, &Assembler::sse_name>( \ |
| dst, arg, args...); \ |
| } |
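| |
| // For example, with the definitions below, |
| //   Pxor(x, y)  ->  vpxor(x, x, y)  when AVX is supported |
| //               ->  xorps(x, y)     otherwise |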
| |
| #define AVX_OP_SSE3(macro_name, name) \ |
| template <typename Dst, typename Arg, typename... Args> \ |
| void macro_name(Dst dst, Arg arg, Args... args) { \ |
| AvxHelper<Dst, Arg, Args...>{this, base::Optional<CpuFeature>(SSE3)} \ |
| .template emit<&Assembler::v##name, &Assembler::name>(dst, arg, \ |
| args...); \ |
| } |
| |
| #define AVX_OP_SSSE3(macro_name, name) \ |
| template <typename Dst, typename Arg, typename... Args> \ |
| void macro_name(Dst dst, Arg arg, Args... args) { \ |
| AvxHelper<Dst, Arg, Args...>{this, base::Optional<CpuFeature>(SSSE3)} \ |
| .template emit<&Assembler::v##name, &Assembler::name>(dst, arg, \ |
| args...); \ |
| } |
| |
| #define AVX_OP_SSE4_1(macro_name, name) \ |
| template <typename Dst, typename Arg, typename... Args> \ |
| void macro_name(Dst dst, Arg arg, Args... args) { \ |
| AvxHelper<Dst, Arg, Args...>{this, base::Optional<CpuFeature>(SSE4_1)} \ |
| .template emit<&Assembler::v##name, &Assembler::name>(dst, arg, \ |
| args...); \ |
| } |
| |
| #define AVX_OP_SSE4_2(macro_name, name) \ |
| template <typename Dst, typename Arg, typename... Args> \ |
| void macro_name(Dst dst, Arg arg, Args... args) { \ |
| AvxHelper<Dst, Arg, Args...>{this, base::Optional<CpuFeature>(SSE4_2)} \ |
| .template emit<&Assembler::v##name, &Assembler::name>(dst, arg, \ |
| args...); \ |
| } |
| |
| // Keep this list sorted by required extension, then instruction name. |
| AVX_OP(Addpd, addpd) |
| AVX_OP(Addps, addps) |
| AVX_OP(Addsd, addsd) |
| AVX_OP(Addss, addss) |
| AVX_OP(Andnpd, andnpd) |
| AVX_OP(Andnps, andnps) |
| AVX_OP(Andpd, andpd) |
| AVX_OP(Andps, andps) |
| AVX_OP(Cmpeqpd, cmpeqpd) |
| AVX_OP(Cmpeqps, cmpeqps) |
| AVX_OP(Cmplepd, cmplepd) |
| AVX_OP(Cmpleps, cmpleps) |
| AVX_OP(Cmpltpd, cmpltpd) |
| AVX_OP(Cmpltps, cmpltps) |
| AVX_OP(Cmpneqpd, cmpneqpd) |
| AVX_OP(Cmpneqps, cmpneqps) |
| AVX_OP(Cmpunordpd, cmpunordpd) |
| AVX_OP(Cmpunordps, cmpunordps) |
| AVX_OP(Cvtdq2pd, cvtdq2pd) |
| AVX_OP(Cvtdq2ps, cvtdq2ps) |
| AVX_OP(Cvtpd2ps, cvtpd2ps) |
| AVX_OP(Cvtps2pd, cvtps2pd) |
| AVX_OP(Cvtsd2ss, cvtsd2ss) |
| AVX_OP(Cvtss2sd, cvtss2sd) |
| AVX_OP(Cvttpd2dq, cvttpd2dq) |
| AVX_OP(Cvttps2dq, cvttps2dq) |
| AVX_OP(Cvttsd2si, cvttsd2si) |
| AVX_OP(Cvttss2si, cvttss2si) |
| AVX_OP(Divpd, divpd) |
| AVX_OP(Divps, divps) |
| AVX_OP(Divsd, divsd) |
| AVX_OP(Divss, divss) |
| AVX_OP(Maxpd, maxpd) |
| AVX_OP(Maxps, maxps) |
| AVX_OP(Minpd, minpd) |
| AVX_OP(Minps, minps) |
| AVX_OP(Movaps, movaps) |
| AVX_OP(Movd, movd) |
| AVX_OP(Movhlps, movhlps) |
| AVX_OP(Movhps, movhps) |
| AVX_OP(Movlps, movlps) |
| AVX_OP(Movmskpd, movmskpd) |
| AVX_OP(Movmskps, movmskps) |
| AVX_OP(Movsd, movsd) |
| AVX_OP(Movss, movss) |
| AVX_OP(Movupd, movupd) |
| AVX_OP(Movups, movups) |
| AVX_OP(Mulpd, mulpd) |
| AVX_OP(Mulps, mulps) |
| AVX_OP(Mulsd, mulsd) |
| AVX_OP(Mulss, mulss) |
| AVX_OP(Orpd, orpd) |
| AVX_OP(Orps, orps) |
| AVX_OP(Packssdw, packssdw) |
| AVX_OP(Packsswb, packsswb) |
| AVX_OP(Packuswb, packuswb) |
| AVX_OP(Paddb, paddb) |
| AVX_OP(Paddd, paddd) |
| AVX_OP(Paddq, paddq) |
| AVX_OP(Paddsb, paddsb) |
| AVX_OP(Paddsw, paddsw) |
| AVX_OP(Paddusb, paddusb) |
| AVX_OP(Paddusw, paddusw) |
| AVX_OP(Paddw, paddw) |
| AVX_OP(Pavgb, pavgb) |
| AVX_OP(Pavgw, pavgw) |
| AVX_OP(Pcmpgtb, pcmpgtb) |
| AVX_OP(Pcmpgtd, pcmpgtd) |
| AVX_OP(Pcmpgtw, pcmpgtw) |
| AVX_OP(Pcmpeqb, pcmpeqb) |
| AVX_OP(Pcmpeqd, pcmpeqd) |
| AVX_OP(Pcmpeqw, pcmpeqw) |
| AVX_OP(Pmaddwd, pmaddwd) |
| AVX_OP(Pmaxsw, pmaxsw) |
| AVX_OP(Pmaxub, pmaxub) |
| AVX_OP(Pminsw, pminsw) |
| AVX_OP(Pminub, pminub) |
| AVX_OP(Pmovmskb, pmovmskb) |
| AVX_OP(Pmullw, pmullw) |
| AVX_OP(Pmuludq, pmuludq) |
| AVX_OP(Pshufd, pshufd) |
| AVX_OP(Pshufhw, pshufhw) |
| AVX_OP(Pshuflw, pshuflw) |
| AVX_OP(Pslld, pslld) |
| AVX_OP(Psllq, psllq) |
| AVX_OP(Psllw, psllw) |
| AVX_OP(Psrad, psrad) |
| AVX_OP(Psraw, psraw) |
| AVX_OP(Psrld, psrld) |
| AVX_OP(Psrlq, psrlq) |
| AVX_OP(Psrlw, psrlw) |
| AVX_OP(Psubb, psubb) |
| AVX_OP(Psubd, psubd) |
| AVX_OP(Psubq, psubq) |
| AVX_OP(Psubsb, psubsb) |
| AVX_OP(Psubsw, psubsw) |
| AVX_OP(Psubusb, psubusb) |
| AVX_OP(Psubusw, psubusw) |
| AVX_OP(Psubw, psubw) |
| AVX_OP(Punpckhbw, punpckhbw) |
| AVX_OP(Punpckhdq, punpckhdq) |
| AVX_OP(Punpckhqdq, punpckhqdq) |
| AVX_OP(Punpckhwd, punpckhwd) |
| AVX_OP(Punpcklbw, punpcklbw) |
| AVX_OP(Punpckldq, punpckldq) |
| AVX_OP(Punpcklqdq, punpcklqdq) |
| AVX_OP(Punpcklwd, punpcklwd) |
| AVX_OP(Rcpps, rcpps) |
| AVX_OP(Rsqrtps, rsqrtps) |
| AVX_OP(Sqrtpd, sqrtpd) |
| AVX_OP(Sqrtps, sqrtps) |
| AVX_OP(Sqrtsd, sqrtsd) |
| AVX_OP(Sqrtss, sqrtss) |
| AVX_OP(Subpd, subpd) |
| AVX_OP(Subps, subps) |
| AVX_OP(Subsd, subsd) |
| AVX_OP(Subss, subss) |
| AVX_OP(Ucomisd, ucomisd) |
| AVX_OP(Ucomiss, ucomiss) |
| AVX_OP(Unpcklps, unpcklps) |
| AVX_OP(Xorpd, xorpd) |
| AVX_OP(Xorps, xorps) |
| |
| // Many AVX processors have separate integer/floating-point domains, so keep |
| // the domain-specific AVX instruction (e.g. vmovdqa) when AVX is supported. |
| // On SSE, movaps is 1 byte shorter than movdqa and has the same behavior. |
| // Most SSE-only processors also don't have the same delay when moving |
| // between the integer and floating-point domains. |
| AVX_OP_WITH_DIFF_SSE_INSTR(Movapd, movapd, movaps) |
| AVX_OP_WITH_DIFF_SSE_INSTR(Movdqa, movdqa, movaps) |
| AVX_OP_WITH_DIFF_SSE_INSTR(Movdqu, movdqu, movups) |
| AVX_OP_WITH_DIFF_SSE_INSTR(Pand, pand, andps) |
| AVX_OP_WITH_DIFF_SSE_INSTR(Por, por, orps) |
| AVX_OP_WITH_DIFF_SSE_INSTR(Pxor, pxor, xorps) |
| |
| AVX_OP_SSE3(Haddps, haddps) |
| AVX_OP_SSE3(Movddup, movddup) |
| AVX_OP_SSE3(Movshdup, movshdup) |
| |
| AVX_OP_SSSE3(Pabsb, pabsb) |
| AVX_OP_SSSE3(Pabsd, pabsd) |
| AVX_OP_SSSE3(Pabsw, pabsw) |
| AVX_OP_SSSE3(Palignr, palignr) |
| AVX_OP_SSSE3(Pmulhrsw, pmulhrsw) |
| AVX_OP_SSSE3(Psignb, psignb) |
| AVX_OP_SSSE3(Psignd, psignd) |
| AVX_OP_SSSE3(Psignw, psignw) |
| |
| AVX_OP_SSE4_1(Extractps, extractps) |
| AVX_OP_SSE4_1(Insertps, insertps) |
| AVX_OP_SSE4_1(Packusdw, packusdw) |
| AVX_OP_SSE4_1(Pblendw, pblendw) |
| AVX_OP_SSE4_1(Pcmpeqq, pcmpeqq) |
| AVX_OP_SSE4_1(Pextrb, pextrb) |
| AVX_OP_SSE4_1(Pextrw, pextrw) |
| AVX_OP_SSE4_1(Pmaxsb, pmaxsb) |
| AVX_OP_SSE4_1(Pmaxsd, pmaxsd) |
| AVX_OP_SSE4_1(Pmaxud, pmaxud) |
| AVX_OP_SSE4_1(Pmaxuw, pmaxuw) |
| AVX_OP_SSE4_1(Pminsb, pminsb) |
| AVX_OP_SSE4_1(Pminsd, pminsd) |
| AVX_OP_SSE4_1(Pminud, pminud) |
| AVX_OP_SSE4_1(Pminuw, pminuw) |
| AVX_OP_SSE4_1(Pmovsxbw, pmovsxbw) |
| AVX_OP_SSE4_1(Pmovsxdq, pmovsxdq) |
| AVX_OP_SSE4_1(Pmovsxwd, pmovsxwd) |
| AVX_OP_SSE4_1(Pmovzxbw, pmovzxbw) |
| AVX_OP_SSE4_1(Pmovzxdq, pmovzxdq) |
| AVX_OP_SSE4_1(Pmovzxwd, pmovzxwd) |
| AVX_OP_SSE4_1(Pmulld, pmulld) |
| AVX_OP_SSE4_1(Ptest, ptest) |
| AVX_OP_SSE4_1(Roundpd, roundpd) |
| AVX_OP_SSE4_1(Roundps, roundps) |
| AVX_OP_SSE4_1(Roundsd, roundsd) |
| AVX_OP_SSE4_1(Roundss, roundss) |
| |
| #undef AVX_OP |
| #undef AVX_OP_WITH_DIFF_SSE_INSTR |
| #undef AVX_OP_SSE3 |
| #undef AVX_OP_SSSE3 |
| #undef AVX_OP_SSE4_1 |
| #undef AVX_OP_SSE4_2 |
| |
| void F64x2ExtractLane(DoubleRegister dst, XMMRegister src, uint8_t lane); |
| void F64x2ReplaceLane(XMMRegister dst, XMMRegister src, DoubleRegister rep, |
| uint8_t lane); |
| void F64x2Min(XMMRegister dst, XMMRegister lhs, XMMRegister rhs, |
| XMMRegister scratch); |
| void F64x2Max(XMMRegister dst, XMMRegister lhs, XMMRegister rhs, |
| XMMRegister scratch); |
| void F32x4Splat(XMMRegister dst, DoubleRegister src); |
| void F32x4ExtractLane(FloatRegister dst, XMMRegister src, uint8_t lane); |
| void F32x4Min(XMMRegister dst, XMMRegister lhs, XMMRegister rhs, |
| XMMRegister scratch); |
| void F32x4Max(XMMRegister dst, XMMRegister lhs, XMMRegister rhs, |
| XMMRegister scratch); |
| void S128Store32Lane(Operand dst, XMMRegister src, uint8_t laneidx); |
| void I8x16Splat(XMMRegister dst, Register src, XMMRegister scratch); |
| void I8x16Splat(XMMRegister dst, Operand src, XMMRegister scratch); |
| void I8x16Shl(XMMRegister dst, XMMRegister src1, uint8_t src2, Register tmp1, |
| XMMRegister tmp2); |
| void I8x16Shl(XMMRegister dst, XMMRegister src1, Register src2, Register tmp1, |
| XMMRegister tmp2, XMMRegister tmp3); |
| void I8x16ShrS(XMMRegister dst, XMMRegister src1, uint8_t src2, |
| XMMRegister tmp); |
| void I8x16ShrS(XMMRegister dst, XMMRegister src1, Register src2, |
| Register tmp1, XMMRegister tmp2, XMMRegister tmp3); |
| void I8x16ShrU(XMMRegister dst, XMMRegister src1, uint8_t src2, Register tmp1, |
| XMMRegister tmp2); |
| void I8x16ShrU(XMMRegister dst, XMMRegister src1, Register src2, |
| Register tmp1, XMMRegister tmp2, XMMRegister tmp3); |
| void I16x8Splat(XMMRegister dst, Register src); |
| void I16x8Splat(XMMRegister dst, Operand src); |
| void I16x8ExtMulLow(XMMRegister dst, XMMRegister src1, XMMRegister src2, |
| XMMRegister scratch, bool is_signed); |
| void I16x8ExtMulHighS(XMMRegister dst, XMMRegister src1, XMMRegister src2, |
| XMMRegister scratch); |
| void I16x8ExtMulHighU(XMMRegister dst, XMMRegister src1, XMMRegister src2, |
| XMMRegister scratch); |
| void I16x8SConvertI8x16High(XMMRegister dst, XMMRegister src); |
| void I16x8UConvertI8x16High(XMMRegister dst, XMMRegister src, |
| XMMRegister scratch); |
| // Will move src1 to dst if AVX is not supported. |
| void I16x8Q15MulRSatS(XMMRegister dst, XMMRegister src1, XMMRegister src2, |
| XMMRegister scratch); |
| void I32x4ExtAddPairwiseI16x8U(XMMRegister dst, XMMRegister src, |
| XMMRegister tmp); |
| // Requires that dst == src1 if AVX is not supported. |
| void I32x4ExtMul(XMMRegister dst, XMMRegister src1, XMMRegister src2, |
| XMMRegister scratch, bool low, bool is_signed); |
| void I32x4SConvertI16x8High(XMMRegister dst, XMMRegister src); |
| void I32x4UConvertI16x8High(XMMRegister dst, XMMRegister src, |
| XMMRegister scratch); |
| void I64x2Neg(XMMRegister dst, XMMRegister src, XMMRegister scratch); |
| void I64x2Abs(XMMRegister dst, XMMRegister src, XMMRegister scratch); |
| void I64x2GtS(XMMRegister dst, XMMRegister src0, XMMRegister src1, |
| XMMRegister scratch); |
| void I64x2GeS(XMMRegister dst, XMMRegister src0, XMMRegister src1, |
| XMMRegister scratch); |
| void I64x2ShrS(XMMRegister dst, XMMRegister src, uint8_t shift, |
| XMMRegister xmm_tmp); |
| void I64x2ShrS(XMMRegister dst, XMMRegister src, Register shift, |
| XMMRegister xmm_tmp, XMMRegister xmm_shift, |
| Register tmp_shift); |
| void I64x2Mul(XMMRegister dst, XMMRegister lhs, XMMRegister rhs, |
| XMMRegister tmp1, XMMRegister tmp2); |
| void I64x2ExtMul(XMMRegister dst, XMMRegister src1, XMMRegister src2, |
| XMMRegister scratch, bool low, bool is_signed); |
| void I64x2SConvertI32x4High(XMMRegister dst, XMMRegister src); |
| void I64x2UConvertI32x4High(XMMRegister dst, XMMRegister src, |
| XMMRegister scratch); |
| void S128Not(XMMRegister dst, XMMRegister src, XMMRegister scratch); |
| // Requires dst == mask when AVX is not supported. |
| void S128Select(XMMRegister dst, XMMRegister mask, XMMRegister src1, |
| XMMRegister src2, XMMRegister scratch); |
| void S128Load8Splat(XMMRegister dst, Operand src, XMMRegister scratch); |
| void S128Load16Splat(XMMRegister dst, Operand src, XMMRegister scratch); |
| void S128Load32Splat(XMMRegister dst, Operand src); |
| void S128Store64Lane(Operand dst, XMMRegister src, uint8_t laneidx); |
| |
| void F64x2Qfma(XMMRegister dst, XMMRegister src1, XMMRegister src2, |
| XMMRegister src3, XMMRegister tmp); |
| void F64x2Qfms(XMMRegister dst, XMMRegister src1, XMMRegister src2, |
| XMMRegister src3, XMMRegister tmp); |
| void F32x4Qfma(XMMRegister dst, XMMRegister src1, XMMRegister src2, |
| XMMRegister src3, XMMRegister tmp); |
| void F32x4Qfms(XMMRegister dst, XMMRegister src1, XMMRegister src2, |
| XMMRegister src3, XMMRegister tmp); |
| |
| protected: |
| template <typename Op> |
| using AvxFn = void (Assembler::*)(XMMRegister, XMMRegister, Op, uint8_t); |
| template <typename Op> |
| using NoAvxFn = void (Assembler::*)(XMMRegister, Op, uint8_t); |
| |
| template <typename Op> |
| void PinsrHelper(Assembler* assm, AvxFn<Op> avx, NoAvxFn<Op> noavx, |
| XMMRegister dst, XMMRegister src1, Op src2, uint8_t imm8, |
| uint32_t* load_pc_offset = nullptr, |
| base::Optional<CpuFeature> feature = base::nullopt) { |
| if (CpuFeatures::IsSupported(AVX)) { |
| CpuFeatureScope scope(assm, AVX); |
| if (load_pc_offset) *load_pc_offset = assm->pc_offset(); |
| (assm->*avx)(dst, src1, src2, imm8); |
| return; |
| } |
| |
| if (dst != src1) assm->movaps(dst, src1); |
| if (load_pc_offset) *load_pc_offset = assm->pc_offset(); |
| if (feature.has_value()) { |
| DCHECK(CpuFeatures::IsSupported(*feature)); |
| CpuFeatureScope scope(assm, *feature); |
| (assm->*noavx)(dst, src2, imm8); |
| } else { |
| (assm->*noavx)(dst, src2, imm8); |
| } |
| } |
| |
| private: |
| template <typename Op> |
| void I8x16SplatPreAvx2(XMMRegister dst, Op src, XMMRegister scratch); |
| template <typename Op> |
| void I16x8SplatPreAvx2(XMMRegister dst, Op src); |
| }; |
| |
| // Common base class template shared by ia32 and x64 TurboAssembler. This uses |
| // the Curiously Recurring Template Pattern (CRTP), where Impl is the actual |
| // class (subclass of SharedTurboAssemblerBase instantiated with the actual |
| // class). This allows static polymorphism: member functions can be moved |
| // into this shared code while still being able to call member functions |
| // defined in the ia32- or x64-specific TurboAssembler, via Impl. |
| // |
| // Note: all member functions must be defined in this header file so that the |
| // compiler can generate code for the function definitions. See |
| // https://isocpp.org/wiki/faq/templates#templates-defn-vs-decl for rationale. |
| // If a function does not need polymorphism, move it into SharedTurboAssembler, |
| // and define it outside of this header. |
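| // |
| // For illustration only (the concrete classes live in the ia32/x64 ports), |
| // a port is expected to instantiate this template roughly as |
| //   class TurboAssembler : public SharedTurboAssemblerBase<TurboAssembler> { |
| //     // Architecture-specific helpers called below via impl(), e.g. |
| //     // PextrdPreSse41, PinsrdPreSse41, ExternalReferenceAsOperand. |
| //   }; |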
| template <typename Impl> |
| class V8_EXPORT_PRIVATE SharedTurboAssemblerBase : public SharedTurboAssembler { |
| using SharedTurboAssembler::SharedTurboAssembler; |
| |
| public: |
| void Abspd(XMMRegister dst, XMMRegister src, Register tmp) { |
| FloatUnop(dst, src, tmp, &SharedTurboAssembler::Andps, |
| ExternalReference::address_of_double_abs_constant()); |
| } |
| |
| void Absps(XMMRegister dst, XMMRegister src, Register tmp) { |
| FloatUnop(dst, src, tmp, &SharedTurboAssembler::Andps, |
| ExternalReference::address_of_float_abs_constant()); |
| } |
| |
| void Negpd(XMMRegister dst, XMMRegister src, Register tmp) { |
| FloatUnop(dst, src, tmp, &SharedTurboAssembler::Xorps, |
| ExternalReference::address_of_double_neg_constant()); |
| } |
| |
| void Negps(XMMRegister dst, XMMRegister src, Register tmp) { |
| FloatUnop(dst, src, tmp, &SharedTurboAssembler::Xorps, |
| ExternalReference::address_of_float_neg_constant()); |
| } |
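| |
| // The four helpers above differ only in the constant and the bit-wise op: |
| // per their names, the abs constants are expected to be all-lanes masks |
| // that clear the sign bit, and the neg constants masks that contain only |
| // the sign bit, so that, for instance, |
| //   Absps(x, y, tmp)  ->  Andps(x, y, [0x7fffffff x4]) |
| //   Negps(x, y, tmp)  ->  Xorps(x, y, [0x80000000 x4]) |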
| |
| void Pextrd(Register dst, XMMRegister src, uint8_t imm8) { |
| if (imm8 == 0) { |
| Movd(dst, src); |
| return; |
| } |
| |
| if (CpuFeatures::IsSupported(AVX)) { |
| CpuFeatureScope scope(this, AVX); |
| vpextrd(dst, src, imm8); |
| } else if (CpuFeatures::IsSupported(SSE4_1)) { |
| CpuFeatureScope sse_scope(this, SSE4_1); |
| pextrd(dst, src, imm8); |
| } else { |
| DCHECK_LT(imm8, 2); |
| impl()->PextrdPreSse41(dst, src, imm8); |
| } |
| } |
| |
| template <typename Op> |
| void Pinsrd(XMMRegister dst, XMMRegister src1, Op src2, uint8_t imm8, |
| uint32_t* load_pc_offset = nullptr) { |
| if (CpuFeatures::IsSupported(SSE4_1)) { |
| PinsrHelper(this, &Assembler::vpinsrd, &Assembler::pinsrd, dst, src1, |
| src2, imm8, load_pc_offset, |
| base::Optional<CpuFeature>(SSE4_1)); |
| } else { |
| if (dst != src1) { |
| movaps(dst, src1); |
| } |
| impl()->PinsrdPreSse41(dst, src2, imm8, load_pc_offset); |
| } |
| } |
| |
| template <typename Op> |
| void Pinsrd(XMMRegister dst, Op src, uint8_t imm8, |
| uint32_t* load_pc_offset = nullptr) { |
| Pinsrd(dst, dst, src, imm8, load_pc_offset); |
| } |
| |
| void F64x2ConvertLowI32x4U(XMMRegister dst, XMMRegister src, |
| Register scratch) { |
| ASM_CODE_COMMENT(this); |
| // dst = [ src_low, 0x43300000, src_high, 0x43300000 ]; |
| // 0x43300000'00000000 is a special double (2^52) whose significand bits |
| // can precisely represent all uint32 numbers. |
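| // Worked example for one lane with src_low == 5: the lane becomes |
| // [ 5, 0x43300000 ], i.e. the double 2^52 + 5, and subtracting 2^52 below |
| // yields exactly 5.0. |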
| if (!CpuFeatures::IsSupported(AVX) && dst != src) { |
| movaps(dst, src); |
| src = dst; |
| } |
| Unpcklps(dst, src, |
| ExternalReferenceAsOperand( |
| ExternalReference:: |
| address_of_wasm_f64x2_convert_low_i32x4_u_int_mask(), |
| scratch)); |
| Subpd(dst, |
| ExternalReferenceAsOperand( |
| ExternalReference::address_of_wasm_double_2_power_52(), scratch)); |
| } |
| |
| void I32x4SConvertF32x4(XMMRegister dst, XMMRegister src, XMMRegister tmp, |
| Register scratch) { |
| ASM_CODE_COMMENT(this); |
| Operand op = ExternalReferenceAsOperand( |
| ExternalReference::address_of_wasm_int32_overflow_as_float(), scratch); |
| |
| // This algorithm works by: |
| // 1. lanes with NaNs are zeroed |
| // 2. lanes greater than or equal to 2147483648.0f (MAX_INT32 + 1) are set |
| //    to 0xffff'ffff |
| // 3. cvttps2dq sets all out-of-range lanes to 0x8000'0000 |
| //   a. correct for underflows (< MIN_INT32) |
| //   b. wrong for overflows, and we know which lanes overflowed from 2. |
| // 4. adjust for 3b by xor-ing 2 and 3 |
| //   a. 0x8000'0000 xor 0xffff'ffff = 0x7fff'ffff (MAX_INT32) |
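| // Worked example for a single overflowing lane, src == 3.0e9f: |
| //   step 2: 3.0e9f >= 2147483648.0f, so the compare lane is 0xffff'ffff |
| //   step 3: cvttps2dq returns 0x8000'0000 for the out-of-range lane |
| //   step 4: 0x8000'0000 xor 0xffff'ffff = 0x7fff'ffff (saturated result) |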
| if (CpuFeatures::IsSupported(AVX)) { |
| CpuFeatureScope scope(this, AVX); |
| vcmpeqps(tmp, src, src); |
| vandps(dst, src, tmp); |
| vcmpgeps(tmp, src, op); |
| vcvttps2dq(dst, dst); |
| vpxor(dst, dst, tmp); |
| } else { |
| if (src == dst) { |
| movaps(tmp, src); |
| cmpeqps(tmp, tmp); |
| andps(dst, tmp); |
| movaps(tmp, op); |
| cmpleps(tmp, dst); |
| cvttps2dq(dst, dst); |
| xorps(dst, tmp); |
| } else { |
| movaps(tmp, op); |
| cmpleps(tmp, src); |
| cvttps2dq(dst, src); |
| xorps(dst, tmp); |
| movaps(tmp, src); |
| cmpeqps(tmp, tmp); |
| andps(dst, tmp); |
| } |
| } |
| } |
| |
| void I32x4TruncSatF64x2SZero(XMMRegister dst, XMMRegister src, |
| XMMRegister scratch, Register tmp) { |
| ASM_CODE_COMMENT(this); |
| if (CpuFeatures::IsSupported(AVX)) { |
| CpuFeatureScope avx_scope(this, AVX); |
| XMMRegister original_dst = dst; |
| // Make sure we don't overwrite src. |
| if (dst == src) { |
| DCHECK_NE(src, scratch); |
| dst = scratch; |
| } |
| // dst = 0 if src == NaN, else all ones. |
| vcmpeqpd(dst, src, src); |
| // dst = 0 if src == NaN, else INT32_MAX as double. |
| vandpd( |
| dst, dst, |
| ExternalReferenceAsOperand( |
| ExternalReference::address_of_wasm_int32_max_as_double(), tmp)); |
| // dst = 0 if src == NaN, src is saturated to INT32_MAX as double. |
| vminpd(dst, src, dst); |
| // Values > INT32_MAX are already saturated; values < INT32_MIN raise an |
| // exception, which is masked and returns 0x80000000. |
| vcvttpd2dq(original_dst, dst); |
| } else { |
| if (dst != src) { |
| movaps(dst, src); |
| } |
| movaps(scratch, dst); |
| cmpeqpd(scratch, dst); |
| andps(scratch, |
| ExternalReferenceAsOperand( |
| ExternalReference::address_of_wasm_int32_max_as_double(), tmp)); |
| minpd(dst, scratch); |
| cvttpd2dq(dst, dst); |
| } |
| } |
| |
| void I32x4TruncSatF64x2UZero(XMMRegister dst, XMMRegister src, |
| XMMRegister scratch, Register tmp) { |
| ASM_CODE_COMMENT(this); |
| if (CpuFeatures::IsSupported(AVX)) { |
| CpuFeatureScope avx_scope(this, AVX); |
| vxorpd(scratch, scratch, scratch); |
| // Saturate to 0. |
| vmaxpd(dst, src, scratch); |
| // Saturate to UINT32_MAX. |
| vminpd( |
| dst, dst, |
| ExternalReferenceAsOperand( |
| ExternalReference::address_of_wasm_uint32_max_as_double(), tmp)); |
| // Truncate. |
| vroundpd(dst, dst, kRoundToZero); |
| // Add to special double where significand bits == uint32. |
| vaddpd(dst, dst, |
| ExternalReferenceAsOperand( |
| ExternalReference::address_of_wasm_double_2_power_52(), tmp)); |
| // Extract low 32 bits of each double's significand, zero top lanes. |
| // dst = [dst[0], dst[2], 0, 0] |
| vshufps(dst, dst, scratch, 0x88); |
| } else { |
| CpuFeatureScope scope(this, SSE4_1); |
| if (dst != src) { |
| movaps(dst, src); |
| } |
| xorps(scratch, scratch); |
| maxpd(dst, scratch); |
| minpd(dst, ExternalReferenceAsOperand( |
| ExternalReference::address_of_wasm_uint32_max_as_double(), |
| tmp)); |
| roundpd(dst, dst, kRoundToZero); |
| addpd(dst, |
| ExternalReferenceAsOperand( |
| ExternalReference::address_of_wasm_double_2_power_52(), tmp)); |
| shufps(dst, scratch, 0x88); |
| } |
| } |
| |
| void I32x4TruncF64x2UZero(XMMRegister dst, XMMRegister src, Register tmp, |
| XMMRegister scratch) { |
| // TODO(zhin): call this from I32x4TruncSatF64x2UZero. |
| ASM_CODE_COMMENT(this); |
| if (dst != src && !CpuFeatures::IsSupported(AVX)) { |
| movaps(dst, src); |
| src = dst; |
| } |
| // Same as I32x4TruncSatF64x2UZero but without the saturation. |
| Roundpd(dst, src, kRoundToZero); |
| // Add to special double where significand bits == uint32. |
| Addpd(dst, dst, |
| ExternalReferenceAsOperand( |
| ExternalReference::address_of_wasm_double_2_power_52(), tmp)); |
| // Extract low 32 bits of each double's significand, zero top lanes. |
| // dst = [dst[0], dst[2], 0, 0] |
| Shufps(dst, dst, scratch, 0x88); |
| } |
| |
| void I32x4TruncF32x4U(XMMRegister dst, XMMRegister src, Register scratch, |
| XMMRegister tmp) { |
| ASM_CODE_COMMENT(this); |
| Operand int32_overflow_op = ExternalReferenceAsOperand( |
| ExternalReference::address_of_wasm_int32_overflow_as_float(), scratch); |
| if (CpuFeatures::IsSupported(AVX)) { |
| CpuFeatureScope avx_scope(this, AVX); |
| vcmpltps(tmp, src, int32_overflow_op); |
| } else { |
| movaps(tmp, src); |
| cmpltps(tmp, int32_overflow_op); |
| } |
| // In tmp, lanes < INT32_MAX are left alone, other lanes are zeroed. |
| Pand(tmp, src); |
| // tmp = src with all the valid conversions |
| if (dst != src) { |
| Movaps(dst, src); |
| } |
| // In dst, lanes < INT32_MAX are zeroed, other lanes left alone. |
| Pxor(dst, tmp); |
| // tmp contains only lanes which can be converted correctly (<INT32_MAX) |
| Cvttps2dq(tmp, tmp); |
| // Bit-trick follows: |
| // All integers from INT32_MAX to UINT32_MAX that are representable as |
| // floats lie between [0x4f00'0000,0x4f80'0000). |
| // The bit representation of the integers is actually shifted right by 8. |
| // For example given 2147483904.0f (which fits in UINT32_MAX): |
| // |
| // 01001111 00000000 00000000 00000001 (float 0x4f00'0001) |
| //          ^^^^^^^^^^^^^^^^^^^^^^^^^^ |
| // these 24 bits, with their top bit flipped, are exactly the top 24 bits |
| // of the int representation: |
| // 10000000 00000000 00000001 00000000 (int 0x8000'0100) |
| // |
| // So what needs to be done is to flip bit 23, the lowest bit of the |
| // exponent, which is equivalent to multiplying the float by 2 (here done by |
| // addps-ing the value to itself). |
| Addps(dst, dst, dst); |
| // Then shift to get the bit representation of the int. |
| Pslld(dst, byte{8}); |
| // Merge the converted lanes and bit shifted lanes. |
| Paddd(dst, tmp); |
| } |
| |
| void I32x4ExtAddPairwiseI16x8S(XMMRegister dst, XMMRegister src, |
| Register scratch) { |
| ASM_CODE_COMMENT(this); |
| Operand op = ExternalReferenceAsOperand( |
| ExternalReference::address_of_wasm_i16x8_splat_0x0001(), scratch); |
| // pmaddwd multiplies signed words in src and op, producing |
| // signed doublewords, then adds pairwise. |
| // src = |a|b|c|d|e|f|g|h| |
| // dst = | a*1 + b*1 | c*1 + d*1 | e*1 + f*1 | g*1 + h*1 | |
| if (!CpuFeatures::IsSupported(AVX) && (dst != src)) { |
| movaps(dst, src); |
| src = dst; |
| } |
| |
| Pmaddwd(dst, src, op); |
| } |
| |
| void I16x8ExtAddPairwiseI8x16S(XMMRegister dst, XMMRegister src, |
| XMMRegister scratch, Register tmp) { |
| ASM_CODE_COMMENT(this); |
| // pmaddubsw treats its first operand as unsigned and its second as signed; |
| // since src must be treated as signed, pass the external reference (the |
| // 0x01 splat) as the first operand. |
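| // For example, for one pair of adjacent byte lanes {a, b} in src, |
| // pmaddubsw with the 0x01 splat computes 1 * a + 1 * b with a and b |
| // treated as signed bytes, i.e. the signed 16-bit pairwise sum. |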
| Operand op = ExternalReferenceAsOperand( |
| ExternalReference::address_of_wasm_i8x16_splat_0x01(), tmp); |
| if (CpuFeatures::IsSupported(AVX)) { |
| CpuFeatureScope avx_scope(this, AVX); |
| vmovdqa(scratch, op); |
| vpmaddubsw(dst, scratch, src); |
| } else { |
| CpuFeatureScope sse_scope(this, SSSE3); |
| if (dst == src) { |
| movaps(scratch, op); |
| pmaddubsw(scratch, src); |
| movaps(dst, scratch); |
| } else { |
| movaps(dst, op); |
| pmaddubsw(dst, src); |
| } |
| } |
| } |
| |
| void I16x8ExtAddPairwiseI8x16U(XMMRegister dst, XMMRegister src, |
| Register scratch) { |
| ASM_CODE_COMMENT(this); |
| Operand op = ExternalReferenceAsOperand( |
| ExternalReference::address_of_wasm_i8x16_splat_0x01(), scratch); |
| if (CpuFeatures::IsSupported(AVX)) { |
| CpuFeatureScope avx_scope(this, AVX); |
| vpmaddubsw(dst, src, op); |
| } else { |
| CpuFeatureScope sse_scope(this, SSSE3); |
| if (dst != src) { |
| movaps(dst, src); |
| } |
| pmaddubsw(dst, op); |
| } |
| } |
| |
| void I8x16Swizzle(XMMRegister dst, XMMRegister src, XMMRegister mask, |
| XMMRegister scratch, Register tmp, bool omit_add = false) { |
| ASM_CODE_COMMENT(this); |
| if (omit_add) { |
| // We have determined that the indices are immediates, and they are either |
| // within bounds, or the top bit is set, so we can omit the add. |
| Pshufb(dst, src, mask); |
| return; |
| } |
| |
| // Out-of-range indices should return 0. Add 112 so that any index > 15 ends |
| // up with its top bit set (>= 128 after the unsigned saturating add), which |
| // makes pshufb zero that lane. |
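| // For example, index 5 becomes 5 + 112 = 117 (< 128, low nibble still 5), |
| // while index 16 becomes 16 + 112 = 128 (top bit set, lane is zeroed). |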
| Operand op = ExternalReferenceAsOperand( |
| ExternalReference::address_of_wasm_i8x16_swizzle_mask(), tmp); |
| if (CpuFeatures::IsSupported(AVX)) { |
| CpuFeatureScope avx_scope(this, AVX); |
| vpaddusb(scratch, mask, op); |
| vpshufb(dst, src, scratch); |
| } else { |
| CpuFeatureScope sse_scope(this, SSSE3); |
| movaps(scratch, op); |
| if (dst != src) { |
| DCHECK_NE(dst, mask); |
| movaps(dst, src); |
| } |
| paddusb(scratch, mask); |
| pshufb(dst, scratch); |
| } |
| } |
| |
| void I8x16Popcnt(XMMRegister dst, XMMRegister src, XMMRegister tmp1, |
| XMMRegister tmp2, Register scratch) { |
| ASM_CODE_COMMENT(this); |
| DCHECK_NE(dst, tmp1); |
| DCHECK_NE(src, tmp1); |
| DCHECK_NE(dst, tmp2); |
| DCHECK_NE(src, tmp2); |
| if (CpuFeatures::IsSupported(AVX)) { |
| CpuFeatureScope avx_scope(this, AVX); |
| vmovdqa(tmp1, ExternalReferenceAsOperand( |
| ExternalReference::address_of_wasm_i8x16_splat_0x0f(), |
| scratch)); |
| vpandn(tmp2, tmp1, src); |
| vpand(dst, tmp1, src); |
| vmovdqa(tmp1, ExternalReferenceAsOperand( |
| ExternalReference::address_of_wasm_i8x16_popcnt_mask(), |
| scratch)); |
| vpsrlw(tmp2, tmp2, 4); |
| vpshufb(dst, tmp1, dst); |
| vpshufb(tmp2, tmp1, tmp2); |
| vpaddb(dst, dst, tmp2); |
| } else if (CpuFeatures::IsSupported(INTEL_ATOM)) { |
| // Pre-Goldmont low-power Intel microarchitectures have a very slow |
| // PSHUFB instruction, so use a PSHUFB-free divide-and-conquer algorithm on |
| // these processors. The ATOM CPU feature captures exactly the right set of |
| // processors. |
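| // The sequence below is the classic byte-wise divide-and-conquer popcount: |
| //   x -= (x >> 1) & 0x55;                 // 2-bit counts |
| //   x  = (x & 0x33) + ((x >> 2) & 0x33);  // 4-bit counts |
| //   x  = (x + (x >> 4)) & 0x0f;           // 8-bit counts |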
| movaps(tmp1, src); |
| psrlw(tmp1, 1); |
| if (dst != src) { |
| movaps(dst, src); |
| } |
| andps(tmp1, ExternalReferenceAsOperand( |
| ExternalReference::address_of_wasm_i8x16_splat_0x55(), |
| scratch)); |
| psubb(dst, tmp1); |
| Operand splat_0x33 = ExternalReferenceAsOperand( |
| ExternalReference::address_of_wasm_i8x16_splat_0x33(), scratch); |
| movaps(tmp1, dst); |
| andps(dst, splat_0x33); |
| psrlw(tmp1, 2); |
| andps(tmp1, splat_0x33); |
| paddb(dst, tmp1); |
| movaps(tmp1, dst); |
| psrlw(dst, 4); |
| paddb(dst, tmp1); |
| andps(dst, ExternalReferenceAsOperand( |
| ExternalReference::address_of_wasm_i8x16_splat_0x0f(), |
| scratch)); |
| } else { |
| CpuFeatureScope sse_scope(this, SSSE3); |
| movaps(tmp1, ExternalReferenceAsOperand( |
| ExternalReference::address_of_wasm_i8x16_splat_0x0f(), |
| scratch)); |
| Operand mask = ExternalReferenceAsOperand( |
| ExternalReference::address_of_wasm_i8x16_popcnt_mask(), scratch); |
| if (tmp2 != tmp1) { |
| movaps(tmp2, tmp1); |
| } |
| andps(tmp1, src); |
| andnps(tmp2, src); |
| psrlw(tmp2, 4); |
| movaps(dst, mask); |
| pshufb(dst, tmp1); |
| movaps(tmp1, mask); |
| pshufb(tmp1, tmp2); |
| paddb(dst, tmp1); |
| } |
| } |
| |
| private: |
| // All implementation-specific methods must be called through this. |
| Impl* impl() { return static_cast<Impl*>(this); } |
| |
| Operand ExternalReferenceAsOperand(ExternalReference reference, |
| Register scratch) { |
| return impl()->ExternalReferenceAsOperand(reference, scratch); |
| } |
| |
| using FloatInstruction = void (SharedTurboAssembler::*)(XMMRegister, |
| XMMRegister, Operand); |
| void FloatUnop(XMMRegister dst, XMMRegister src, Register tmp, |
| FloatInstruction op, ExternalReference ext) { |
| if (!CpuFeatures::IsSupported(AVX) && (dst != src)) { |
| movaps(dst, src); |
| src = dst; |
| } |
| SharedTurboAssembler* assm = this; |
| (assm->*op)(dst, src, ExternalReferenceAsOperand(ext, tmp)); |
| } |
| }; |
| |
| } // namespace internal |
| } // namespace v8 |
| #endif // V8_CODEGEN_SHARED_IA32_X64_MACRO_ASSEMBLER_SHARED_IA32_X64_H_ |