| // Copyright 2021 the V8 project authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #include "src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h" |
| |
| #include "src/codegen/assembler.h" |
| #include "src/codegen/cpu-features.h" |
| #include "src/codegen/register.h" |
| |
| #if V8_TARGET_ARCH_IA32 |
| #include "src/codegen/ia32/register-ia32.h" |
| #elif V8_TARGET_ARCH_X64 |
| #include "src/codegen/x64/register-x64.h" |
| #else |
| #error Unsupported target architecture. |
| #endif |
| |
| // On IA32, an Operand can be a wrapper for a single register; in that case |
| // callers should use the overload that takes |src| as a Register (e.g. |
| // I8x16Splat) instead. |
| #if V8_TARGET_ARCH_IA32 |
| #define DCHECK_OPERAND_IS_NOT_REG(op) DCHECK(!op.is_reg_only()); |
| #else |
| #define DCHECK_OPERAND_IS_NOT_REG(op) |
| #endif |
| |
| namespace v8 { |
| namespace internal { |
| |
| void SharedTurboAssembler::Move(Register dst, uint32_t src) { |
| // Helper to paper over the different assembler function names. |
| #if V8_TARGET_ARCH_IA32 |
| mov(dst, Immediate(src)); |
| #elif V8_TARGET_ARCH_X64 |
| movl(dst, Immediate(src)); |
| #else |
| #error Unsupported target architecture. |
| #endif |
| } |
| |
| void SharedTurboAssembler::Move(Register dst, Register src) { |
| // Helper to paper over the different assembler function names. |
| if (dst != src) { |
| #if V8_TARGET_ARCH_IA32 |
| mov(dst, src); |
| #elif V8_TARGET_ARCH_X64 |
| movq(dst, src); |
| #else |
| #error Unsupported target architecture. |
| #endif |
| } |
| } |
| |
| void SharedTurboAssembler::Add(Register dst, Immediate src) { |
| // Helper to paper over the different assembler function names. |
| #if V8_TARGET_ARCH_IA32 |
| add(dst, src); |
| #elif V8_TARGET_ARCH_X64 |
| addq(dst, src); |
| #else |
| #error Unsupported target architecture. |
| #endif |
| } |
| |
| void SharedTurboAssembler::And(Register dst, Immediate src) { |
| // Helper to paper over the different assembler function names. |
| #if V8_TARGET_ARCH_IA32 |
| and_(dst, src); |
| #elif V8_TARGET_ARCH_X64 |
| andq(dst, src); |
| #else |
| #error Unsupported target architecture. |
| #endif |
| } |
| |
| void SharedTurboAssembler::Movhps(XMMRegister dst, XMMRegister src1, |
| Operand src2) { |
| if (CpuFeatures::IsSupported(AVX)) { |
| CpuFeatureScope scope(this, AVX); |
| vmovhps(dst, src1, src2); |
| } else { |
| if (dst != src1) { |
| movaps(dst, src1); |
| } |
| movhps(dst, src2); |
| } |
| } |
| |
| void SharedTurboAssembler::Movlps(XMMRegister dst, XMMRegister src1, |
| Operand src2) { |
| if (CpuFeatures::IsSupported(AVX)) { |
| CpuFeatureScope scope(this, AVX); |
| vmovlps(dst, src1, src2); |
| } else { |
| if (dst != src1) { |
| movaps(dst, src1); |
| } |
| movlps(dst, src2); |
| } |
| } |
| |
| void SharedTurboAssembler::Pblendvb(XMMRegister dst, XMMRegister src1, |
| XMMRegister src2, XMMRegister mask) { |
| if (CpuFeatures::IsSupported(AVX)) { |
| CpuFeatureScope scope(this, AVX); |
| vpblendvb(dst, src1, src2, mask); |
| } else { |
| CpuFeatureScope scope(this, SSE4_1); |
| DCHECK_EQ(mask, xmm0); |
| DCHECK_EQ(dst, src1); |
| pblendvb(dst, src2); |
| } |
| } |
| |
| void SharedTurboAssembler::Shufps(XMMRegister dst, XMMRegister src1, |
| XMMRegister src2, uint8_t imm8) { |
| if (CpuFeatures::IsSupported(AVX)) { |
| CpuFeatureScope avx_scope(this, AVX); |
| vshufps(dst, src1, src2, imm8); |
| } else { |
| if (dst != src1) { |
| movaps(dst, src1); |
| } |
| shufps(dst, src2, imm8); |
| } |
| } |
| |
| void SharedTurboAssembler::F64x2ExtractLane(DoubleRegister dst, XMMRegister src, |
| uint8_t lane) { |
| ASM_CODE_COMMENT(this); |
| if (lane == 0) { |
| if (dst != src) { |
| Movaps(dst, src); |
| } |
| } else { |
| DCHECK_EQ(1, lane); |
| if (CpuFeatures::IsSupported(AVX)) { |
| CpuFeatureScope avx_scope(this, AVX); |
| // Pass src for both source operands to avoid a false dependency on dst. |
| vmovhlps(dst, src, src); |
| } else { |
| movhlps(dst, src); |
| } |
| } |
| } |
| |
| void SharedTurboAssembler::F64x2ReplaceLane(XMMRegister dst, XMMRegister src, |
| DoubleRegister rep, uint8_t lane) { |
| ASM_CODE_COMMENT(this); |
| if (CpuFeatures::IsSupported(AVX)) { |
| CpuFeatureScope scope(this, AVX); |
| if (lane == 0) { |
| vmovsd(dst, src, rep); |
| } else { |
| vmovlhps(dst, src, rep); |
| } |
| } else { |
| CpuFeatureScope scope(this, SSE4_1); |
| if (dst != src) { |
| DCHECK_NE(dst, rep); // Ensure rep is not overwritten. |
| movaps(dst, src); |
| } |
| if (lane == 0) { |
| movsd(dst, rep); |
| } else { |
| movlhps(dst, rep); |
| } |
| } |
| } |
| |
| void SharedTurboAssembler::F32x4Min(XMMRegister dst, XMMRegister lhs, |
| XMMRegister rhs, XMMRegister scratch) { |
| ASM_CODE_COMMENT(this); |
| // The minps instruction doesn't propagate NaNs and +0's in its first |
| // operand. Perform minps in both orders, merge the results, and adjust. |
| if (CpuFeatures::IsSupported(AVX)) { |
| CpuFeatureScope scope(this, AVX); |
| vminps(scratch, lhs, rhs); |
| vminps(dst, rhs, lhs); |
| } else if (dst == lhs || dst == rhs) { |
| XMMRegister src = dst == lhs ? rhs : lhs; |
| movaps(scratch, src); |
| minps(scratch, dst); |
| minps(dst, src); |
| } else { |
| movaps(scratch, lhs); |
| minps(scratch, rhs); |
| movaps(dst, rhs); |
| minps(dst, lhs); |
| } |
| // Propagate -0's and NaNs, which may be non-canonical. |
| Orps(scratch, dst); |
| // Canonicalize NaNs by quieting and clearing the payload. |
| Cmpunordps(dst, dst, scratch); |
| Orps(scratch, dst); |
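| // dst holds the cmpunordps mask (all-ones in NaN lanes). Shifting it right |
| // by 10 and combining with andnps keeps only the sign, the 8 exponent bits |
| // and the quiet bit of those lanes, clearing the 22 payload bits. |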
| Psrld(dst, dst, byte{10}); |
| Andnps(dst, dst, scratch); |
| } |
| |
| void SharedTurboAssembler::F32x4Max(XMMRegister dst, XMMRegister lhs, |
| XMMRegister rhs, XMMRegister scratch) { |
| ASM_CODE_COMMENT(this); |
| // The maxps instruction doesn't propagate NaNs and +0's in its first |
| // operand. Perform maxps in both orders, merge the results, and adjust. |
| if (CpuFeatures::IsSupported(AVX)) { |
| CpuFeatureScope scope(this, AVX); |
| vmaxps(scratch, lhs, rhs); |
| vmaxps(dst, rhs, lhs); |
| } else if (dst == lhs || dst == rhs) { |
| XMMRegister src = dst == lhs ? rhs : lhs; |
| movaps(scratch, src); |
| maxps(scratch, dst); |
| maxps(dst, src); |
| } else { |
| movaps(scratch, lhs); |
| maxps(scratch, rhs); |
| movaps(dst, rhs); |
| maxps(dst, lhs); |
| } |
| // Find discrepancies. |
| Xorps(dst, scratch); |
| // Propagate NaNs, which may be non-canonical. |
| Orps(scratch, dst); |
| // Propagate sign discrepancy and (subtle) quiet NaNs. |
| Subps(scratch, scratch, dst); |
| // Canonicalize NaNs by clearing the payload. Sign is non-deterministic. |
| Cmpunordps(dst, dst, scratch); |
| Psrld(dst, dst, byte{10}); |
| Andnps(dst, dst, scratch); |
| } |
| |
| void SharedTurboAssembler::F64x2Min(XMMRegister dst, XMMRegister lhs, |
| XMMRegister rhs, XMMRegister scratch) { |
| ASM_CODE_COMMENT(this); |
| if (CpuFeatures::IsSupported(AVX)) { |
| CpuFeatureScope scope(this, AVX); |
| // The minpd instruction doesn't propagate NaNs and +0's in its first |
| // operand. Perform minpd in both orders, merge the results, and adjust. |
| vminpd(scratch, lhs, rhs); |
| vminpd(dst, rhs, lhs); |
| // Propagate -0's and NaNs, which may be non-canonical. |
| vorpd(scratch, scratch, dst); |
| // Canonicalize NaNs by quieting and clearing the payload. |
| vcmpunordpd(dst, dst, scratch); |
| vorpd(scratch, scratch, dst); |
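| // 13 == 1 sign bit + 11 exponent bits + 1 quiet bit; the vandnpd below |
| // clears the 51 payload bits of NaN lanes, leaving 0xFFF8'0000'0000'0000. |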
| vpsrlq(dst, dst, byte{13}); |
| vandnpd(dst, dst, scratch); |
| } else { |
| // Compute minpd in both operand orders, leaving the results in scratch and |
| // dst. If dst overlaps with lhs or rhs, we can save a move. |
| if (dst == lhs || dst == rhs) { |
| XMMRegister src = dst == lhs ? rhs : lhs; |
| movaps(scratch, src); |
| minpd(scratch, dst); |
| minpd(dst, src); |
| } else { |
| movaps(scratch, lhs); |
| movaps(dst, rhs); |
| minpd(scratch, rhs); |
| minpd(dst, lhs); |
| } |
| orpd(scratch, dst); |
| cmpunordpd(dst, scratch); |
| orpd(scratch, dst); |
| psrlq(dst, byte{13}); |
| andnpd(dst, scratch); |
| } |
| } |
| |
| void SharedTurboAssembler::F64x2Max(XMMRegister dst, XMMRegister lhs, |
| XMMRegister rhs, XMMRegister scratch) { |
| ASM_CODE_COMMENT(this); |
| if (CpuFeatures::IsSupported(AVX)) { |
| CpuFeatureScope scope(this, AVX); |
| // The maxpd instruction doesn't propagate NaNs and +0's in its first |
| // operand. Perform maxpd in both orders, merge the results, and adjust. |
| vmaxpd(scratch, lhs, rhs); |
| vmaxpd(dst, rhs, lhs); |
| // Find discrepancies. |
| vxorpd(dst, dst, scratch); |
| // Propagate NaNs, which may be non-canonical. |
| vorpd(scratch, scratch, dst); |
| // Propagate sign discrepancy and (subtle) quiet NaNs. |
| vsubpd(scratch, scratch, dst); |
| // Canonicalize NaNs by clearing the payload. Sign is non-deterministic. |
| vcmpunordpd(dst, dst, scratch); |
| vpsrlq(dst, dst, byte{13}); |
| vandnpd(dst, dst, scratch); |
| } else { |
| if (dst == lhs || dst == rhs) { |
| XMMRegister src = dst == lhs ? rhs : lhs; |
| movaps(scratch, src); |
| maxpd(scratch, dst); |
| maxpd(dst, src); |
| } else { |
| movaps(scratch, lhs); |
| movaps(dst, rhs); |
| maxpd(scratch, rhs); |
| maxpd(dst, lhs); |
| } |
| xorpd(dst, scratch); |
| orpd(scratch, dst); |
| subpd(scratch, dst); |
| cmpunordpd(dst, scratch); |
| psrlq(dst, byte{13}); |
| andnpd(dst, scratch); |
| } |
| } |
| |
| void SharedTurboAssembler::F32x4Splat(XMMRegister dst, DoubleRegister src) { |
| ASM_CODE_COMMENT(this); |
| if (CpuFeatures::IsSupported(AVX2)) { |
| CpuFeatureScope avx2_scope(this, AVX2); |
| vbroadcastss(dst, src); |
| } else if (CpuFeatures::IsSupported(AVX)) { |
| CpuFeatureScope avx_scope(this, AVX); |
| vshufps(dst, src, src, 0); |
| } else { |
| if (dst == src) { |
| // 1 byte shorter than pshufd. |
| shufps(dst, src, 0); |
| } else { |
| pshufd(dst, src, 0); |
| } |
| } |
| } |
| |
| void SharedTurboAssembler::F32x4ExtractLane(FloatRegister dst, XMMRegister src, |
| uint8_t lane) { |
| ASM_CODE_COMMENT(this); |
| DCHECK_LT(lane, 4); |
| // These instructions are shorter than insertps, but will leave junk in |
| // the top lanes of dst. |
| if (lane == 0) { |
| if (dst != src) { |
| Movaps(dst, src); |
| } |
| } else if (lane == 1) { |
| Movshdup(dst, src); |
| } else if (lane == 2 && dst == src) { |
| // movhlps leaves dst's upper half untouched, so use it only when dst == src |
| // to avoid a false dependency on dst's previous value. |
| Movhlps(dst, src); |
| } else if (dst == src) { |
| Shufps(dst, src, src, lane); |
| } else { |
| Pshufd(dst, src, lane); |
| } |
| } |
| |
| void SharedTurboAssembler::S128Store32Lane(Operand dst, XMMRegister src, |
| uint8_t laneidx) { |
| ASM_CODE_COMMENT(this); |
| if (laneidx == 0) { |
| Movss(dst, src); |
| } else { |
| DCHECK_GE(3, laneidx); |
| Extractps(dst, src, laneidx); |
| } |
| } |
| |
| template <typename Op> |
| void SharedTurboAssembler::I8x16SplatPreAvx2(XMMRegister dst, Op src, |
| XMMRegister scratch) { |
| ASM_CODE_COMMENT(this); |
| DCHECK(!CpuFeatures::IsSupported(AVX2)); |
| CpuFeatureScope ssse3_scope(this, SSSE3); |
| Movd(dst, src); |
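| // With an all-zero shuffle mask (scratch), pshufb copies byte 0 of dst into |
| // every byte lane, broadcasting the splatted value. |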
| Xorps(scratch, scratch); |
| Pshufb(dst, scratch); |
| } |
| |
| void SharedTurboAssembler::I8x16Splat(XMMRegister dst, Register src, |
| XMMRegister scratch) { |
| ASM_CODE_COMMENT(this); |
| if (CpuFeatures::IsSupported(AVX2)) { |
| CpuFeatureScope avx2_scope(this, AVX2); |
| Movd(scratch, src); |
| vpbroadcastb(dst, scratch); |
| } else { |
| I8x16SplatPreAvx2(dst, src, scratch); |
| } |
| } |
| |
| void SharedTurboAssembler::I8x16Splat(XMMRegister dst, Operand src, |
| XMMRegister scratch) { |
| ASM_CODE_COMMENT(this); |
| DCHECK_OPERAND_IS_NOT_REG(src); |
| if (CpuFeatures::IsSupported(AVX2)) { |
| CpuFeatureScope avx2_scope(this, AVX2); |
| vpbroadcastb(dst, src); |
| } else { |
| I8x16SplatPreAvx2(dst, src, scratch); |
| } |
| } |
| |
| void SharedTurboAssembler::I8x16Shl(XMMRegister dst, XMMRegister src1, |
| uint8_t src2, Register tmp1, |
| XMMRegister tmp2) { |
| ASM_CODE_COMMENT(this); |
| DCHECK_NE(dst, tmp2); |
| // Perform 16-bit shift, then mask away low bits. |
| if (!CpuFeatures::IsSupported(AVX) && (dst != src1)) { |
| movaps(dst, src1); |
| src1 = dst; |
| } |
| |
| uint8_t shift = truncate_to_int3(src2); |
| Psllw(dst, src1, byte{shift}); |
| |
| uint8_t bmask = static_cast<uint8_t>(0xff << shift); |
| uint32_t mask = bmask << 24 | bmask << 16 | bmask << 8 | bmask; |
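| // e.g. shift == 3: bmask == 0xF8, so the AND below clears the low bits that |
| // psllw shifted into each high byte from the byte below it in the same lane. |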
| Move(tmp1, mask); |
| Movd(tmp2, tmp1); |
| Pshufd(tmp2, tmp2, uint8_t{0}); |
| Pand(dst, tmp2); |
| } |
| |
| void SharedTurboAssembler::I8x16Shl(XMMRegister dst, XMMRegister src1, |
| Register src2, Register tmp1, |
| XMMRegister tmp2, XMMRegister tmp3) { |
| ASM_CODE_COMMENT(this); |
| DCHECK(!AreAliased(dst, tmp2, tmp3)); |
| DCHECK(!AreAliased(src1, tmp2, tmp3)); |
| |
| // Take shift value modulo 8. |
| Move(tmp1, src2); |
| And(tmp1, Immediate(7)); |
| Add(tmp1, Immediate(8)); |
| // Create a mask to unset high bits. |
| Movd(tmp3, tmp1); |
| Pcmpeqd(tmp2, tmp2); |
| Psrlw(tmp2, tmp2, tmp3); |
| Packuswb(tmp2, tmp2); |
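| // tmp2 now holds (0xFF >> shift) in every byte: psrlw by shift + 8 leaves |
| // that value in the low byte of each word, and packuswb replicates it. |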
| if (!CpuFeatures::IsSupported(AVX) && (dst != src1)) { |
| movaps(dst, src1); |
| src1 = dst; |
| } |
| // Mask off the unwanted bits before word-shifting. |
| Pand(dst, src1, tmp2); |
| Add(tmp1, Immediate(-8)); |
| Movd(tmp3, tmp1); |
| Psllw(dst, dst, tmp3); |
| } |
| |
| void SharedTurboAssembler::I8x16ShrS(XMMRegister dst, XMMRegister src1, |
| uint8_t src2, XMMRegister tmp) { |
| ASM_CODE_COMMENT(this); |
| // Unpack bytes into words, do word (16-bit) shifts, and repack. |
| DCHECK_NE(dst, tmp); |
| uint8_t shift = truncate_to_int3(src2) + 8; |
| |
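| // punpck*bw places each source byte in the high byte of a 16-bit lane (the |
| // low byte is junk), so shifting right by shift + 8 both discards the junk |
| // and sign-extends the byte. |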
| Punpckhbw(tmp, src1); |
| Punpcklbw(dst, src1); |
| Psraw(tmp, shift); |
| Psraw(dst, shift); |
| Packsswb(dst, tmp); |
| } |
| |
| void SharedTurboAssembler::I8x16ShrS(XMMRegister dst, XMMRegister src1, |
| Register src2, Register tmp1, |
| XMMRegister tmp2, XMMRegister tmp3) { |
| ASM_CODE_COMMENT(this); |
| DCHECK(!AreAliased(dst, tmp2, tmp3)); |
| DCHECK_NE(src1, tmp2); |
| |
| // Unpack the bytes into words, do arithmetic shifts, and repack. |
| Punpckhbw(tmp2, src1); |
| Punpcklbw(dst, src1); |
| // Prepare shift value |
| Move(tmp1, src2); |
| // Take shift value modulo 8. |
| And(tmp1, Immediate(7)); |
| Add(tmp1, Immediate(8)); |
| Movd(tmp3, tmp1); |
| Psraw(tmp2, tmp3); |
| Psraw(dst, tmp3); |
| Packsswb(dst, tmp2); |
| } |
| |
| void SharedTurboAssembler::I8x16ShrU(XMMRegister dst, XMMRegister src1, |
| uint8_t src2, Register tmp1, |
| XMMRegister tmp2) { |
| ASM_CODE_COMMENT(this); |
| DCHECK_NE(dst, tmp2); |
| if (!CpuFeatures::IsSupported(AVX) && (dst != src1)) { |
| movaps(dst, src1); |
| src1 = dst; |
| } |
| |
| // Perform 16-bit shift, then mask away high bits. |
| uint8_t shift = truncate_to_int3(src2); |
| Psrlw(dst, src1, shift); |
| |
| uint8_t bmask = 0xff >> shift; |
| uint32_t mask = bmask << 24 | bmask << 16 | bmask << 8 | bmask; |
| Move(tmp1, mask); |
| Movd(tmp2, tmp1); |
| Pshufd(tmp2, tmp2, byte{0}); |
| Pand(dst, tmp2); |
| } |
| |
| void SharedTurboAssembler::I8x16ShrU(XMMRegister dst, XMMRegister src1, |
| Register src2, Register tmp1, |
| XMMRegister tmp2, XMMRegister tmp3) { |
| ASM_CODE_COMMENT(this); |
| DCHECK(!AreAliased(dst, tmp2, tmp3)); |
| DCHECK_NE(src1, tmp2); |
| |
| // Unpack the bytes into words, do logical shifts, and repack. |
| Punpckhbw(tmp2, src1); |
| Punpcklbw(dst, src1); |
| // Prepare shift value. |
| Move(tmp1, src2); |
| // Take shift value modulo 8. |
| And(tmp1, Immediate(7)); |
| Add(tmp1, Immediate(8)); |
| Movd(tmp3, tmp1); |
| Psrlw(tmp2, tmp3); |
| Psrlw(dst, tmp3); |
| Packuswb(dst, tmp2); |
| } |
| |
| template <typename Op> |
| void SharedTurboAssembler::I16x8SplatPreAvx2(XMMRegister dst, Op src) { |
| DCHECK(!CpuFeatures::IsSupported(AVX2)); |
| Movd(dst, src); |
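| // Pshuflw broadcasts word 0 across the low four words; punpcklqdq then |
| // duplicates the low qword into the high half. |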
| Pshuflw(dst, dst, uint8_t{0x0}); |
| Punpcklqdq(dst, dst); |
| } |
| |
| void SharedTurboAssembler::I16x8Splat(XMMRegister dst, Register src) { |
| ASM_CODE_COMMENT(this); |
| if (CpuFeatures::IsSupported(AVX2)) { |
| CpuFeatureScope avx2_scope(this, AVX2); |
| Movd(dst, src); |
| vpbroadcastw(dst, dst); |
| } else { |
| I16x8SplatPreAvx2(dst, src); |
| } |
| } |
| |
| void SharedTurboAssembler::I16x8Splat(XMMRegister dst, Operand src) { |
| ASM_CODE_COMMENT(this); |
| DCHECK_OPERAND_IS_NOT_REG(src); |
| if (CpuFeatures::IsSupported(AVX2)) { |
| CpuFeatureScope avx2_scope(this, AVX2); |
| vpbroadcastw(dst, src); |
| } else { |
| I16x8SplatPreAvx2(dst, src); |
| } |
| } |
| |
| void SharedTurboAssembler::I16x8ExtMulLow(XMMRegister dst, XMMRegister src1, |
| XMMRegister src2, XMMRegister scratch, |
| bool is_signed) { |
| ASM_CODE_COMMENT(this); |
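| // Sign- or zero-extend the low eight bytes of each input to 16-bit lanes; |
| // an 8x8-bit product always fits in 16 bits, so pmullw is exact. |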
| is_signed ? Pmovsxbw(scratch, src1) : Pmovzxbw(scratch, src1); |
| is_signed ? Pmovsxbw(dst, src2) : Pmovzxbw(dst, src2); |
| Pmullw(dst, scratch); |
| } |
| |
| void SharedTurboAssembler::I16x8ExtMulHighS(XMMRegister dst, XMMRegister src1, |
| XMMRegister src2, |
| XMMRegister scratch) { |
| ASM_CODE_COMMENT(this); |
| if (CpuFeatures::IsSupported(AVX)) { |
| CpuFeatureScope avx_scope(this, AVX); |
| vpunpckhbw(scratch, src1, src1); |
| vpsraw(scratch, scratch, 8); |
| vpunpckhbw(dst, src2, src2); |
| vpsraw(dst, dst, 8); |
| vpmullw(dst, dst, scratch); |
| } else { |
| if (dst != src1) { |
| movaps(dst, src1); |
| } |
| movaps(scratch, src2); |
| punpckhbw(dst, dst); |
| psraw(dst, 8); |
| punpckhbw(scratch, scratch); |
| psraw(scratch, 8); |
| pmullw(dst, scratch); |
| } |
| } |
| |
| void SharedTurboAssembler::I16x8ExtMulHighU(XMMRegister dst, XMMRegister src1, |
| XMMRegister src2, |
| XMMRegister scratch) { |
| ASM_CODE_COMMENT(this); |
| // The logic here is slightly complicated to handle all the cases of register |
| // aliasing. This allows flexibility for callers in TurboFan and Liftoff. |
| if (CpuFeatures::IsSupported(AVX)) { |
| CpuFeatureScope avx_scope(this, AVX); |
| if (src1 == src2) { |
| vpxor(scratch, scratch, scratch); |
| vpunpckhbw(dst, src1, scratch); |
| vpmullw(dst, dst, dst); |
| } else { |
| if (dst == src2) { |
| // We overwrite dst, then use src2, so swap src1 and src2. |
| std::swap(src1, src2); |
| } |
| vpxor(scratch, scratch, scratch); |
| vpunpckhbw(dst, src1, scratch); |
| vpunpckhbw(scratch, src2, scratch); |
| vpmullw(dst, dst, scratch); |
| } |
| } else { |
| if (src1 == src2) { |
| xorps(scratch, scratch); |
| if (dst != src1) { |
| movaps(dst, src1); |
| } |
| punpckhbw(dst, scratch); |
| pmullw(dst, dst); |
| } else { |
| // When dst == src1, nothing special needs to be done. |
| // When dst == src2, swap src1 and src2, since we overwrite dst. |
| // When dst is unique, copy src1 to dst first. |
| if (dst == src2) { |
| std::swap(src1, src2); |
| // Now, dst == src1. |
| } else if (dst != src1) { |
| // dst != src1 && dst != src2. |
| movaps(dst, src1); |
| } |
| xorps(scratch, scratch); |
| punpckhbw(dst, scratch); |
| punpckhbw(scratch, src2); |
| psrlw(scratch, 8); |
| pmullw(dst, scratch); |
| } |
| } |
| } |
| |
| void SharedTurboAssembler::I16x8SConvertI8x16High(XMMRegister dst, |
| XMMRegister src) { |
| ASM_CODE_COMMENT(this); |
| if (CpuFeatures::IsSupported(AVX)) { |
| CpuFeatureScope avx_scope(this, AVX); |
| // src = |a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p| (high) |
| // dst = |i|i|j|j|k|k|l|l|m|m|n|n|o|o|p|p| |
| vpunpckhbw(dst, src, src); |
| vpsraw(dst, dst, 8); |
| } else { |
| CpuFeatureScope sse_scope(this, SSE4_1); |
| if (dst == src) { |
| // 2 bytes shorter than pshufd, but has dependency on dst. |
| movhlps(dst, src); |
| pmovsxbw(dst, dst); |
| } else { |
| // No dependency on dst. |
| pshufd(dst, src, 0xEE); |
| pmovsxbw(dst, dst); |
| } |
| } |
| } |
| |
| void SharedTurboAssembler::I16x8UConvertI8x16High(XMMRegister dst, |
| XMMRegister src, |
| XMMRegister scratch) { |
| ASM_CODE_COMMENT(this); |
| if (CpuFeatures::IsSupported(AVX)) { |
| CpuFeatureScope avx_scope(this, AVX); |
| // tmp = |0|0|0|0|0|0|0|0 | 0|0|0|0|0|0|0|0| |
| // src = |a|b|c|d|e|f|g|h | i|j|k|l|m|n|o|p| |
| // dst = |0|a|0|b|0|c|0|d | 0|e|0|f|0|g|0|h| |
| XMMRegister tmp = dst == src ? scratch : dst; |
| vpxor(tmp, tmp, tmp); |
| vpunpckhbw(dst, src, tmp); |
| } else { |
| CpuFeatureScope sse_scope(this, SSE4_1); |
| if (dst == src) { |
| // xorps can be executed on more ports than pshufd. |
| xorps(scratch, scratch); |
| punpckhbw(dst, scratch); |
| } else { |
| // No dependency on dst. |
| pshufd(dst, src, 0xEE); |
| pmovzxbw(dst, dst); |
| } |
| } |
| } |
| |
| void SharedTurboAssembler::I16x8Q15MulRSatS(XMMRegister dst, XMMRegister src1, |
| XMMRegister src2, |
| XMMRegister scratch) { |
| ASM_CODE_COMMENT(this); |
| // k = i16x8.splat(0x8000) |
| Pcmpeqd(scratch, scratch); |
| Psllw(scratch, scratch, byte{15}); |
| |
| if (!CpuFeatures::IsSupported(AVX) && (dst != src1)) { |
| movaps(dst, src1); |
| src1 = dst; |
| } |
| |
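| // pmulhrsw matches Wasm's q15mulr_sat_s except for 0x8000 * 0x8000, where it |
| // yields 0x8000 instead of saturating; detect those lanes by comparing |
| // against the 0x8000 splat in scratch and flip them to 0x7FFF with pxor. |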
| Pmulhrsw(dst, src1, src2); |
| Pcmpeqw(scratch, dst); |
| Pxor(dst, scratch); |
| } |
| |
| void SharedTurboAssembler::I32x4ExtAddPairwiseI16x8U(XMMRegister dst, |
| XMMRegister src, |
| XMMRegister tmp) { |
| ASM_CODE_COMMENT(this); |
| if (CpuFeatures::IsSupported(AVX)) { |
| CpuFeatureScope avx_scope(this, AVX); |
| // src = |a|b|c|d|e|f|g|h| (low) |
| // tmp = |0|a|0|c|0|e|0|g| |
| vpsrld(tmp, src, 16); |
| // dst = |0|b|0|d|0|f|0|h| |
| vpblendw(dst, src, tmp, 0xAA); |
| // dst = |a+b|c+d|e+f|g+h| |
| vpaddd(dst, tmp, dst); |
| } else if (CpuFeatures::IsSupported(SSE4_1)) { |
| CpuFeatureScope sse_scope(this, SSE4_1); |
| // There is a potentially better lowering if we get rip-relative |
| // constants, see https://github.com/WebAssembly/simd/pull/380. |
| movaps(tmp, src); |
| psrld(tmp, 16); |
| if (dst != src) { |
| movaps(dst, src); |
| } |
| pblendw(dst, tmp, 0xAA); |
| paddd(dst, tmp); |
| } else { |
| // src = |a|b|c|d|e|f|g|h| |
| // tmp = i32x4.splat(0x0000FFFF) |
| pcmpeqd(tmp, tmp); |
| psrld(tmp, byte{16}); |
| // tmp =|0|b|0|d|0|f|0|h| |
| andps(tmp, src); |
| // dst = |0|a|0|c|0|e|0|g| |
| if (dst != src) { |
| movaps(dst, src); |
| } |
| psrld(dst, byte{16}); |
| // dst = |a+b|c+d|e+f|g+h| |
| paddd(dst, tmp); |
| } |
| } |
| |
| // 1. Multiply the low 16 bits of each 16x16 product into scratch (pmullw). |
| // 2. Multiply the high 16 bits (signed or unsigned) into dst (pmulhw/pmulhuw). |
| // 3. Interleave scratch and dst to assemble the full 32-bit products into dst. |
| void SharedTurboAssembler::I32x4ExtMul(XMMRegister dst, XMMRegister src1, |
| XMMRegister src2, XMMRegister scratch, |
| bool low, bool is_signed) { |
| ASM_CODE_COMMENT(this); |
| if (CpuFeatures::IsSupported(AVX)) { |
| CpuFeatureScope avx_scope(this, AVX); |
| vpmullw(scratch, src1, src2); |
| is_signed ? vpmulhw(dst, src1, src2) : vpmulhuw(dst, src1, src2); |
| low ? vpunpcklwd(dst, scratch, dst) : vpunpckhwd(dst, scratch, dst); |
| } else { |
| DCHECK_EQ(dst, src1); |
| movaps(scratch, src1); |
| pmullw(dst, src2); |
| is_signed ? pmulhw(scratch, src2) : pmulhuw(scratch, src2); |
| low ? punpcklwd(dst, scratch) : punpckhwd(dst, scratch); |
| } |
| } |
| |
| void SharedTurboAssembler::I32x4SConvertI16x8High(XMMRegister dst, |
| XMMRegister src) { |
| ASM_CODE_COMMENT(this); |
| if (CpuFeatures::IsSupported(AVX)) { |
| CpuFeatureScope avx_scope(this, AVX); |
| // src = |a|b|c|d|e|f|g|h| (high) |
| // dst = |e|e|f|f|g|g|h|h| |
| vpunpckhwd(dst, src, src); |
| vpsrad(dst, dst, 16); |
| } else { |
| CpuFeatureScope sse_scope(this, SSE4_1); |
| if (dst == src) { |
| // 2 bytes shorter than pshufd, but has dependency on dst. |
| movhlps(dst, src); |
| pmovsxwd(dst, dst); |
| } else { |
| // No dependency on dst. |
| pshufd(dst, src, 0xEE); |
| pmovsxwd(dst, dst); |
| } |
| } |
| } |
| |
| void SharedTurboAssembler::I32x4UConvertI16x8High(XMMRegister dst, |
| XMMRegister src, |
| XMMRegister scratch) { |
| ASM_CODE_COMMENT(this); |
| if (CpuFeatures::IsSupported(AVX)) { |
| CpuFeatureScope avx_scope(this, AVX); |
| // scratch = |0|0|0|0|0|0|0|0| |
| // src = |a|b|c|d|e|f|g|h| |
| // dst = |0|a|0|b|0|c|0|d| |
| XMMRegister tmp = dst == src ? scratch : dst; |
| vpxor(tmp, tmp, tmp); |
| vpunpckhwd(dst, src, tmp); |
| } else { |
| if (dst == src) { |
| // xorps can be executed on more ports than pshufd. |
| xorps(scratch, scratch); |
| punpckhwd(dst, scratch); |
| } else { |
| CpuFeatureScope sse_scope(this, SSE4_1); |
| // No dependency on dst. |
| pshufd(dst, src, 0xEE); |
| pmovzxwd(dst, dst); |
| } |
| } |
| } |
| |
| void SharedTurboAssembler::I64x2Neg(XMMRegister dst, XMMRegister src, |
| XMMRegister scratch) { |
| ASM_CODE_COMMENT(this); |
| if (CpuFeatures::IsSupported(AVX)) { |
| CpuFeatureScope scope(this, AVX); |
| vpxor(scratch, scratch, scratch); |
| vpsubq(dst, scratch, src); |
| } else { |
| if (dst == src) { |
| movaps(scratch, src); |
| std::swap(src, scratch); |
| } |
| pxor(dst, dst); |
| psubq(dst, src); |
| } |
| } |
| |
| void SharedTurboAssembler::I64x2Abs(XMMRegister dst, XMMRegister src, |
| XMMRegister scratch) { |
| ASM_CODE_COMMENT(this); |
| if (CpuFeatures::IsSupported(AVX)) { |
| CpuFeatureScope avx_scope(this, AVX); |
| XMMRegister tmp = dst == src ? scratch : dst; |
| vpxor(tmp, tmp, tmp); |
| vpsubq(tmp, tmp, src); |
| vblendvpd(dst, src, tmp, src); |
| } else { |
| CpuFeatureScope sse_scope(this, SSE3); |
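| // abs(x) == (x ^ sign) - sign, where sign is 0 or all-ones per qword: |
| // movshdup duplicates each qword's high dword and psrad(31) smears its sign |
| // bit across the whole lane. |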
| movshdup(scratch, src); |
| if (dst != src) { |
| movaps(dst, src); |
| } |
| psrad(scratch, 31); |
| xorps(dst, scratch); |
| psubq(dst, scratch); |
| } |
| } |
| |
| void SharedTurboAssembler::I64x2GtS(XMMRegister dst, XMMRegister src0, |
| XMMRegister src1, XMMRegister scratch) { |
| ASM_CODE_COMMENT(this); |
| if (CpuFeatures::IsSupported(AVX)) { |
| CpuFeatureScope avx_scope(this, AVX); |
| vpcmpgtq(dst, src0, src1); |
| } else if (CpuFeatures::IsSupported(SSE4_2)) { |
| CpuFeatureScope sse_scope(this, SSE4_2); |
| if (dst == src0) { |
| pcmpgtq(dst, src1); |
| } else if (dst == src1) { |
| movaps(scratch, src0); |
| pcmpgtq(scratch, src1); |
| movaps(dst, scratch); |
| } else { |
| movaps(dst, src0); |
| pcmpgtq(dst, src1); |
| } |
| } else { |
| CpuFeatureScope sse_scope(this, SSE3); |
| DCHECK_NE(dst, src0); |
| DCHECK_NE(dst, src1); |
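| // Emulate the signed 64-bit compare with 32-bit ops: |
| //   src0 > src1  <=>  src0_hi > src1_hi, or |
| //                     src0_hi == src1_hi and (src1 - src0) borrows into its |
| //                     high dword. |
| // Each qword's result lands in its high dword; movshdup copies it into the |
| // low dword to form the full lane mask. |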
| movaps(dst, src1); |
| movaps(scratch, src0); |
| psubq(dst, src0); |
| pcmpeqd(scratch, src1); |
| andps(dst, scratch); |
| movaps(scratch, src0); |
| pcmpgtd(scratch, src1); |
| orps(dst, scratch); |
| movshdup(dst, dst); |
| } |
| } |
| |
| void SharedTurboAssembler::I64x2GeS(XMMRegister dst, XMMRegister src0, |
| XMMRegister src1, XMMRegister scratch) { |
| ASM_CODE_COMMENT(this); |
| if (CpuFeatures::IsSupported(AVX)) { |
| CpuFeatureScope avx_scope(this, AVX); |
| vpcmpgtq(dst, src1, src0); |
| vpcmpeqd(scratch, scratch, scratch); |
| vpxor(dst, dst, scratch); |
| } else if (CpuFeatures::IsSupported(SSE4_2)) { |
| CpuFeatureScope sse_scope(this, SSE4_2); |
| DCHECK_NE(dst, src0); |
| if (dst != src1) { |
| movaps(dst, src1); |
| } |
| pcmpgtq(dst, src0); |
| pcmpeqd(scratch, scratch); |
| xorps(dst, scratch); |
| } else { |
| CpuFeatureScope sse_scope(this, SSE3); |
| DCHECK_NE(dst, src0); |
| DCHECK_NE(dst, src1); |
| movaps(dst, src0); |
| movaps(scratch, src1); |
| psubq(dst, src1); |
| pcmpeqd(scratch, src0); |
| andps(dst, scratch); |
| movaps(scratch, src1); |
| pcmpgtd(scratch, src0); |
| orps(dst, scratch); |
| movshdup(dst, dst); |
| pcmpeqd(scratch, scratch); |
| xorps(dst, scratch); |
| } |
| } |
| |
| void SharedTurboAssembler::I64x2ShrS(XMMRegister dst, XMMRegister src, |
| uint8_t shift, XMMRegister xmm_tmp) { |
| ASM_CODE_COMMENT(this); |
| DCHECK_GT(64, shift); |
| DCHECK_NE(xmm_tmp, dst); |
| DCHECK_NE(xmm_tmp, src); |
| // Use logical right shift to emulate arithmetic right shifts: |
| // Given: |
| // signed >> c |
| // == (signed + 2^63 - 2^63) >> c |
| // == ((signed + 2^63) >> c) - (2^63 >> c) |
| // ^^^^^^^^^ |
| // xmm_tmp |
| // signed + 2^63 is an unsigned number, so we can use logical right shifts. |
| |
| // xmm_tmp = wasm_i64x2_const(0x80000000'00000000). |
| Pcmpeqd(xmm_tmp, xmm_tmp); |
| Psllq(xmm_tmp, byte{63}); |
| |
| if (!CpuFeatures::IsSupported(AVX) && (dst != src)) { |
| movaps(dst, src); |
| src = dst; |
| } |
| // Add a bias of 2^63 to convert signed to unsigned. |
| // Since only highest bit changes, use pxor instead of paddq. |
| Pxor(dst, src, xmm_tmp); |
| // Logically shift both value and bias. |
| Psrlq(dst, shift); |
| Psrlq(xmm_tmp, shift); |
| // Subtract shifted bias to convert back to signed value. |
| Psubq(dst, xmm_tmp); |
| } |
| |
| void SharedTurboAssembler::I64x2ShrS(XMMRegister dst, XMMRegister src, |
| Register shift, XMMRegister xmm_tmp, |
| XMMRegister xmm_shift, |
| Register tmp_shift) { |
| ASM_CODE_COMMENT(this); |
| DCHECK_NE(xmm_tmp, dst); |
| DCHECK_NE(xmm_tmp, src); |
| DCHECK_NE(xmm_shift, dst); |
| DCHECK_NE(xmm_shift, src); |
| // tmp_shift can alias shift since we don't use shift after masking it. |
| |
| // See I64x2ShrS with constant shift for explanation of this algorithm. |
| Pcmpeqd(xmm_tmp, xmm_tmp); |
| Psllq(xmm_tmp, byte{63}); |
| |
| // Shift modulo 64. |
| Move(tmp_shift, shift); |
| And(tmp_shift, Immediate(0x3F)); |
| Movd(xmm_shift, tmp_shift); |
| |
| if (!CpuFeatures::IsSupported(AVX) && (dst != src)) { |
| movaps(dst, src); |
| src = dst; |
| } |
| Pxor(dst, src, xmm_tmp); |
| Psrlq(dst, xmm_shift); |
| Psrlq(xmm_tmp, xmm_shift); |
| Psubq(dst, xmm_tmp); |
| } |
| |
| void SharedTurboAssembler::I64x2Mul(XMMRegister dst, XMMRegister lhs, |
| XMMRegister rhs, XMMRegister tmp1, |
| XMMRegister tmp2) { |
| ASM_CODE_COMMENT(this); |
| DCHECK(!AreAliased(dst, tmp1, tmp2)); |
| DCHECK(!AreAliased(lhs, tmp1, tmp2)); |
| DCHECK(!AreAliased(rhs, tmp1, tmp2)); |
| |
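| // Schoolbook multiplication from 32-bit halves, keeping the low 64 bits: |
| //   (a_hi * 2^32 + a_lo) * (b_hi * 2^32 + b_lo) |
| //     == ((a_hi * b_lo + a_lo * b_hi) << 32) + a_lo * b_lo   (mod 2^64) |
| // pmuludq supplies the 32x32->64 partial products. |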
| if (CpuFeatures::IsSupported(AVX)) { |
| CpuFeatureScope avx_scope(this, AVX); |
| // 1. Multiply high dword of each qword of left with right. |
| vpsrlq(tmp1, lhs, byte{32}); |
| vpmuludq(tmp1, tmp1, rhs); |
| // 2. Multiply high dword of each qword of right with left. |
| vpsrlq(tmp2, rhs, byte{32}); |
| vpmuludq(tmp2, tmp2, lhs); |
| // 3. Add 1 and 2, then shift left by 32 (this is the high dword of result). |
| vpaddq(tmp2, tmp2, tmp1); |
| vpsllq(tmp2, tmp2, byte{32}); |
| // 4. Multiply low dwords (this is the low dword of result). |
| vpmuludq(dst, lhs, rhs); |
| // 5. Add 3 and 4. |
| vpaddq(dst, dst, tmp2); |
| } else { |
| // Same algorithm as the AVX version, but with extra moves so that the |
| // inputs are not overwritten. |
| movaps(tmp1, lhs); |
| movaps(tmp2, rhs); |
| psrlq(tmp1, byte{32}); |
| pmuludq(tmp1, rhs); |
| psrlq(tmp2, byte{32}); |
| pmuludq(tmp2, lhs); |
| paddq(tmp2, tmp1); |
| psllq(tmp2, byte{32}); |
| if (dst == rhs) { |
| // pmuludq is commutative |
| pmuludq(dst, lhs); |
| } else { |
| if (dst != lhs) { |
| movaps(dst, lhs); |
| } |
| pmuludq(dst, rhs); |
| } |
| paddq(dst, tmp2); |
| } |
| } |
| |
| // 1. Duplicate each dword of the selected (low or high) half of src1 into |
| //    scratch. |
| // 2. Duplicate each dword of the selected half of src2 into dst. |
| // 3. Multiply the even-numbered dwords of scratch and dst (pmuldq/pmuludq). |
| // For non-AVX, use non-destructive pshufd instead of punpckldq/punpckhdq. |
| void SharedTurboAssembler::I64x2ExtMul(XMMRegister dst, XMMRegister src1, |
| XMMRegister src2, XMMRegister scratch, |
| bool low, bool is_signed) { |
| ASM_CODE_COMMENT(this); |
| if (CpuFeatures::IsSupported(AVX)) { |
| CpuFeatureScope avx_scope(this, AVX); |
| if (low) { |
| vpunpckldq(scratch, src1, src1); |
| vpunpckldq(dst, src2, src2); |
| } else { |
| vpunpckhdq(scratch, src1, src1); |
| vpunpckhdq(dst, src2, src2); |
| } |
| if (is_signed) { |
| vpmuldq(dst, scratch, dst); |
| } else { |
| vpmuludq(dst, scratch, dst); |
| } |
| } else { |
| uint8_t mask = low ? 0x50 : 0xFA; |
| pshufd(scratch, src1, mask); |
| pshufd(dst, src2, mask); |
| if (is_signed) { |
| CpuFeatureScope sse4_scope(this, SSE4_1); |
| pmuldq(dst, scratch); |
| } else { |
| pmuludq(dst, scratch); |
| } |
| } |
| } |
| |
| void SharedTurboAssembler::I64x2SConvertI32x4High(XMMRegister dst, |
| XMMRegister src) { |
| ASM_CODE_COMMENT(this); |
| if (CpuFeatures::IsSupported(AVX)) { |
| CpuFeatureScope avx_scope(this, AVX); |
| vpunpckhqdq(dst, src, src); |
| vpmovsxdq(dst, dst); |
| } else { |
| CpuFeatureScope sse_scope(this, SSE4_1); |
| if (dst == src) { |
| movhlps(dst, src); |
| } else { |
| pshufd(dst, src, 0xEE); |
| } |
| pmovsxdq(dst, dst); |
| } |
| } |
| |
| void SharedTurboAssembler::I64x2UConvertI32x4High(XMMRegister dst, |
| XMMRegister src, |
| XMMRegister scratch) { |
| ASM_CODE_COMMENT(this); |
| if (CpuFeatures::IsSupported(AVX)) { |
| CpuFeatureScope avx_scope(this, AVX); |
| vpxor(scratch, scratch, scratch); |
| vpunpckhdq(dst, src, scratch); |
| } else { |
| if (dst == src) { |
| // xorps can be executed on more ports than pshufd. |
| xorps(scratch, scratch); |
| punpckhdq(dst, scratch); |
| } else { |
| CpuFeatureScope sse_scope(this, SSE4_1); |
| // No dependency on dst. |
| pshufd(dst, src, 0xEE); |
| pmovzxdq(dst, dst); |
| } |
| } |
| } |
| |
| void SharedTurboAssembler::S128Not(XMMRegister dst, XMMRegister src, |
| XMMRegister scratch) { |
| ASM_CODE_COMMENT(this); |
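| // NOT is implemented as XOR against an all-ones register (pcmpeqd of a |
| // register with itself). |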
| if (dst == src) { |
| Pcmpeqd(scratch, scratch); |
| Pxor(dst, scratch); |
| } else { |
| Pcmpeqd(dst, dst); |
| Pxor(dst, src); |
| } |
| } |
| |
| void SharedTurboAssembler::S128Select(XMMRegister dst, XMMRegister mask, |
| XMMRegister src1, XMMRegister src2, |
| XMMRegister scratch) { |
| ASM_CODE_COMMENT(this); |
| // v128.select = v128.or(v128.and(v1, c), v128.andnot(v2, c)). |
| // pandn(x, y) = !x & y, so we have to flip the mask and input. |
| if (CpuFeatures::IsSupported(AVX)) { |
| CpuFeatureScope avx_scope(this, AVX); |
| vpandn(scratch, mask, src2); |
| vpand(dst, src1, mask); |
| vpor(dst, dst, scratch); |
| } else { |
| DCHECK_EQ(dst, mask); |
| // Use float ops as they are 1 byte shorter than int ops. |
| movaps(scratch, mask); |
| andnps(scratch, src2); |
| andps(dst, src1); |
| orps(dst, scratch); |
| } |
| } |
| |
| void SharedTurboAssembler::S128Load8Splat(XMMRegister dst, Operand src, |
| XMMRegister scratch) { |
| ASM_CODE_COMMENT(this); |
| // The trap handler uses the current pc to create a landing pad, so that it |
| // can determine whether a trap occurred in Wasm code due to an OOB load. |
| // Make sure the first instruction in each case below is the one that loads. |
| if (CpuFeatures::IsSupported(AVX2)) { |
| CpuFeatureScope avx2_scope(this, AVX2); |
| vpbroadcastb(dst, src); |
| } else if (CpuFeatures::IsSupported(AVX)) { |
| CpuFeatureScope avx_scope(this, AVX); |
| // Avoid dependency on previous value of dst. |
| vpinsrb(dst, scratch, src, uint8_t{0}); |
| vpxor(scratch, scratch, scratch); |
| vpshufb(dst, dst, scratch); |
| } else { |
| CpuFeatureScope sse4_scope(this, SSE4_1); |
| pinsrb(dst, src, uint8_t{0}); |
| xorps(scratch, scratch); |
| pshufb(dst, scratch); |
| } |
| } |
| |
| void SharedTurboAssembler::S128Load16Splat(XMMRegister dst, Operand src, |
| XMMRegister scratch) { |
| ASM_CODE_COMMENT(this); |
| // The trap handler uses the current pc to create a landing pad, so that it |
| // can determine whether a trap occurred in Wasm code due to an OOB load. |
| // Make sure the first instruction in each case below is the one that loads. |
| if (CpuFeatures::IsSupported(AVX2)) { |
| CpuFeatureScope avx2_scope(this, AVX2); |
| vpbroadcastw(dst, src); |
| } else if (CpuFeatures::IsSupported(AVX)) { |
| CpuFeatureScope avx_scope(this, AVX); |
| // Avoid dependency on previous value of dst. |
| vpinsrw(dst, scratch, src, uint8_t{0}); |
| vpshuflw(dst, dst, uint8_t{0}); |
| vpunpcklqdq(dst, dst, dst); |
| } else { |
| pinsrw(dst, src, uint8_t{0}); |
| pshuflw(dst, dst, uint8_t{0}); |
| movlhps(dst, dst); |
| } |
| } |
| |
| void SharedTurboAssembler::S128Load32Splat(XMMRegister dst, Operand src) { |
| ASM_CODE_COMMENT(this); |
| // The trap handler uses the current pc to create a landing pad, so that it |
| // can determine whether a trap occurred in Wasm code due to an OOB load. |
| // Make sure the first instruction in each case below is the one that loads. |
| if (CpuFeatures::IsSupported(AVX)) { |
| CpuFeatureScope avx_scope(this, AVX); |
| vbroadcastss(dst, src); |
| } else { |
| movss(dst, src); |
| shufps(dst, dst, byte{0}); |
| } |
| } |
| |
| void SharedTurboAssembler::S128Store64Lane(Operand dst, XMMRegister src, |
| uint8_t laneidx) { |
| ASM_CODE_COMMENT(this); |
| if (laneidx == 0) { |
| Movlps(dst, src); |
| } else { |
| DCHECK_EQ(1, laneidx); |
| Movhps(dst, src); |
| } |
| } |
| |
| // Helper macro implementing the qfma macro-assembler functions. It handles |
| // every case of register aliasing to minimize the number of instructions. |
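| // The FMA3 forms are chosen so that whichever source aliases dst supplies the |
| // operand the instruction overwrites: |
| //   vfmadd231(a, b, c): a = b * c + a |
| //   vfmadd132(a, b, c): a = a * c + b |
| //   vfmadd213(a, b, c): a = b * a + c |
| // Each evaluates to src1 + src2 * src3 for its aliasing case. |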
| #define QFMA(ps_or_pd) \ |
| if (CpuFeatures::IsSupported(FMA3)) { \ |
| CpuFeatureScope fma3_scope(this, FMA3); \ |
| if (dst == src1) { \ |
| vfmadd231##ps_or_pd(dst, src2, src3); \ |
| } else if (dst == src2) { \ |
| vfmadd132##ps_or_pd(dst, src1, src3); \ |
| } else if (dst == src3) { \ |
| vfmadd213##ps_or_pd(dst, src2, src1); \ |
| } else { \ |
| CpuFeatureScope avx_scope(this, AVX); \ |
| vmovups(dst, src1); \ |
| vfmadd231##ps_or_pd(dst, src2, src3); \ |
| } \ |
| } else if (CpuFeatures::IsSupported(AVX)) { \ |
| CpuFeatureScope avx_scope(this, AVX); \ |
| vmul##ps_or_pd(tmp, src2, src3); \ |
| vadd##ps_or_pd(dst, src1, tmp); \ |
| } else { \ |
| if (dst == src1) { \ |
| movaps(tmp, src2); \ |
| mul##ps_or_pd(tmp, src3); \ |
| add##ps_or_pd(dst, tmp); \ |
| } else if (dst == src2) { \ |
| DCHECK_NE(src2, src1); \ |
| mul##ps_or_pd(src2, src3); \ |
| add##ps_or_pd(src2, src1); \ |
| } else if (dst == src3) { \ |
| DCHECK_NE(src3, src1); \ |
| mul##ps_or_pd(src3, src2); \ |
| add##ps_or_pd(src3, src1); \ |
| } else { \ |
| movaps(dst, src2); \ |
| mul##ps_or_pd(dst, src3); \ |
| add##ps_or_pd(dst, src1); \ |
| } \ |
| } |
| |
| // Helper macro implementing the qfms macro-assembler functions. It handles |
| // every case of register aliasing to minimize the number of instructions. |
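| // QFMS computes src1 - src2 * src3; the vfnmadd forms negate the product, so |
| // the same operand-form selection as in QFMA applies. |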
| #define QFMS(ps_or_pd) \ |
| if (CpuFeatures::IsSupported(FMA3)) { \ |
| CpuFeatureScope fma3_scope(this, FMA3); \ |
| if (dst == src1) { \ |
| vfnmadd231##ps_or_pd(dst, src2, src3); \ |
| } else if (dst == src2) { \ |
| vfnmadd132##ps_or_pd(dst, src1, src3); \ |
| } else if (dst == src3) { \ |
| vfnmadd213##ps_or_pd(dst, src2, src1); \ |
| } else { \ |
| CpuFeatureScope avx_scope(this, AVX); \ |
| vmovups(dst, src1); \ |
| vfnmadd231##ps_or_pd(dst, src2, src3); \ |
| } \ |
| } else if (CpuFeatures::IsSupported(AVX)) { \ |
| CpuFeatureScope avx_scope(this, AVX); \ |
| vmul##ps_or_pd(tmp, src2, src3); \ |
| vsub##ps_or_pd(dst, src1, tmp); \ |
| } else { \ |
| movaps(tmp, src2); \ |
| mul##ps_or_pd(tmp, src3); \ |
| if (dst != src1) { \ |
| movaps(dst, src1); \ |
| } \ |
| sub##ps_or_pd(dst, tmp); \ |
| } |
| |
| void SharedTurboAssembler::F32x4Qfma(XMMRegister dst, XMMRegister src1, |
| XMMRegister src2, XMMRegister src3, |
| XMMRegister tmp) { |
| QFMA(ps) |
| } |
| |
| void SharedTurboAssembler::F32x4Qfms(XMMRegister dst, XMMRegister src1, |
| XMMRegister src2, XMMRegister src3, |
| XMMRegister tmp) { |
| QFMS(ps) |
| } |
| |
| void SharedTurboAssembler::F64x2Qfma(XMMRegister dst, XMMRegister src1, |
| XMMRegister src2, XMMRegister src3, |
| XMMRegister tmp) { |
| QFMA(pd); |
| } |
| |
| void SharedTurboAssembler::F64x2Qfms(XMMRegister dst, XMMRegister src1, |
| XMMRegister src2, XMMRegister src3, |
| XMMRegister tmp) { |
| QFMS(pd); |
| } |
| |
| #undef QFMA |
| #undef QFMS |
| |
| } // namespace internal |
| } // namespace v8 |
| |
| #undef DCHECK_OPERAND_IS_NOT_REG |