[wasm-simd][liftoff][arm][arm64] Implement bitmask
Implement i8x16 i16x8 i32x4 bitmask for arm and arm64.
The instruction sequence is the same as in TurboFan; we add some special
handling to obtain the second temporary Q register, reusing src for it
when src is no longer live.
Bug: v8:9909,v8:10308
Change-Id: I1c6fe0d076f0e14d05c4cc532e4d976f4ebcce30
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2222608
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Reviewed-by: Clemens Backes <clemensb@chromium.org>
Cr-Commit-Position: refs/heads/master@{#68107}
diff --git a/src/wasm/baseline/arm/liftoff-assembler-arm.h b/src/wasm/baseline/arm/liftoff-assembler-arm.h
index 7a9ea4e..734b449 100644
--- a/src/wasm/baseline/arm/liftoff-assembler-arm.h
+++ b/src/wasm/baseline/arm/liftoff-assembler-arm.h
@@ -2575,7 +2575,26 @@
void LiftoffAssembler::emit_i32x4_bitmask(LiftoffRegister dst,
                                          LiftoffRegister src) {
- bailout(kSimd, "i32x4_bitmask");
+ UseScratchRegisterScope temps(this);
+ Simd128Register tmp = temps.AcquireQ();
+ Simd128Register mask = liftoff::GetSimd128Register(src);
+
+ if (cache_state()->is_used(src)) {
+ // mask defaults to src's register; src is still live here, so take an
+ LiftoffRegList pinned = LiftoffRegList::ForRegs(src);
+ LiftoffRegister unused_pair = GetUnusedRegister(kFpRegPair, pinned);
+ mask = liftoff::GetSimd128Register(unused_pair);
+ }
+
+ vshr(NeonS32, tmp, liftoff::GetSimd128Register(src), 31);
+ // tmp lanes are now all-ones (negative lane) or zero. AND with per-lane
+ // masks {1,2,4,8} keeps exactly bit i for each negative lane i.
+ vmov(mask.low(), Double((uint64_t)0x0000'0002'0000'0001));
+ vmov(mask.high(), Double((uint64_t)0x0000'0008'0000'0004));
+ vand(tmp, mask, tmp);
+ vpadd(Neon32, tmp.low(), tmp.low(), tmp.high());
+ vpadd(Neon32, tmp.low(), tmp.low(), kDoubleRegZero);
+ VmovLow(dst.gp(), tmp.low());
}
void LiftoffAssembler::emit_i32x4_shl(LiftoffRegister dst, LiftoffRegister lhs,
@@ -2689,7 +2708,27 @@
void LiftoffAssembler::emit_i16x8_bitmask(LiftoffRegister dst,
                                          LiftoffRegister src) {
- bailout(kSimd, "i16x8_bitmask");
+ UseScratchRegisterScope temps(this);
+ Simd128Register tmp = temps.AcquireQ();
+ Simd128Register mask = liftoff::GetSimd128Register(src);
+
+ if (cache_state()->is_used(src)) {
+ // mask defaults to src's register; src is still live here, so take an
+ LiftoffRegList pinned = LiftoffRegList::ForRegs(src);
+ LiftoffRegister unused_pair = GetUnusedRegister(kFpRegPair, pinned);
+ mask = liftoff::GetSimd128Register(unused_pair);
+ }
+
+ vshr(NeonS16, tmp, liftoff::GetSimd128Register(src), 15);
+ // tmp lanes are now all-ones (negative lane) or zero. AND with per-lane
+ // masks {1,2,...,0x80} keeps exactly bit i for each negative lane i.
+ vmov(mask.low(), Double((uint64_t)0x0008'0004'0002'0001));
+ vmov(mask.high(), Double((uint64_t)0x0080'0040'0020'0010));
+ vand(tmp, mask, tmp);
+ vpadd(Neon16, tmp.low(), tmp.low(), tmp.high());
+ vpadd(Neon16, tmp.low(), tmp.low(), tmp.low());
+ vpadd(Neon16, tmp.low(), tmp.low(), tmp.low());
+ vmov(NeonU16, dst.gp(), tmp.low(), 0);
}
void LiftoffAssembler::emit_i16x8_shl(LiftoffRegister dst, LiftoffRegister lhs,
@@ -2876,7 +2915,29 @@
void LiftoffAssembler::emit_i8x16_bitmask(LiftoffRegister dst,
                                          LiftoffRegister src) {
- bailout(kSimd, "i8x16_bitmask");
+ UseScratchRegisterScope temps(this);
+ Simd128Register tmp = temps.AcquireQ();
+ Simd128Register mask = liftoff::GetSimd128Register(src);
+
+ if (cache_state()->is_used(src)) {
+ // mask defaults to src's register; src is still live here, so take an
+ LiftoffRegList pinned = LiftoffRegList::ForRegs(src);
+ LiftoffRegister unused_pair = GetUnusedRegister(kFpRegPair, pinned);
+ mask = liftoff::GetSimd128Register(unused_pair);
+ }
+
+ vshr(NeonS8, tmp, liftoff::GetSimd128Register(src), 7);
+ // tmp lanes are now all-ones (negative lane) or zero. AND with the byte
+ // masks {1,...,0x80} keeps bit (i mod 8) for each negative byte lane i.
+ vmov(mask.low(), Double((uint64_t)0x8040'2010'0804'0201));
+ vmov(mask.high(), Double((uint64_t)0x8040'2010'0804'0201));
+ vand(tmp, mask, tmp);
+ vext(mask, tmp, tmp, 8);
+ vzip(Neon8, mask, tmp);
+ vpadd(Neon16, tmp.low(), tmp.low(), tmp.high());
+ vpadd(Neon16, tmp.low(), tmp.low(), tmp.low());
+ vpadd(Neon16, tmp.low(), tmp.low(), tmp.low());
+ vmov(NeonU16, dst.gp(), tmp.low(), 0);
}
void LiftoffAssembler::emit_i8x16_shl(LiftoffRegister dst, LiftoffRegister lhs,
diff --git a/src/wasm/baseline/arm64/liftoff-assembler-arm64.h b/src/wasm/baseline/arm64/liftoff-assembler-arm64.h
index 24e5da8..9120c49 100644
--- a/src/wasm/baseline/arm64/liftoff-assembler-arm64.h
+++ b/src/wasm/baseline/arm64/liftoff-assembler-arm64.h
@@ -1515,7 +1515,17 @@
void LiftoffAssembler::emit_i32x4_bitmask(LiftoffRegister dst,
                                          LiftoffRegister src) {
- bailout(kSimd, "i32x4_bitmask");
+ UseScratchRegisterScope temps(this);
+ VRegister tmp = temps.AcquireQ();
+ VRegister mask = temps.AcquireQ();
+
+ Sshr(tmp.V4S(), src.fp().V4S(), 31);
+ // Sshr made each lane all-ones (negative) or zero; AND with per-lane
+ // masks {1,2,4,8} keeps bit i per negative lane, and Addv sums them.
+ Movi(mask.V2D(), 0x0000'0008'0000'0004, 0x0000'0002'0000'0001);
+ And(tmp.V16B(), mask.V16B(), tmp.V16B());
+ Addv(tmp.S(), tmp.V4S());
+ Mov(dst.gp().W(), tmp.V4S(), 0);
}
void LiftoffAssembler::emit_i32x4_shl(LiftoffRegister dst, LiftoffRegister lhs,
@@ -1641,7 +1651,17 @@
void LiftoffAssembler::emit_i16x8_bitmask(LiftoffRegister dst,
                                          LiftoffRegister src) {
- bailout(kSimd, "i16x8_bitmask");
+ UseScratchRegisterScope temps(this);
+ VRegister tmp = temps.AcquireQ();
+ VRegister mask = temps.AcquireQ();
+
+ Sshr(tmp.V8H(), src.fp().V8H(), 15);
+ // Sshr made each lane all-ones (negative) or zero; AND with per-lane
+ // masks {1,...,0x80} keeps bit i per negative lane, and Addv sums them.
+ Movi(mask.V2D(), 0x0080'0040'0020'0010, 0x0008'0004'0002'0001);
+ And(tmp.V16B(), mask.V16B(), tmp.V16B());
+ Addv(tmp.H(), tmp.V8H());
+ Mov(dst.gp().W(), tmp.V8H(), 0);
}
void LiftoffAssembler::emit_i16x8_shl(LiftoffRegister dst, LiftoffRegister lhs,
@@ -1791,7 +1811,19 @@
void LiftoffAssembler::emit_i8x16_bitmask(LiftoffRegister dst,
                                          LiftoffRegister src) {
- bailout(kSimd, "i8x16_bitmask");
+ UseScratchRegisterScope temps(this);
+ VRegister tmp = temps.AcquireQ();
+ VRegister mask = temps.AcquireQ();
+
+ // Sshr + AND keeps byte mask {1,...,0x80} only in negative byte lanes.
+ // Ext/Zip1 pair bytes i and i+8 into halfwords so one Addv sums all 16 bits.
+ Sshr(tmp.V16B(), src.fp().V16B(), 7);
+ Movi(mask.V2D(), 0x8040'2010'0804'0201);
+ And(tmp.V16B(), mask.V16B(), tmp.V16B());
+ Ext(mask.V16B(), tmp.V16B(), tmp.V16B(), 8);
+ Zip1(tmp.V16B(), tmp.V16B(), mask.V16B());
+ Addv(tmp.H(), tmp.V8H());
+ Mov(dst.gp().W(), tmp.V8H(), 0);
}
void LiftoffAssembler::emit_i8x16_shl(LiftoffRegister dst, LiftoffRegister lhs,