[wasm-simd][liftoff][arm][arm64] Implement bitmask

Implement i8x16, i16x8, and i32x4 bitmask for arm and arm64.

The instruction sequence is the same as in TurboFan. The only special
handling is for the second temporary Q register on arm: reuse src if
possible, otherwise fall back to an unused register pair.
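
A scalar sketch of the semantics (illustrative only; the standalone
helper and its name are made up, this is not the emitted sequence):

  #include <stdint.h>

  // Bit i of the result is the sign (top) bit of lane i.
  uint32_t I8x16BitMask(const int8_t lanes[16]) {
    uint32_t result = 0;
    for (int i = 0; i < 16; ++i) {
      if (lanes[i] < 0) result |= uint32_t{1} << i;
    }
    return result;
  }

The i16x8 and i32x4 variants are the same loop over 8 and 4 lanes.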

Bug: v8:9909,v8:10308
Change-Id: I1c6fe0d076f0e14d05c4cc532e4d976f4ebcce30
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2222608
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Reviewed-by: Clemens Backes <clemensb@chromium.org>
Cr-Commit-Position: refs/heads/master@{#68107}
diff --git a/src/wasm/baseline/arm/liftoff-assembler-arm.h b/src/wasm/baseline/arm/liftoff-assembler-arm.h
index 7a9ea4e..734b449 100644
--- a/src/wasm/baseline/arm/liftoff-assembler-arm.h
+++ b/src/wasm/baseline/arm/liftoff-assembler-arm.h
@@ -2575,7 +2575,26 @@
 
 void LiftoffAssembler::emit_i32x4_bitmask(LiftoffRegister dst,
                                           LiftoffRegister src) {
-  bailout(kSimd, "i32x4_bitmask");
+  UseScratchRegisterScope temps(this);
+  Simd128Register tmp = temps.AcquireQ();
+  Simd128Register mask = liftoff::GetSimd128Register(src);
+
+  if (cache_state()->is_used(src)) {
+    // We only have 1 scratch Q register; src is in use, so get another pair.
+    LiftoffRegList pinned = LiftoffRegList::ForRegs(src);
+    LiftoffRegister unused_pair = GetUnusedRegister(kFpRegPair, pinned);
+    mask = liftoff::GetSimd128Register(unused_pair);
+  }
+
+  vshr(NeonS32, tmp, liftoff::GetSimd128Register(src), 31);
+  // Set the i-th bit of each lane i. ANDing this mask with tmp keeps the
+  // i-th bit in lanes that were negative and clears all other lanes.
+  vmov(mask.low(), Double(uint64_t{0x0000'0002'0000'0001}));
+  vmov(mask.high(), Double(uint64_t{0x0000'0008'0000'0004}));
+  vand(tmp, mask, tmp);
+  vpadd(Neon32, tmp.low(), tmp.low(), tmp.high());
+  vpadd(Neon32, tmp.low(), tmp.low(), kDoubleRegZero);
+  VmovLow(dst.gp(), tmp.low());
 }
 
 void LiftoffAssembler::emit_i32x4_shl(LiftoffRegister dst, LiftoffRegister lhs,
@@ -2689,7 +2708,27 @@
 
 void LiftoffAssembler::emit_i16x8_bitmask(LiftoffRegister dst,
                                           LiftoffRegister src) {
-  bailout(kSimd, "i16x8_bitmask");
+  UseScratchRegisterScope temps(this);
+  Simd128Register tmp = temps.AcquireQ();
+  Simd128Register mask = liftoff::GetSimd128Register(src);
+
+  if (cache_state()->is_used(src)) {
+    // We only have 1 scratch Q register; src is in use, so get another pair.
+    LiftoffRegList pinned = LiftoffRegList::ForRegs(src);
+    LiftoffRegister unused_pair = GetUnusedRegister(kFpRegPair, pinned);
+    mask = liftoff::GetSimd128Register(unused_pair);
+  }
+
+  vshr(NeonS16, tmp, liftoff::GetSimd128Register(src), 15);
+  // Set the i-th bit of each lane i. ANDing this mask with tmp keeps the
+  // i-th bit in lanes that were negative and clears all other lanes.
+  vmov(mask.low(), Double(uint64_t{0x0008'0004'0002'0001}));
+  vmov(mask.high(), Double(uint64_t{0x0080'0040'0020'0010}));
+  vand(tmp, mask, tmp);
+  vpadd(Neon16, tmp.low(), tmp.low(), tmp.high());
+  vpadd(Neon16, tmp.low(), tmp.low(), tmp.low());
+  vpadd(Neon16, tmp.low(), tmp.low(), tmp.low());
+  vmov(NeonU16, dst.gp(), tmp.low(), 0);
 }
 
 void LiftoffAssembler::emit_i16x8_shl(LiftoffRegister dst, LiftoffRegister lhs,
@@ -2876,7 +2915,33 @@
 
 void LiftoffAssembler::emit_i8x16_bitmask(LiftoffRegister dst,
                                           LiftoffRegister src) {
-  bailout(kSimd, "i8x16_bitmask");
+  UseScratchRegisterScope temps(this);
+  Simd128Register tmp = temps.AcquireQ();
+  Simd128Register mask = liftoff::GetSimd128Register(src);
+
+  if (cache_state()->is_used(src)) {
+    // We only have 1 scratch Q register; src is in use, so get another pair.
+    LiftoffRegList pinned = LiftoffRegList::ForRegs(src);
+    LiftoffRegister unused_pair = GetUnusedRegister(kFpRegPair, pinned);
+    mask = liftoff::GetSimd128Register(unused_pair);
+  }
+
+  vshr(NeonS8, tmp, liftoff::GetSimd128Register(src), 7);
+  // Set bit (i % 8) of each lane i, repeating the pattern in both halves.
+  // ANDing this mask with tmp keeps the bit in lanes that were negative.
+  vmov(mask.low(), Double(uint64_t{0x8040'2010'0804'0201}));
+  vmov(mask.high(), Double(uint64_t{0x8040'2010'0804'0201}));
+  vand(tmp, mask, tmp);
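+  // tmp now has, in byte lane i, bit (i % 8) set iff lane i of src was
+  // negative. Interleave the two halves so each 16-bit lane of tmp pairs
+  // byte j (result bits 0-7) with byte j+8 (result bits 8-15), then
+  // pairwise-add everything down to a single 16-bit value in lane 0.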
+  vext(mask, tmp, tmp, 8);
+  vzip(Neon8, mask, tmp);
+  vpadd(Neon16, tmp.low(), tmp.low(), tmp.high());
+  vpadd(Neon16, tmp.low(), tmp.low(), tmp.low());
+  vpadd(Neon16, tmp.low(), tmp.low(), tmp.low());
+  vmov(NeonU16, dst.gp(), tmp.low(), 0);
 }
 
 void LiftoffAssembler::emit_i8x16_shl(LiftoffRegister dst, LiftoffRegister lhs,
diff --git a/src/wasm/baseline/arm64/liftoff-assembler-arm64.h b/src/wasm/baseline/arm64/liftoff-assembler-arm64.h
index 24e5da8..9120c49 100644
--- a/src/wasm/baseline/arm64/liftoff-assembler-arm64.h
+++ b/src/wasm/baseline/arm64/liftoff-assembler-arm64.h
@@ -1515,7 +1515,17 @@
 
 void LiftoffAssembler::emit_i32x4_bitmask(LiftoffRegister dst,
                                           LiftoffRegister src) {
-  bailout(kSimd, "i32x4_bitmask");
+  UseScratchRegisterScope temps(this);
+  VRegister tmp = temps.AcquireQ();
+  VRegister mask = temps.AcquireQ();
+
+  Sshr(tmp.V4S(), src.fp().V4S(), 31);
+  // Set the i-th bit of each lane i. ANDing this mask with tmp keeps the
+  // i-th bit in lanes that were negative and clears all other lanes.
+  Movi(mask.V2D(), 0x0000'0008'0000'0004, 0x0000'0002'0000'0001);
+  And(tmp.V16B(), mask.V16B(), tmp.V16B());
+  Addv(tmp.S(), tmp.V4S());
+  Mov(dst.gp().W(), tmp.V4S(), 0);
 }
 
 void LiftoffAssembler::emit_i32x4_shl(LiftoffRegister dst, LiftoffRegister lhs,
@@ -1641,7 +1651,17 @@
 
 void LiftoffAssembler::emit_i16x8_bitmask(LiftoffRegister dst,
                                           LiftoffRegister src) {
-  bailout(kSimd, "i16x8_bitmask");
+  UseScratchRegisterScope temps(this);
+  VRegister tmp = temps.AcquireQ();
+  VRegister mask = temps.AcquireQ();
+
+  Sshr(tmp.V8H(), src.fp().V8H(), 15);
+  // Set the i-th bit of each lane i. ANDing this mask with tmp keeps the
+  // i-th bit in lanes that were negative and clears all other lanes.
+  Movi(mask.V2D(), 0x0080'0040'0020'0010, 0x0008'0004'0002'0001);
+  And(tmp.V16B(), mask.V16B(), tmp.V16B());
+  Addv(tmp.H(), tmp.V8H());
+  Mov(dst.gp().W(), tmp.V8H(), 0);
 }
 
 void LiftoffAssembler::emit_i16x8_shl(LiftoffRegister dst, LiftoffRegister lhs,
@@ -1791,7 +1811,23 @@
 
 void LiftoffAssembler::emit_i8x16_bitmask(LiftoffRegister dst,
                                           LiftoffRegister src) {
-  bailout(kSimd, "i8x16_bitmask");
+  UseScratchRegisterScope temps(this);
+  VRegister tmp = temps.AcquireQ();
+  VRegister mask = temps.AcquireQ();
+
+  Sshr(tmp.V16B(), src.fp().V16B(), 7);
+  // Set bit (i % 8) of each lane i, repeating the pattern in both halves.
+  // ANDing this mask with tmp keeps the bit in lanes that were negative.
+  Movi(mask.V2D(), 0x8040'2010'0804'0201);
+  And(tmp.V16B(), mask.V16B(), tmp.V16B());
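+  // tmp now has, in byte lane i, bit (i % 8) set iff lane i of src was
+  // negative. Interleave the two halves so each 16-bit lane of tmp pairs
+  // byte j (result bits 0-7) with byte j+8 (result bits 8-15), then add
+  // across the vector to get the full 16-bit mask.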
+  Ext(mask.V16B(), tmp.V16B(), tmp.V16B(), 8);
+  Zip1(tmp.V16B(), tmp.V16B(), mask.V16B());
+  Addv(tmp.H(), tmp.V8H());
+  Mov(dst.gp().W(), tmp.V8H(), 0);
 }
 
 void LiftoffAssembler::emit_i8x16_shl(LiftoffRegister dst, LiftoffRegister lhs,