[turboshaft][wasm] Fix stale reference in array.fill and array.new

When calling into C++ for array.fill (done when the length is at least
16 elements), Turboshaft passed the fill value in a 64-bit stack slot.
For reference values on 32-bit and on pointer-compressed builds, this
required converting the 32-bit tagged value into a 64-bit value. The
emitted __ ChangeInt32ToInt64() makes the value untagged. During GVN,
these zero-extensions can be de-duplicated. When the shared value is
then spilled, e.g. across a call into an allocation that can trigger a
GC, it ends up in an untagged spill slot that the GC ignores, so the
slot holds a stale reference afterwards.
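
In Turboshaft assembler terms, the old lowering boiled down to roughly
the following for a reference value on a pointer-compressed build (a
simplified sketch of the removed StoreInInt64StackSlot(), see the diff
below):

  OpIndex value_int64 = __ ChangeInt32ToInt64(value);  // now untagged
  V<WordPtr> stack_slot = __ StackSlot(8, 8);  // slot not marked tagged
  __ Store(stack_slot, value_int64, StoreOp::Kind::RawAligned(),
           MemoryRepresentation::Int64(),
           compiler::WriteBarrierKind::kNoWriteBarrier);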

This CL instead emits a stack slot that matches the input type,
removing the need for any conversions (such as zero-extensions) for
all types, and furthermore allows us to mark the stack slot for the C
call as tagged. (Stack slots are not GVN-ed and array.fill doesn't
allocate, so the tagging isn't strictly required.)
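
For reference values, the new StoreInStackSlot() keeps the uncompressed
tagged pointer and stores it into a slot marked as tagged. A simplified
sketch of the code in the diff below (the last StackSlot() argument is
type.is_reference() in the actual change):

  MemoryRepresentation memory_rep =
      MemoryRepresentation::UncompressedTaggedPointer();
  V<WordPtr> stack_slot = __ StackSlot(
      memory_rep.SizeInBytes(), memory_rep.SizeInBytes(), /*tagged*/ true);
  __ Store(stack_slot, value, StoreOp::Kind::RawAligned(), memory_rep,
           compiler::WriteBarrierKind::kNoWriteBarrier);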

Fixed: 381332096
Change-Id: If20a9a97ca8b75db4b7d533acb698efe859a94df
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/6070000
Reviewed-by: Jakob Kummerow <jkummerow@chromium.org>
Commit-Queue: Matthias Liedtke <mliedtke@chromium.org>
Cr-Commit-Position: refs/heads/main@{#97552}
diff --git a/src/wasm/turboshaft-graph-interface.cc b/src/wasm/turboshaft-graph-interface.cc
index 08b2eb1..17141ca 100644
--- a/src/wasm/turboshaft-graph-interface.cc
+++ b/src/wasm/turboshaft-graph-interface.cc
@@ -8001,7 +8001,7 @@
       constexpr uint32_t kArrayNewMinimumSizeForMemSet = 16;
       IF_NOT (__ Uint32LessThan(
                   length, __ Word32Constant(kArrayNewMinimumSizeForMemSet))) {
-        OpIndex stack_slot = StoreInInt64StackSlot(value, element_type);
+        OpIndex stack_slot = StoreInStackSlot(value, element_type);
         MachineType arg_types[]{
             MachineType::TaggedPointer(), MachineType::Uint32(),
             MachineType::Uint32(),        MachineType::Uint32(),
@@ -8027,31 +8027,25 @@
     BIND(done);
   }
 
-  V<WordPtr> StoreInInt64StackSlot(OpIndex value, wasm::ValueType type) {
-    OpIndex value_int64;
+  V<WordPtr> StoreInStackSlot(OpIndex value, wasm::ValueType type) {
+    // Packed types (i8, i16) are passed in unpacked, i.e. as an i32 value.
+    ValueType input_type = type.Unpacked();
+
     switch (type.kind()) {
       case wasm::kI32:
       case wasm::kI8:
       case wasm::kI16:
-        value_int64 = __ ChangeInt32ToInt64(value);
-        break;
       case wasm::kI64:
-        value_int64 = value;
+      case wasm::kF32:
+      case wasm::kF64:
+      case wasm::kRefNull:
+      case wasm::kRef:
         break;
       case wasm::kS128:
         // We can only get here if {value} is the constant 0.
         DCHECK(__ output_graph().Get(value).Cast<Simd128ConstantOp>().IsZero());
-        value_int64 = __ Word64Constant(uint64_t{0});
-        break;
-      case wasm::kF32:
-        value_int64 = __ ChangeUint32ToUint64(__ BitcastFloat32ToWord32(value));
-        break;
-      case wasm::kF64:
-        value_int64 = __ BitcastFloat64ToWord64(value);
-        break;
-      case wasm::kRefNull:
-      case wasm::kRef:
-        value_int64 = kTaggedSize == 4 ? __ ChangeInt32ToInt64(value) : value;
+        value = __ Word64Constant(uint64_t{0});
+        input_type = kWasmI64;
         break;
       case wasm::kF16:
         UNIMPLEMENTED();
@@ -8062,10 +8056,15 @@
         UNREACHABLE();
     }
 
-    MemoryRepresentation int64_rep = MemoryRepresentation::Int64();
+    // Unlike values on the heap, stack slot values are always uncompressed.
+    MemoryRepresentation memory_rep =
+        type.is_reference() ? MemoryRepresentation::UncompressedTaggedPointer()
+                            : MemoryRepresentation::FromMachineRepresentation(
+                                  input_type.machine_representation());
     V<WordPtr> stack_slot =
-        __ StackSlot(int64_rep.SizeInBytes(), int64_rep.SizeInBytes());
-    __ Store(stack_slot, value_int64, StoreOp::Kind::RawAligned(), int64_rep,
+        __ StackSlot(memory_rep.SizeInBytes(), memory_rep.SizeInBytes(),
+                     type.is_reference());
+    __ Store(stack_slot, value, StoreOp::Kind::RawAligned(), memory_rep,
              compiler::WriteBarrierKind::kNoWriteBarrier);
     return stack_slot;
   }
diff --git a/src/wasm/wasm-external-refs.cc b/src/wasm/wasm-external-refs.cc
index 86b9ef7..d497318 100644
--- a/src/wasm/wasm-external-refs.cc
+++ b/src/wasm/wasm-external-refs.cc
@@ -868,24 +868,23 @@
   ValueType type = ValueType::FromRawBitField(raw_type);
   int8_t* initial_element_address = reinterpret_cast<int8_t*>(
       ArrayElementAddress(raw_array, index, type.value_kind_size()));
-  // Stack pointers are only aligned to 4 bytes.
-  int64_t initial_value = base::ReadUnalignedValue<int64_t>(initial_value_addr);
   const int bytes_to_set = length * type.value_kind_size();
 
-  // If the initial value is zero, we memset the array.
-  if (type.is_numeric() && initial_value == 0) {
-    std::memset(initial_element_address, 0, bytes_to_set);
-    return;
-  }
-
   // We implement the general case by setting the first 8 bytes manually, then
   // filling the rest by exponentially growing {memcpy}s.
 
-  DCHECK_GE(static_cast<size_t>(bytes_to_set), sizeof(int64_t));
+  CHECK_GE(static_cast<size_t>(bytes_to_set), sizeof(int64_t));
 
   switch (type.kind()) {
     case kI64:
     case kF64: {
+      // Stack pointers are only aligned to 4 bytes.
+      int64_t initial_value =
+          base::ReadUnalignedValue<int64_t>(initial_value_addr);
+      if (initial_value == 0) {
+        std::memset(initial_element_address, 0, bytes_to_set);
+        return;
+      }
       // Array elements are only aligned to 4 bytes, therefore
       // `initial_element_address` may be misaligned as a 64-bit pointer.
       base::WriteUnalignedValue<int64_t>(
@@ -894,36 +893,59 @@
     }
     case kI32:
     case kF32: {
+      int32_t initial_value = *reinterpret_cast<int32_t*>(initial_value_addr);
+      if (initial_value == 0) {
+        std::memset(initial_element_address, 0, bytes_to_set);
+        return;
+      }
       int32_t* base = reinterpret_cast<int32_t*>(initial_element_address);
-      base[0] = base[1] = static_cast<int32_t>(initial_value);
+      base[0] = base[1] = initial_value;
       break;
     }
     case kF16:
     case kI16: {
+      // The array.fill input is an i32!
+      int16_t initial_value = *reinterpret_cast<int32_t*>(initial_value_addr);
+      if (initial_value == 0) {
+        std::memset(initial_element_address, 0, bytes_to_set);
+        return;
+      }
       int16_t* base = reinterpret_cast<int16_t*>(initial_element_address);
-      base[0] = base[1] = base[2] = base[3] =
-          static_cast<int16_t>(initial_value);
+      base[0] = base[1] = base[2] = base[3] = initial_value;
       break;
     }
     case kI8: {
+      // The array.fill input is an i32!
+      int8_t initial_value = *reinterpret_cast<int32_t*>(initial_value_addr);
+      if (initial_value == 0) {
+        std::memset(initial_element_address, 0, bytes_to_set);
+        return;
+      }
       int8_t* base = reinterpret_cast<int8_t*>(initial_element_address);
       for (size_t i = 0; i < sizeof(int64_t); i++) {
-        base[i] = static_cast<int8_t>(initial_value);
+        base[i] = initial_value;
       }
       break;
     }
     case kRefNull:
-    case kRef:
+    case kRef: {
+      intptr_t uncompressed_pointer =
+          base::ReadUnalignedValue<intptr_t>(initial_value_addr);
       if constexpr (kTaggedSize == 4) {
         int32_t* base = reinterpret_cast<int32_t*>(initial_element_address);
-        base[0] = base[1] = static_cast<int32_t>(initial_value);
+        base[0] = base[1] = static_cast<int32_t>(uncompressed_pointer);
       } else {
-        // We use WriteUnalignedValue; see above.
         base::WriteUnalignedValue(
-            reinterpret_cast<Address>(initial_element_address), initial_value);
+            reinterpret_cast<Address>(initial_element_address),
+            uncompressed_pointer);
       }
       break;
+    }
     case kS128:
+      // S128 can only be filled with zeros.
+      DCHECK_EQ(base::ReadUnalignedValue<int64_t>(initial_value_addr), 0);
+      std::memset(initial_element_address, 0, bytes_to_set);
+      return;
     case kRtt:
     case kVoid:
     case kTop:
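
The "exponentially growing {memcpy}s" mentioned in the hunk above work
roughly as follows (a hedged sketch of the fill loop, not the verbatim
V8 code; `dst` stands for initial_element_address):

  // The first sizeof(int64_t) bytes were written by the switch above;
  // keep doubling the initialized region, then copy the remaining tail.
  int filled = sizeof(int64_t);
  while (filled * 2 <= bytes_to_set) {
    std::memcpy(dst + filled, dst, filled);
    filled *= 2;
  }
  std::memcpy(dst + filled, dst, bytes_to_set - filled);
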
diff --git a/test/mjsunit/wasm/array-fill-gc.js b/test/mjsunit/wasm/array-fill-gc.js
new file mode 100644
index 0000000..65f4761
--- /dev/null
+++ b/test/mjsunit/wasm/array-fill-gc.js
@@ -0,0 +1,74 @@
+// Copyright 2024 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Flags: --expose-gc
+
+d8.file.execute("test/mjsunit/wasm/wasm-module-builder.js");
+
+(function TestRepeatedArrayFillWithGC() {
+  print(arguments.callee.name);
+  let builder = new WasmModuleBuilder();
+
+  let struct = builder.addStruct([makeField(kWasmI64, true)]);
+  let array = builder.addArray(wasmRefNullType(struct), true);
+
+  let gcFunc = builder.addImport("m", "gc", kSig_v_v);
+
+  // This needs to be large enough so that we call into C++ instead of
+  // running a loop in the generated code (see kArrayNewMinimumSizeForMemSet).
+  let arrayLength = 50;
+
+  builder.addFunction("create", makeSig([kWasmI64], [wasmRefNullType(array)]))
+  .addLocals(wasmRefNullType(array), 1)
+  .addLocals(wasmRefNullType(struct), 1)
+  .addBody([
+    // Create the fill value (young generation).
+    kExprLocalGet, 0,
+    kGCPrefix, kExprStructNew, struct,
+    kExprLocalTee, 2,
+    // Create an array.
+    // As the size is larger than kArrayNewMinimumSizeForMemSet (16), this
+    // will call into C++ to initialize the elements. The fill value is
+    // passed as a pointer. There were two issues with that:
+    // 1) For a reference value on pointer-compressed builds, this emitted a
+    //    __ ChangeInt32ToInt64() operation, causing the value to become
+    //    untagged.
+    // 2) This untagged value was then stored in an untagged stack slot, so
+    //    any GC would not visit it. Note that this alone doesn't seem
+    //    triggerable, as the stack slot is not GVN'ed (so the second fill
+    //    initializes a separate stack slot).
+    kExprI32Const, arrayLength,
+    kGCPrefix, kExprArrayNew, array,
+    kExprDrop,
+    // Trigger a gc.
+    kExprCallFunction, gcFunc,
+    // Create another array.
+    // The initialization emitted the same __ ChangeInt32ToInt64() for the
+    // fill value (the struct.new above). Due to GVN, the compiler decides to
+    // reuse the instruction from above. However, due to the GC in between,
+    // this "int64" now contains an outdated pointer into the previous
+    // half-space of the young generation. (Note that we should pass the
+    // decompressed pointer anyway; zero-extending the compressed value does
+    // not match our calling convention. It just happens not to be an issue,
+    // as we only store the value into the tagged (compressed) array slots.)
+    kExprLocalGet, 2,
+    kExprI32Const, arrayLength,
+    kGCPrefix, kExprArrayNew, array,
+  ]).exportFunc();
+
+  builder.addFunction("get",
+    makeSig([wasmRefNullType(array), kWasmI32], [kWasmI64]))
+  .addBody([
+    kExprLocalGet, 0,
+    kExprLocalGet, 1,
+    kGCPrefix, kExprArrayGet, array,
+    kGCPrefix, kExprStructGet, struct, 0,
+  ]).exportFunc();
+
+  let wasm = builder.instantiate({m: {gc}}).exports;
+  let arr = wasm.create(43n);
+  for (let i = 0; i < arrayLength; ++i) {
+    assertEquals(43n, wasm.get(arr, i));
+  }
+})();