PPC: [wasm-simd] Add support for Simd128 moves and swaps

Adds LoadSimd128/StoreSimd128 helpers to TurboAssembler, replaces the
previously inlined load/store sequences with them, and uses them to
implement Simd128 pushes, moves, and swaps (register-register,
register-stack, and stack-stack) in the code generator. lvx/stvx
require 16-byte aligned addresses, so the helpers stage the value
through an aligned stack slot via lxvd/stxvd, whose lane reversals
cancel out.

Change-Id: Ie2668026c5b55af8813f159277bdbc83116c1a00
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2336776
Reviewed-by: Junliang Yan <jyan@ca.ibm.com>
Commit-Queue: Milad Farazmand <miladfar@ca.ibm.com>
Cr-Commit-Position: refs/heads/master@{#69227}
diff --git a/src/codegen/ppc/macro-assembler-ppc.cc b/src/codegen/ppc/macro-assembler-ppc.cc
index 86939256..c99977a 100644
--- a/src/codegen/ppc/macro-assembler-ppc.cc
+++ b/src/codegen/ppc/macro-assembler-ppc.cc
@@ -2828,6 +2828,26 @@
   }
 }
 
+void TurboAssembler::LoadSimd128(Simd128Register dst, const MemOperand& mem,
+                                 Register ScratchReg,
+                                 Simd128Register ScratchDoubleReg) {
+  // lvx requires a 16-byte aligned address, but mem may not be aligned.
+  // We first use lxvd/stxvd to copy the content to an aligned stack slot;
+  // lxvd itself reverses the lanes, so it cannot be used as is.
+  lxvd(ScratchDoubleReg, mem);
+  mr(ScratchReg, sp);
+  ClearRightImm(
+      sp, sp,
+      Operand(base::bits::WhichPowerOfTwo(16)));  // equivalent to &= -16
+  addi(sp, sp, Operand(-16));
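+  // r0 as the RA operand of an X-form memory access reads as zero, so
+  // MemOperand(r0, sp) addresses the aligned slot at sp directly.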
+  stxvd(ScratchDoubleReg, MemOperand(r0, sp));
+  // Load it with correct lane ordering.
+  lvx(dst, MemOperand(r0, sp));
+  mr(sp, ScratchReg);
+}
+
 void TurboAssembler::StoreDouble(DoubleRegister src, const MemOperand& mem,
                                  Register scratch) {
   Register base = mem.ra();
@@ -2880,6 +2900,24 @@
   }
 }
 
+void TurboAssembler::StoreSimd128(Simd128Register src, const MemOperand& mem,
+                                  Register ScratchReg,
+                                  Simd128Register ScratchDoubleReg) {
+  // stvx requires a 16-byte aligned address, but mem may not be aligned.
+  // stxvd itself reverses the lanes, so it cannot be used as is; we store to
+  // an aligned stack slot with stvx and copy to mem via lxvd/stxvd instead.
+  mr(ScratchReg, sp);
+  ClearRightImm(
+      sp, sp,
+      Operand(base::bits::WhichPowerOfTwo(16)));  // equivalent to &= -16
+  addi(sp, sp, Operand(-16));
+  stvx(src, MemOperand(r0, sp));
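+  // The lxvd/stxvd pair below each reverse the lanes, so the reversals cancel.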
+  lxvd(ScratchDoubleReg, MemOperand(r0, sp));
+  mr(sp, ScratchReg);
+  stxvd(ScratchDoubleReg, mem);
+}
+
 Register GetRegisterThatIsNotOneOf(Register reg1, Register reg2, Register reg3,
                                    Register reg4, Register reg5,
                                    Register reg6) {
@@ -3006,6 +3044,56 @@
   StoreDouble(scratch_1, src, r0);
 }
 
+void TurboAssembler::SwapSimd128(Simd128Register src, Simd128Register dst,
+                                 Simd128Register scratch) {
+  if (src == dst) return;
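+  // vor with both source operands the same acts as a vector register move.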
+  vor(scratch, src, src);
+  vor(src, dst, dst);
+  vor(dst, scratch, scratch);
+}
+
+void TurboAssembler::SwapSimd128(Simd128Register src, MemOperand dst,
+                                 Simd128Register scratch) {
+  DCHECK(!AreAliased(src, scratch));
+  // push d0, to be used as scratch
+  addi(sp, sp, Operand(-kSimd128Size));
+  StoreSimd128(d0, MemOperand(r0, sp), r0, scratch);
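+  // lxvd/stxvd use register+register addressing, so put the offset in ip.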
+  mov(ip, Operand(dst.offset()));
+  LoadSimd128(d0, MemOperand(dst.ra(), ip), r0, scratch);
+  StoreSimd128(src, MemOperand(dst.ra(), ip), r0, scratch);
+  vor(src, d0, d0);
+  // restore d0
+  LoadSimd128(d0, MemOperand(r0, sp), ip, scratch);
+  addi(sp, sp, Operand(kSimd128Size));
+}
+
+void TurboAssembler::SwapSimd128(MemOperand src, MemOperand dst,
+                                 Simd128Register scratch) {
+  // push d0 and d1, to be used as scratch
+  addi(sp, sp, Operand(-2 * kSimd128Size));
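+  // d0 is saved at sp and d1 at sp + kSimd128Size.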
+  StoreSimd128(d0, MemOperand(r0, sp), ip, scratch);
+  li(ip, Operand(kSimd128Size));
+  StoreSimd128(d1, MemOperand(ip, sp), r0, scratch);
+
+  mov(ip, Operand(src.offset()));
+  LoadSimd128(d0, MemOperand(src.ra(), ip), r0, scratch);
+  mov(ip, Operand(dst.offset()));
+  LoadSimd128(d1, MemOperand(dst.ra(), ip), r0, scratch);
+
+  StoreSimd128(d0, MemOperand(dst.ra(), ip), r0, scratch);
+  mov(ip, Operand(src.offset()));
+  StoreSimd128(d1, MemOperand(src.ra(), ip), r0, scratch);
+
+  // restore d0 and d1
+  LoadSimd128(d0, MemOperand(r0, sp), ip, scratch);
+  li(ip, Operand(kSimd128Size));
+  LoadSimd128(d1, MemOperand(ip, sp), r0, scratch);
+  addi(sp, sp, Operand(2 * kSimd128Size));
+}
+
 void TurboAssembler::ResetSpeculationPoisonRegister() {
   mov(kSpeculationPoisonRegister, Operand(-1));
 }
diff --git a/src/codegen/ppc/macro-assembler-ppc.h b/src/codegen/ppc/macro-assembler-ppc.h
index cea89a4..0e9c764 100644
--- a/src/codegen/ppc/macro-assembler-ppc.h
+++ b/src/codegen/ppc/macro-assembler-ppc.h
@@ -153,6 +153,8 @@
   void LoadFloat32(DoubleRegister dst, const MemOperand& mem,
                    Register scratch = no_reg);
   void LoadDoubleLiteral(DoubleRegister result, Double value, Register scratch);
+  void LoadSimd128(Simd128Register dst, const MemOperand& mem,
+                   Register ScratchReg, Simd128Register ScratchDoubleReg);
 
   // load a literal signed int value <value> to GPR <dst>
   void LoadIntLiteral(Register dst, int value);
@@ -175,6 +177,8 @@
                    Register scratch = no_reg);
   void StoreSingleU(DoubleRegister src, const MemOperand& mem,
                     Register scratch = no_reg);
+  void StoreSimd128(Simd128Register src, const MemOperand& mem,
+                    Register ScratchReg, Simd128Register ScratchDoubleReg);
 
   void Cmpi(Register src1, const Operand& src2, Register scratch,
             CRegister cr = cr7);
@@ -326,6 +330,11 @@
   void SwapDouble(DoubleRegister src, MemOperand dst, DoubleRegister scratch);
   void SwapDouble(MemOperand src, MemOperand dst, DoubleRegister scratch_0,
                   DoubleRegister scratch_1);
+  void SwapSimd128(Simd128Register src, Simd128Register dst,
+                   Simd128Register scratch);
+  void SwapSimd128(Simd128Register src, MemOperand dst,
+                   Simd128Register scratch);
+  void SwapSimd128(MemOperand src, MemOperand dst, Simd128Register scratch);
 
   // Before calling a C-function from generated code, align arguments on stack.
   // After aligning the frame, non-register arguments must be stored in
diff --git a/src/compiler/backend/ppc/code-generator-ppc.cc b/src/compiler/backend/ppc/code-generator-ppc.cc
index 9b86bd0..b0b63ed 100644
--- a/src/compiler/backend/ppc/code-generator-ppc.cc
+++ b/src/compiler/backend/ppc/code-generator-ppc.cc
@@ -108,7 +108,8 @@
     return MemoryOperand(mode, &first_index);
   }
 
-  MemOperand ToMemOperand(InstructionOperand* op) const {
+  MemOperand ToMemOperand(InstructionOperand* op,
+                          AddressingMode mode = kMode_None) const {
     DCHECK_NOT_NULL(op);
     DCHECK(op->IsStackSlot() || op->IsFPStackSlot());
     return SlotToMemOperand(AllocatedOperand::cast(op)->index());
@@ -1739,16 +1740,31 @@
     case kPPC_Push:
       if (instr->InputAt(0)->IsFPRegister()) {
         LocationOperand* op = LocationOperand::cast(instr->InputAt(0));
-        if (op->representation() == MachineRepresentation::kFloat64) {
-          __ StoreDoubleU(i.InputDoubleRegister(0),
-                          MemOperand(sp, -kDoubleSize), r0);
-          frame_access_state()->IncreaseSPDelta(kDoubleSize /
-                                                kSystemPointerSize);
-        } else {
-          DCHECK_EQ(MachineRepresentation::kFloat32, op->representation());
-          __ StoreSingleU(i.InputDoubleRegister(0),
-                          MemOperand(sp, -kSystemPointerSize), r0);
-          frame_access_state()->IncreaseSPDelta(1);
+        switch (op->representation()) {
+          case MachineRepresentation::kFloat32:
+            __ StoreSingleU(i.InputDoubleRegister(0),
+                            MemOperand(sp, -kSystemPointerSize), r0);
+            frame_access_state()->IncreaseSPDelta(1);
+            break;
+          case MachineRepresentation::kFloat64:
+            __ StoreDoubleU(i.InputDoubleRegister(0),
+                            MemOperand(sp, -kDoubleSize), r0);
+            frame_access_state()->IncreaseSPDelta(kDoubleSize /
+                                                  kSystemPointerSize);
+            break;
+          case MachineRepresentation::kSimd128: {
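+            // There is no update-form vector store analogous to stfdu, so
+            // decrement sp explicitly before storing.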
+            __ addi(sp, sp, Operand(-kSimd128Size));
+            __ StoreSimd128(i.InputDoubleRegister(0), MemOperand(r0, sp), r0,
+                            kScratchDoubleReg);
+            frame_access_state()->IncreaseSPDelta(kSimd128Size /
+                                                  kSystemPointerSize);
+            break;
+          }
+          default:
+            UNREACHABLE();
+            break;
         }
       } else {
         __ StorePU(i.InputRegister(0), MemOperand(sp, -kSystemPointerSize), r0);
@@ -1781,10 +1797,15 @@
         if (op->representation() == MachineRepresentation::kFloat64) {
           __ StoreDouble(i.InputDoubleRegister(0),
                          MemOperand(sp, slot * kSystemPointerSize), r0);
-        } else {
-          DCHECK_EQ(MachineRepresentation::kFloat32, op->representation());
+        } else if (op->representation() == MachineRepresentation::kFloat32) {
           __ StoreSingle(i.InputDoubleRegister(0),
                          MemOperand(sp, slot * kSystemPointerSize), r0);
+        } else {
+          DCHECK_EQ(MachineRepresentation::kSimd128, op->representation());
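+          // StoreSimd128 needs the slot offset in a register; use ip.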
+          __ mov(ip, Operand(slot * kSystemPointerSize));
+          __ StoreSimd128(i.InputDoubleRegister(0), MemOperand(ip, sp), r0,
+                          kScratchDoubleReg);
         }
       } else {
         __ StoreP(i.InputRegister(0), MemOperand(sp, slot * kSystemPointerSize),
@@ -2044,20 +2065,7 @@
       bool is_atomic = i.InputInt32(2);
       // lvx only supports MRR.
       DCHECK_EQ(mode, kMode_MRR);
-      // lvx needs the stack to be 16 byte aligned.
-      // We first use lxvd/stxvd to copy the content on an aligned address. lxvd
-      // itself reverses the lanes so it cannot be used as is.
-      __ lxvd(kScratchDoubleReg, operand);
-      __ mr(kScratchReg, sp);
-      __ ClearRightImm(
-          sp, sp,
-          Operand(base::bits::WhichPowerOfTwo(16)));  // equivalent to &= -16
-      __ addi(sp, sp, Operand(-16));
-      __ li(r0, Operand(0));
-      __ stxvd(kScratchDoubleReg, MemOperand(sp, r0));
-      // Load it with correct lane ordering.
-      __ lvx(result, MemOperand(sp, r0));
-      __ mr(sp, kScratchReg);
+      __ LoadSimd128(result, operand, r0, kScratchDoubleReg);
       if (is_atomic) __ lwsync();
       DCHECK_EQ(LeaveRC, i.OutputRCBit());
       break;
@@ -2091,19 +2099,7 @@
       if (is_atomic) __ lwsync();
       // stvx only supports MRR.
       DCHECK_EQ(mode, kMode_MRR);
-      // stvx needs the stack to be 16 byte aligned.
-      // We use lxvd/stxvd to store the content on an aligned address. stxvd
-      // itself reverses the lanes so it cannot be used as is.
-      __ mr(kScratchReg, sp);
-      __ ClearRightImm(
-          sp, sp,
-          Operand(base::bits::WhichPowerOfTwo(16)));  // equivalent to &= -16
-      __ addi(sp, sp, Operand(-16));
-      __ li(r0, Operand(0));
-      __ stvx(value, MemOperand(sp, r0));
-      __ lxvd(kScratchDoubleReg, MemOperand(sp, r0));
-      __ stxvd(kScratchDoubleReg, operand);
-      __ mr(sp, kScratchReg);
+      __ StoreSimd128(value, operand, r0, kScratchDoubleReg);
       if (is_atomic) __ sync();
       DCHECK_EQ(LeaveRC, i.OutputRCBit());
       break;
@@ -3877,17 +3873,32 @@
       }
     }
   } else if (source->IsFPRegister()) {
-    DoubleRegister src = g.ToDoubleRegister(source);
-    if (destination->IsFPRegister()) {
-      DoubleRegister dst = g.ToDoubleRegister(destination);
-      __ Move(dst, src);
-    } else {
-      DCHECK(destination->IsFPStackSlot());
-      LocationOperand* op = LocationOperand::cast(source);
-      if (op->representation() == MachineRepresentation::kFloat64) {
-        __ StoreDouble(src, g.ToMemOperand(destination), r0);
+    MachineRepresentation rep = LocationOperand::cast(source)->representation();
+    if (rep == MachineRepresentation::kSimd128) {
+      if (destination->IsSimd128Register()) {
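+        // vor of a register with itself copies it to the destination.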
+        __ vor(g.ToSimd128Register(destination), g.ToSimd128Register(source),
+               g.ToSimd128Register(source));
       } else {
-        __ StoreSingle(src, g.ToMemOperand(destination), r0);
+        DCHECK(destination->IsSimd128StackSlot());
+        MemOperand dst = g.ToMemOperand(destination);
+        __ mov(ip, Operand(dst.offset()));
+        __ StoreSimd128(g.ToSimd128Register(source), MemOperand(dst.ra(), ip),
+                        r0, kScratchDoubleReg);
+      }
+    } else {
+      DoubleRegister src = g.ToDoubleRegister(source);
+      if (destination->IsFPRegister()) {
+        DoubleRegister dst = g.ToDoubleRegister(destination);
+        __ Move(dst, src);
+      } else {
+        DCHECK(destination->IsFPStackSlot());
+        LocationOperand* op = LocationOperand::cast(source);
+        if (op->representation() == MachineRepresentation::kFloat64) {
+          __ StoreDouble(src, g.ToMemOperand(destination), r0);
+        } else {
+          __ StoreSingle(src, g.ToMemOperand(destination), r0);
+        }
       }
     }
   } else if (source->IsFPStackSlot()) {
@@ -3897,8 +3908,14 @@
       LocationOperand* op = LocationOperand::cast(source);
       if (op->representation() == MachineRepresentation::kFloat64) {
         __ LoadDouble(g.ToDoubleRegister(destination), src, r0);
-      } else {
+      } else if (op->representation() == MachineRepresentation::kFloat32) {
         __ LoadSingle(g.ToDoubleRegister(destination), src, r0);
+      } else {
+        DCHECK_EQ(MachineRepresentation::kSimd128, op->representation());
+        MemOperand src = g.ToMemOperand(source);
+        __ mov(ip, Operand(src.offset()));
+        __ LoadSimd128(g.ToSimd128Register(destination),
+                       MemOperand(src.ra(), ip), r0, kScratchDoubleReg);
       }
     } else {
       LocationOperand* op = LocationOperand::cast(source);
@@ -3906,9 +3923,24 @@
       if (op->representation() == MachineRepresentation::kFloat64) {
         __ LoadDouble(temp, src, r0);
         __ StoreDouble(temp, g.ToMemOperand(destination), r0);
-      } else {
+      } else if (op->representation() == MachineRepresentation::kFloat32) {
         __ LoadSingle(temp, src, r0);
         __ StoreSingle(temp, g.ToMemOperand(destination), r0);
+      } else {
+        DCHECK_EQ(MachineRepresentation::kSimd128, op->representation());
+        // push d0, to be used as scratch
+        __ addi(sp, sp, Operand(-kSimd128Size));
+        __ StoreSimd128(d0, MemOperand(r0, sp), r0, kScratchDoubleReg);
+        MemOperand src = g.ToMemOperand(source);
+        MemOperand dst = g.ToMemOperand(destination);
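+        // Stage the value through d0, materializing offsets in ip.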
+        __ mov(ip, Operand(src.offset()));
+        __ LoadSimd128(d0, MemOperand(src.ra(), ip), r0, kScratchDoubleReg);
+        __ mov(ip, Operand(dst.offset()));
+        __ StoreSimd128(d0, MemOperand(dst.ra(), ip), r0, kScratchDoubleReg);
+        // restore d0
+        __ LoadSimd128(d0, MemOperand(r0, sp), ip, kScratchDoubleReg);
+        __ addi(sp, sp, Operand(kSimd128Size));
       }
     }
   } else {
@@ -3963,8 +3995,20 @@
     DCHECK(destination->IsDoubleStackSlot());
     __ SwapDouble(g.ToMemOperand(source), g.ToMemOperand(destination),
                   kScratchDoubleReg, d0);
+
   } else if (source->IsSimd128Register()) {
-    UNREACHABLE();
+    Simd128Register src = g.ToSimd128Register(source);
+    if (destination->IsSimd128Register()) {
+      __ SwapSimd128(src, g.ToSimd128Register(destination), kScratchDoubleReg);
+    } else {
+      DCHECK(destination->IsSimd128StackSlot());
+      __ SwapSimd128(src, g.ToMemOperand(destination), kScratchDoubleReg);
+    }
+  } else if (source->IsSimd128StackSlot()) {
+    DCHECK(destination->IsSimd128StackSlot());
+    __ SwapSimd128(g.ToMemOperand(source), g.ToMemOperand(destination),
+                   kScratchDoubleReg);
+
   } else {
     UNREACHABLE();
   }