[AMDGPU] gfx908 hazard recognizer

Differential Revision: https://reviews.llvm.org/D64593

llvm-svn: 365829
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 16436be..a23348e 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -46,7 +46,8 @@
   TRI(TII.getRegisterInfo()),
   ClauseUses(TRI.getNumRegUnits()),
   ClauseDefs(TRI.getNumRegUnits()) {
-  MaxLookAhead = 5;
+  MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 18 : 5;
+  TSchedModel.init(&ST);
 }
 
 void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
@@ -181,6 +182,12 @@
       checkReadM0Hazards(MI) > 0)
     return NoopHazard;
 
+  if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
+    return NoopHazard;
+
+  if ((MI->mayLoad() || MI->mayStore()) && checkMAILdStHazards(MI) > 0)
+    return NoopHazard;
+
   if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
     return NoopHazard;
 
@@ -286,6 +293,12 @@
   if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI))
     return std::max(WaitStates, checkReadM0Hazards(MI));
 
+  if (SIInstrInfo::isMAI(*MI))
+    return std::max(WaitStates, checkMAIHazards(MI));
+
+  if (MI->mayLoad() || MI->mayStore())
+    return std::max(WaitStates, checkMAILdStHazards(MI));
+
   return WaitStates;
 }
 
@@ -1179,3 +1192,217 @@
   return FPAtomicToDenormModeWaitStates -
          ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
 }
+
+int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
+  assert(SIInstrInfo::isMAI(*MI));
+
+  int WaitStatesNeeded = 0;
+  unsigned Opc = MI->getOpcode();
+
+  auto IsVALUFn = [] (MachineInstr *MI) {
+    return SIInstrInfo::isVALU(*MI);
+  };
+
+  if (Opc != AMDGPU::V_ACCVGPR_READ_B32) { // MFMA or v_accvgpr_write
+    const int LegacyVALUWritesVGPRWaitStates = 2;
+    const int VALUWritesExecWaitStates = 4;
+    const int MaxWaitStates = 4;
+
+    int WaitStatesNeededForUse = VALUWritesExecWaitStates -
+      getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
+    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+
+    if (WaitStatesNeeded < MaxWaitStates) {
+      for (const MachineOperand &Use : MI->explicit_uses()) {
+        const int MaxWaitStates = 2;
+
+        if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
+          continue;
+
+        int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
+          getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
+        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+
+        if (WaitStatesNeeded == MaxWaitStates)
+          break;
+      }
+    }
+  }
+
+  auto IsMFMAFn = [] (MachineInstr *MI) {
+    return SIInstrInfo::isMAI(*MI) &&
+           MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32 &&
+           MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32;
+  };
+
+  for (const MachineOperand &Op : MI->explicit_operands()) {
+    if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
+      continue;
+
+    if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32)
+      continue;
+
+    const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
+    const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
+    const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
+    const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
+    const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
+    const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
+    const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
+    const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
+    const int MaxWaitStates = 18;
+    unsigned Reg = Op.getReg();
+    unsigned HazardDefLatency = 0;
+
+    auto IsOverlappedMFMAFn = [Reg, &IsMFMAFn, &HazardDefLatency, this]
+                              (MachineInstr *MI) {
+      if (!IsMFMAFn(MI))
+        return false;
+      unsigned DstReg = MI->getOperand(0).getReg();
+      if (DstReg == Reg)
+        return false;
+      HazardDefLatency = std::max(HazardDefLatency,
+                                  TSchedModel.computeInstrLatency(MI));
+      return TRI.regsOverlap(DstReg, Reg);
+    };
+
+    int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
+                                                   MaxWaitStates);
+    int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
+    int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
+    int OpNo = MI->getOperandNo(&Op);
+    if (OpNo == SrcCIdx) {
+      NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
+    } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32) {
+      switch (HazardDefLatency) {
+      case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
+               break;
+      case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
+               break;
+      case 16: LLVM_FALLTHROUGH;
+      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
+               break;
+      }
+    } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32) {
+      switch (HazardDefLatency) {
+      case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
+               break;
+      case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
+               break;
+      case 16: LLVM_FALLTHROUGH;
+      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
+               break;
+      }
+    }
+
+    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
+    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+
+    if (WaitStatesNeeded == MaxWaitStates)
+      return WaitStatesNeeded; // Early exit.
+
+    auto IsAccVgprWriteFn = [Reg, this] (MachineInstr *MI) {
+      if (MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32)
+        return false;
+      unsigned DstReg = MI->getOperand(0).getReg();
+      return TRI.regsOverlap(Reg, DstReg);
+    };
+
+    const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
+    const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
+    const int AccVGPRWriteAccVgprReadWaitStates = 3;
+    NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
+    if (OpNo == SrcCIdx)
+      NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
+    else if (Opc == AMDGPU::V_ACCVGPR_READ_B32)
+      NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
+
+    WaitStatesNeededForUse = NeedWaitStates -
+      getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
+    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+
+    if (WaitStatesNeeded == MaxWaitStates)
+      return WaitStatesNeeded; // Early exit.
+  }
+
+  if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32) {
+    const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
+    const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
+    const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
+    const int MaxWaitStates = 13;
+    unsigned DstReg = MI->getOperand(0).getReg();
+    unsigned HazardDefLatency = 0;
+
+    auto IsSrcCMFMAFn = [DstReg, &IsMFMAFn, &HazardDefLatency, this]
+                         (MachineInstr *MI) {
+      if (!IsMFMAFn(MI))
+        return false;
+      unsigned Reg = TII.getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
+      HazardDefLatency = std::max(HazardDefLatency,
+                                  TSchedModel.computeInstrLatency(MI));
+      return TRI.regsOverlap(Reg, DstReg);
+    };
+
+    int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
+    int NeedWaitStates;
+    switch (HazardDefLatency) {
+    case 2:  NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
+             break;
+    case 8:  NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
+             break;
+    case 16: LLVM_FALLTHROUGH;
+    default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
+             break;
+    }
+
+    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
+    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+  }
+
+  return WaitStatesNeeded;
+}
+
+int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
+  if (!ST.hasMAIInsts())
+    return 0;
+
+  int WaitStatesNeeded = 0;
+
+  auto IsAccVgprReadFn = [] (MachineInstr *MI) {
+    return MI->getOpcode() == AMDGPU::V_ACCVGPR_READ_B32;
+  };
+
+  for (const MachineOperand &Op : MI->explicit_uses()) {
+    if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
+      continue;
+
+    unsigned Reg = Op.getReg();
+
+    const int AccVgprReadLdStWaitStates = 2;
+    const int VALUWriteAccVgprReadLdStDepVALUWaitStates = 1;
+    const int MaxWaitStates = 2;
+
+    int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
+      getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
+    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+
+    if (WaitStatesNeeded == MaxWaitStates)
+      return WaitStatesNeeded; // Early exit.
+
+    auto IsVALUAccVgprReadCheckFn = [Reg, this] (MachineInstr *MI) {
+      if (MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32)
+        return false;
+      auto IsVALUFn = [] (MachineInstr *MI) {
+        return SIInstrInfo::isVALU(*MI) && !SIInstrInfo::isMAI(*MI);
+      };
+      return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
+             std::numeric_limits<int>::max();
+    };
+
+    WaitStatesNeededForUse = VALUWriteAccVgprReadLdStDepVALUWaitStates -
+      getWaitStatesSince(IsVALUAccVgprReadCheckFn, MaxWaitStates);
+    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+  }
+
+  return WaitStatesNeeded;
+}
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
index cf914b3..6aa2e70 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -16,6 +16,7 @@
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/CodeGen/TargetSchedule.h"
 #include <list>
 
 namespace llvm {
@@ -46,6 +47,7 @@
   const GCNSubtarget &ST;
   const SIInstrInfo &TII;
   const SIRegisterInfo &TRI;
+  TargetSchedModel TSchedModel;
 
   /// RegUnits of uses in the current soft memory clause.
   BitVector ClauseUses;
@@ -92,6 +94,9 @@
   bool fixVcmpxExecWARHazard(MachineInstr *MI);
   bool fixLdsBranchVmemWARHazard(MachineInstr *MI);
 
+  int checkMAIHazards(MachineInstr *MI);
+  int checkMAILdStHazards(MachineInstr *MI);
+
 public:
   GCNHazardRecognizer(const MachineFunction &MF);
   // We can only issue one instruction per cycle.
diff --git a/llvm/test/CodeGen/AMDGPU/mai-hazards.mir b/llvm/test/CodeGen/AMDGPU/mai-hazards.mir
new file mode 100644
index 0000000..9f49f9f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/mai-hazards.mir
@@ -0,0 +1,457 @@
+# RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GCN %s
+
+# GCN-LABEL: name: valu_write_vgpr_mfma_read
+# GCN:      V_MOV_B32
+# GCN:      V_MOV_B32
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: V_MFMA
+name:            valu_write_vgpr_mfma_read
+body:             |
+  bb.0:
+    $vgpr0 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $exec
+...
+---
+
+# GCN-LABEL: name: valu_write_vgpr_accvgpr_write_read
+# GCN:      V_MOV_B32
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: V_ACCVGPR_WRITE_B32
+name:            valu_write_vgpr_accvgpr_write_read
+body:             |
+  bb.0:
+    $vgpr0 = V_MOV_B32_e32 1, implicit $exec
+    $agpr0 = V_ACCVGPR_WRITE_B32 killed $vgpr0, implicit $exec
+...
+---
+
+# GCN-LABEL: name: mfma_write_agpr_mfma_read_same_agpr
+# GCN:      V_MFMA
+# GCN-NEXT: V_MFMA
+name:            mfma_write_agpr_mfma_read_same_agpr
+body:             |
+  bb.0:
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $exec
+...
+---
+
+# GCN-LABEL: name: mfma_write_agpr_mfma_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: V_MFMA
+name:            mfma_write_agpr_mfma_read_overlap
+body:             |
+  bb.0:
+    $agpr1_agpr2_agpr3_agpr4 = V_MFMA_F32_4X4X1F32 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $exec
+...
+---
+
+# GCN-LABEL: name: mfma_write_agpr_mfma_read_partial
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: V_MFMA
+name:            mfma_write_agpr_mfma_read_partial
+body:             |
+  bb.0:
+    $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $exec
+...
+---
+
+# GCN-LABEL: name: mfma_write_agpr_mfma_srca_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: V_MFMA
+name:            mfma_write_agpr_mfma_srca_read_overlap
+body:             |
+  bb.0:
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32 killed $agpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $exec
+...
+---
+
+# GCN-LABEL: name: mfma_write_agpr_mfma_srcb_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: V_MFMA
+name:            mfma_write_agpr_mfma_srcb_read_overlap
+body:             |
+  bb.0:
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32 killed $vgpr1, killed $agpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $exec
+...
+---
+
+# GCN-LABEL: name: mfma_4x4_write_agpr_accvgpr_read
+# GCN:      V_MFMA_F32_4X4X1F32
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: V_ACCVGPR_READ_B32
+name:            mfma_4x4_write_agpr_accvgpr_read
+body:             |
+  bb.0:
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $exec
+    $vgpr0 = V_ACCVGPR_READ_B32 killed $agpr0, implicit $exec
+...
+---
+
+# GCN-LABEL: name: mfma_16x16_write_agpr_accvgpr_read
+# GCN:      V_MFMA_F32_16X16X1F32
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: V_ACCVGPR_READ_B32
+name:            mfma_16x16_write_agpr_accvgpr_read
+body:             |
+  bb.0:
+    $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $exec
+    $vgpr0 = V_ACCVGPR_READ_B32 killed $agpr0, implicit $exec
+...
+---
+
+# GCN-LABEL: name: mfma_32x32_write_agpr_accvgpr_read
+# GCN:      V_MFMA_F32_32X32X2F32
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: V_ACCVGPR_READ_B32
+name:            mfma_32x32_write_agpr_accvgpr_read
+body:             |
+  bb.0:
+    $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $exec
+    $vgpr0 = V_ACCVGPR_READ_B32 killed $agpr0, implicit $exec
+...
+---
+
+# GCN-LABEL: name: mfma_4x4_write_agpr_accvgpr_write
+# GCN:      V_MFMA_F32_4X4X1F32
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: V_ACCVGPR_WRITE_B32
+name:            mfma_4x4_write_agpr_accvgpr_write
+body:             |
+  bb.0:
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $exec
+    $agpr0 = V_ACCVGPR_WRITE_B32 killed $vgpr0, implicit $exec
+...
+---
+
+# GCN-LABEL: name: mfma_16x16_write_agpr_accvgpr_write
+# GCN:      V_MFMA_F32_16X16X1F32
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: V_ACCVGPR_WRITE_B32
+name:            mfma_16x16_write_agpr_accvgpr_write
+body:             |
+  bb.0:
+    $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $exec
+    $agpr0 = V_ACCVGPR_WRITE_B32 killed $vgpr0, implicit $exec
+...
+---
+
+# GCN-LABEL: name: mfma_32x32_write_agpr_accvgpr_write
+# GCN:      V_MFMA_F32_32X32X2F32
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: V_ACCVGPR_WRITE_B32
+name:            mfma_32x32_write_agpr_accvgpr_write
+body:             |
+  bb.0:
+    $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $exec
+    $agpr0 = V_ACCVGPR_WRITE_B32 killed $vgpr0, implicit $exec
+...
+---
+
+# GCN-LABEL: name: mfma_4x4_read_srcc_accvgpr_write
+# GCN:      V_MFMA_F32_4X4X1F32
+# GCN-NEXT: V_ACCVGPR_WRITE_B32
+name:            mfma_4x4_read_srcc_accvgpr_write
+body:             |
+  bb.0:
+    $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $exec
+    $agpr0 = V_ACCVGPR_WRITE_B32 killed $vgpr2, implicit $exec
+...
+---
+
+# GCN-LABEL: name: mfma_16x16_read_srcc_accvgpr_write
+# GCN:      V_MFMA_F32_16X16X1F32
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: V_ACCVGPR_WRITE_B32
+name:            mfma_16x16_read_srcc_accvgpr_write
+body:             |
+  bb.0:
+    $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = V_MFMA_F32_16X16X1F32 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $exec
+    $agpr0 = V_ACCVGPR_WRITE_B32 killed $vgpr2, implicit $exec
+...
+---
+
+# GCN-LABEL: name: mfma_32x32_read_srcc_accvgpr_write
+# GCN:      V_MFMA_F32_32X32X2F32
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: V_ACCVGPR_WRITE_B32
+name:            mfma_32x32_read_srcc_accvgpr_write
+body:             |
+  bb.0:
+    $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = V_MFMA_F32_32X32X2F32 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $exec
+    $agpr0 = V_ACCVGPR_WRITE_B32 killed $vgpr2, implicit $exec
+...
+---
+
+# GCN-LABEL: name: accvgpr_read_write_vgpr_valu_read
+# GCN:      V_ACCVGPR_READ_B32
+# GCN-NEXT: V_ADD_F32
+name:            accvgpr_read_write_vgpr_valu_read
+body:             |
+  bb.0:
+    $vgpr0 = V_ACCVGPR_READ_B32 killed $agpr4, implicit $exec
+    $vgpr1 = V_ADD_F32_e32 0, killed $vgpr0, implicit $exec
+...
+---
+
+# GCN-LABEL: name: accvgpr_read_write_vgpr_mfma_read
+# GCN:      V_ACCVGPR_READ_B32
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: V_MFMA
+name:            accvgpr_read_write_vgpr_mfma_read
+body:             |
+  bb.0:
+    $vgpr0 = V_ACCVGPR_READ_B32 killed $agpr4, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32 killed $vgpr0, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $exec
+...
+---
+
+# GCN-LABEL: name: accvgpr_read_write_vgpr_accvgpr_write_read
+# GCN:      V_ACCVGPR_READ_B32
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: V_ACCVGPR_WRITE_B32
+name:            accvgpr_read_write_vgpr_accvgpr_write_read
+body:             |
+  bb.0:
+    $vgpr0 = V_ACCVGPR_READ_B32 killed $agpr1, implicit $exec
+    $agpr0 = V_ACCVGPR_WRITE_B32 killed $vgpr0, implicit $exec
+...
+---
+
+# GCN-LABEL: name: accvgpr_write_agpr_mfma_read_srcc
+# GCN:      V_ACCVGPR_WRITE_B32
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: V_MFMA
+name:            accvgpr_write_agpr_mfma_read_srcc
+body:             |
+  bb.0:
+    $agpr0 = V_ACCVGPR_WRITE_B32 killed $vgpr0, implicit $exec
+    $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32 killed $vgpr1, killed $vgpr2, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $exec
+...
+---
+
+# GCN-LABEL: name: accvgpr_write_agpr_mfma_read_srca
+# GCN:      V_ACCVGPR_WRITE_B32
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: V_MFMA
+name:            accvgpr_write_agpr_mfma_read_srca
+body:             |
+  bb.0:
+    $agpr8 = V_ACCVGPR_WRITE_B32 killed $vgpr0, implicit $exec
+    $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32 killed $agpr8, killed $vgpr1, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $exec
+...
+---
+
+# GCN-LABEL: name: accvgpr_write_agpr_mfma_read_srcb
+# GCN:      V_ACCVGPR_WRITE_B32
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: V_MFMA
+name:            accvgpr_write_agpr_mfma_read_srcb
+body:             |
+  bb.0:
+    $agpr8 = V_ACCVGPR_WRITE_B32 killed $vgpr0, implicit $exec
+    $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32 killed $vgpr1, killed $agpr8, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $exec
+...
+---
+
+# GCN-LABEL: name: accvgpr_write_agpr_accvgpr_read
+# GCN:      V_ACCVGPR_WRITE_B32
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: V_ACCVGPR_READ_B32
+name:            accvgpr_write_agpr_accvgpr_read
+body:             |
+  bb.0:
+    $agpr0 = V_ACCVGPR_WRITE_B32 killed $vgpr0, implicit $exec
+    $vgpr1 = V_ACCVGPR_READ_B32 killed $agpr0, implicit $exec
+...
+---
+
+# GCN-LABEL: name: vcmpx_write_exec_mfma
+# GCN:      V_CMPX_EQ_I32_e32
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: V_MFMA
+name:            vcmpx_write_exec_mfma
+body:             |
+  bb.0:
+    implicit $exec, implicit $vcc = V_CMPX_EQ_I32_e32 $vgpr0, $vgpr1, implicit $exec
+    $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32 killed $agpr8, killed $vgpr1, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $exec
+...
+---
+
+# GCN-LABEL: name: vcmpx_write_exec_accvgpr_write
+# GCN:      V_CMPX_EQ_I32_e32
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: V_ACCVGPR_WRITE_B32
+name:            vcmpx_write_exec_accvgpr_write
+body:             |
+  bb.0:
+    implicit $exec, implicit $vcc = V_CMPX_EQ_I32_e32 $vgpr0, $vgpr1, implicit $exec
+    $agpr0 = V_ACCVGPR_WRITE_B32 killed $vgpr2, implicit $exec
+...
+---
+
+# GCN-LABEL: name: accvgpr_read_write_vgpr_load
+# GCN:      V_ACCVGPR_READ_B32
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: FLAT_LOAD_DWORD
+name:            accvgpr_read_write_vgpr_load
+body:             |
+  bb.0:
+    $vgpr0 = V_ACCVGPR_READ_B32 killed $agpr0, implicit $exec
+    $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+...
+---
+
+# GCN-LABEL: name: accvgpr_read_write_vgpr_store
+# GCN:      V_ACCVGPR_READ_B32
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: DS_WRITE_B32
+name:            accvgpr_read_write_vgpr_store
+body:             |
+  bb.0:
+    $vgpr0 = V_ACCVGPR_READ_B32 killed $agpr0, implicit $exec
+    DS_WRITE_B32 $vgpr0, $vgpr1, 0, 0, implicit $m0, implicit $exec
+...
+---
+
+# GCN-LABEL: name: valu_write_vgpr_accvgpr_read_load_no_dependency
+# GCN:      V_MOV_B32
+# GCN-NEXT: V_ACCVGPR_READ_B32
+# GCN-NEXT: FLAT_LOAD_DWORD
+name:            valu_write_vgpr_accvgpr_read_load_no_dependency
+body:             |
+  bb.0:
+    $vgpr0 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr1 = V_ACCVGPR_READ_B32 killed $agpr0, implicit $exec
+    $vgpr4 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+...
+---
+
+# GCN-LABEL: name: valu_write_vgpr_accvgpr_read_load_1_and_3_depend
+# GCN:      V_MOV_B32
+# GCN-NEXT: V_ACCVGPR_READ_B32
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: FLAT_LOAD_DWORD
+name:            valu_write_vgpr_accvgpr_read_load_1_and_3_depend
+body:             |
+  bb.0:
+    $vgpr0 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr2 = V_ACCVGPR_READ_B32 killed $agpr0, implicit $exec
+    $vgpr4 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+...
+---
+
+# GCN-LABEL: name: valu_write_vgpr_accvgpr_read_load_2_and_3_depend
+# GCN:      V_MOV_B32
+# GCN-NEXT: V_ACCVGPR_READ_B32
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: FLAT_LOAD_DWORD
+name:            valu_write_vgpr_accvgpr_read_load_2_and_3_depend
+body:             |
+  bb.0:
+    $vgpr0 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr2 = V_ACCVGPR_READ_B32 killed $agpr0, implicit $exec
+    $vgpr4 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+...
+---