[GlobalISel][AArch64] Improve register bank mappings for G_SELECT

The fcsel and csel instructions differ in only the register banks they work on.

So, they're entirely interchangeable otherwise.

With this in mind, this does two things:

- Teach AArch64RegisterBankInfo to consider the inputs to G_SELECT as well as
  the outputs.
- Teach it to choose the best register bank mapping based off the constraints
  of the inputs and outputs.

The "best" in this case means the one that requires the smallest number of
copies to properly emit a fcsel/csel.

For example, if the inputs are all already going to be on FPRs, we should
emit a fcsel, even if the output is a GPR. This costs one copy to produce the
result, but saves us from copying the inputs into GPRs.

Also update the regbank-select.mir to check that we end up with the right
select instruction.

Differential Revision: https://reviews.llvm.org/D62267

llvm-svn: 361665
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
index 7fdcde44..6993436 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
@@ -679,15 +679,58 @@
     // If the destination is FPR, preserve that.
     if (OpRegBankIdx[0] != PMI_FirstGPR)
       break;
+
+    // If we're taking in vectors, we have no choice but to put everything on
+    // FPRs.
     LLT SrcTy = MRI.getType(MI.getOperand(2).getReg());
-    if (SrcTy.isVector() ||
-        any_of(MRI.use_instructions(MI.getOperand(0).getReg()),
-               [&](MachineInstr &MI) { return HasFPConstraints(MI); })) {
-      // Set the register bank of every operand to FPR.
-      for (unsigned Idx = 0, NumOperands = MI.getNumOperands();
-           Idx < NumOperands; ++Idx)
+    if (SrcTy.isVector()) {
+      for (unsigned Idx = 0; Idx < 4; ++Idx)
         OpRegBankIdx[Idx] = PMI_FirstFPR;
+      break;
     }
+
+    // Try to minimize the number of copies. If we have more floating point
+    // constrained values than not, then we'll put everything on FPR. Otherwise,
+    // everything has to be on GPR.
+    unsigned NumFP = 0;
+
+    // Check if the uses of the result always produce floating point values.
+    //
+    // For example:
+    //
+    // %z = G_SELECT %cond %x %y
+    // fpr = G_FOO %z ...
+    if (any_of(MRI.use_instructions(MI.getOperand(0).getReg()),
+               [&](MachineInstr &MI) { return HasFPConstraints(MI); }))
+      ++NumFP;
+
+    // Check if the defs of the source values always produce floating point
+    // values.
+    //
+    // For example:
+    //
+    // %x = G_SOMETHING_ALWAYS_FLOAT %a ...
+    // %z = G_SELECT %cond %x %y
+    //
+    // Also check whether or not the sources have already been decided to be
+    // FPR. Keep track of this.
+    //
+    // This doesn't check the condition, since it's just whatever is in NZCV.
+    // This isn't passed explicitly in a register to fcsel/csel.
+    for (unsigned Idx = 2; Idx < 4; ++Idx) {
+      unsigned VReg = MI.getOperand(Idx).getReg();
+      MachineInstr *DefMI = MRI.getVRegDef(VReg);
+      if (getRegBank(VReg, MRI, TRI) == &AArch64::FPRRegBank ||
+          HasFPConstraints(*DefMI))
+        ++NumFP;
+    }
+
+    // If we have more FP constraints than not, then move everything over to
+    // FPR.
+    if (NumFP >= 2)
+      for (unsigned Idx = 0; Idx < 4; ++Idx)
+        OpRegBankIdx[Idx] = PMI_FirstFPR;
+
     break;
   }
   case TargetOpcode::G_UNMERGE_VALUES: {
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/regbank-select.mir b/llvm/test/CodeGen/AArch64/GlobalISel/regbank-select.mir
index 97b5434..99c6916 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/regbank-select.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/regbank-select.mir
@@ -58,3 +58,133 @@
     %4:_(s64) = G_SELECT %0(s1), %1, %2
     $d0 = COPY %4(s64)
     RET_ReallyLR implicit $d0
+
+...
+---
+name:            two_fpr_inputs_gpr_output
+alignment:       2
+legalized:       true
+tracksRegLiveness: true
+machineFunctionInfo: {}
+body:             |
+  bb.0:
+    liveins: $d0, $d1, $w0
+    ; CHECK-LABEL: name: two_fpr_inputs_gpr_output
+    ; CHECK: liveins: $d0, $d1, $w0
+    ; CHECK: [[COPY:%[0-9]+]]:gpr(s32) = COPY $w0
+    ; CHECK: [[TRUNC:%[0-9]+]]:gpr(s1) = G_TRUNC [[COPY]](s32)
+    ; CHECK: [[COPY1:%[0-9]+]]:fpr(s64) = COPY $d0
+    ; CHECK: [[COPY2:%[0-9]+]]:fpr(s64) = COPY $d1
+    ; CHECK: [[COPY3:%[0-9]+]]:fpr(s1) = COPY [[TRUNC]](s1)
+    ; CHECK: [[SELECT:%[0-9]+]]:fpr(s64) = G_SELECT [[COPY3]](s1), [[COPY1]], [[COPY2]]
+    ; CHECK: $x0 = COPY [[SELECT]](s64)
+    ; CHECK: RET_ReallyLR implicit $x0
+
+    ; Verify that the G_SELECT only has FPRs.
+    ; The only difference between fcsel and csel are the register banks. So,
+    ; if we have two FPR inputs and a GPR output, we should do a floating point
+    ; select anyway. This will cost one copy for the output, but that's less
+    ; than doing two to put the inputs on GPRs.
+
+    %3:_(s32) = COPY $w0
+    %0:_(s1) = G_TRUNC %3(s32)
+    %1:_(s64) = COPY $d0
+    %2:_(s64) = COPY $d1
+    %4:_(s64) = G_SELECT %0(s1), %1, %2
+    $x0 = COPY %4(s64)
+    RET_ReallyLR implicit $x0
+
+...
+---
+name:            one_fpr_input_fpr_output
+alignment:       2
+legalized:       true
+tracksRegLiveness: true
+machineFunctionInfo: {}
+body:             |
+  bb.0:
+    liveins: $d0, $x1, $w0
+    ; CHECK-LABEL: name: one_fpr_input_fpr_output
+    ; CHECK: liveins: $d0, $x1, $w0
+    ; CHECK: [[COPY:%[0-9]+]]:gpr(s32) = COPY $w0
+    ; CHECK: [[TRUNC:%[0-9]+]]:gpr(s1) = G_TRUNC [[COPY]](s32)
+    ; CHECK: [[COPY1:%[0-9]+]]:fpr(s64) = COPY $d0
+    ; CHECK: [[COPY2:%[0-9]+]]:gpr(s64) = COPY $x1
+    ; CHECK: [[COPY3:%[0-9]+]]:fpr(s1) = COPY [[TRUNC]](s1)
+    ; CHECK: [[COPY4:%[0-9]+]]:fpr(s64) = COPY [[COPY2]](s64)
+    ; CHECK: [[SELECT:%[0-9]+]]:fpr(s64) = G_SELECT [[COPY3]](s1), [[COPY1]], [[COPY4]]
+    ; CHECK: $d0 = COPY [[SELECT]](s64)
+    ; CHECK: RET_ReallyLR implicit $d0
+
+    ; Same idea as the above test. If the output is an FPR, and one of the
+    ; inputs is an FPR, then it's fewer copies to just do a FCSEL.
+
+    %3:_(s32) = COPY $w0
+    %0:_(s1) = G_TRUNC %3(s32)
+    %1:_(s64) = COPY $d0
+    %2:_(s64) = COPY $x1
+    %4:_(s64) = G_SELECT %0(s1), %1, %2
+    $d0 = COPY %4(s64)
+    RET_ReallyLR implicit $d0
+
+...
+---
+name:            one_fpr_input_gpr_output
+alignment:       2
+legalized:       true
+tracksRegLiveness: true
+machineFunctionInfo: {}
+body:             |
+  bb.0:
+    liveins: $d0, $x1, $w0
+    ; CHECK-LABEL: name: one_fpr_input_gpr_output
+    ; CHECK: liveins: $d0, $x1, $w0
+    ; CHECK: [[COPY:%[0-9]+]]:gpr(s32) = COPY $w0
+    ; CHECK: [[TRUNC:%[0-9]+]]:gpr(s1) = G_TRUNC [[COPY]](s32)
+    ; CHECK: [[COPY1:%[0-9]+]]:fpr(s64) = COPY $d0
+    ; CHECK: [[COPY2:%[0-9]+]]:gpr(s64) = COPY $x1
+    ; CHECK: [[COPY3:%[0-9]+]]:gpr(s64) = COPY [[COPY1]](s64)
+    ; CHECK: [[SELECT:%[0-9]+]]:gpr(s64) = G_SELECT [[TRUNC]](s1), [[COPY3]], [[COPY2]]
+    ; CHECK: $x0 = COPY [[SELECT]](s64)
+    ; CHECK: RET_ReallyLR implicit $x0
+
+    ; Now we have more GPR registers on the G_SELECT. It's cheaper here to put
+    ; everything on GPR.
+
+    %3:_(s32) = COPY $w0
+    %0:_(s1) = G_TRUNC %3(s32)
+    %1:_(s64) = COPY $d0
+    %2:_(s64) = COPY $x1
+    %4:_(s64) = G_SELECT %0(s1), %1, %2
+    $x0 = COPY %4(s64)
+    RET_ReallyLR implicit $x0
+
+...
+---
+name:            two_gpr_input_fpr_output
+alignment:       2
+legalized:       true
+tracksRegLiveness: true
+machineFunctionInfo: {}
+body:             |
+  bb.0:
+    liveins: $x0, $x1, $w0
+    ; CHECK-LABEL: name: two_gpr_input_fpr_output
+    ; CHECK: liveins: $x0, $x1, $w0
+    ; CHECK: [[COPY:%[0-9]+]]:gpr(s32) = COPY $w0
+    ; CHECK: [[TRUNC:%[0-9]+]]:gpr(s1) = G_TRUNC [[COPY]](s32)
+    ; CHECK: [[COPY1:%[0-9]+]]:gpr(s64) = COPY $x0
+    ; CHECK: [[COPY2:%[0-9]+]]:gpr(s64) = COPY $x1
+    ; CHECK: [[SELECT:%[0-9]+]]:gpr(s64) = G_SELECT [[TRUNC]](s1), [[COPY1]], [[COPY2]]
+    ; CHECK: $d0 = COPY [[SELECT]](s64)
+    ; CHECK: RET_ReallyLR implicit $d0
+
+    ; Same as above. The G_SELECT should get all GPRS.
+
+    %3:_(s32) = COPY $w0
+    %0:_(s1) = G_TRUNC %3(s32)
+    %1:_(s64) = COPY $x0
+    %2:_(s64) = COPY $x1
+    %4:_(s64) = G_SELECT %0(s1), %1, %2
+    $d0 = COPY %4(s64)
+    RET_ReallyLR implicit $d0