[InstCombine] fold shuffles of insert_subvectors

This should be a valid exception to the general rule of not creating new shuffle masks in IR...
because we already do it. :)
Also, DAG combining/legalization will undo this by widening the shuffle back out if needed.

Explanation for how we already do this: SLP or vector source can create chains of insert/extract
as shown in 1 of the examples from PR16739:
https://godbolt.org/z/NlK7rA
https://bugs.llvm.org/show_bug.cgi?id=16739

And we expect instcombine or DAGCombine to clean that up by creating relatively simple shuffles.

Differential Revision: https://reviews.llvm.org/D62024

llvm-svn: 361338
diff --git a/llvm/docs/CodeGenerator.rst b/llvm/docs/CodeGenerator.rst
index e7f96e8..4753acd 100644
--- a/llvm/docs/CodeGenerator.rst
+++ b/llvm/docs/CodeGenerator.rst
@@ -912,6 +912,31 @@
 (and which of the above three actions to take) by calling the
 ``setOperationAction`` method in its ``TargetLowering`` constructor.
 
+If a target has legal vector types, it is expected to produce efficient machine
+code for common forms of the shufflevector IR instruction using those types.
+This may require custom legalization for SelectionDAG vector operations that
+are created from the shufflevector IR. The shufflevector forms that should be
+handled include:
+
+* Vector select --- Each element of the vector is chosen from either of the
+corresponding elements of the 2 input vectors. This operation may also be
+known as a "blend" or "bitwise select" in target assembly. This type of shuffle
+maps directly to the ``shuffle_vector`` SelectionDAG node.
+
+* Insert subvector --- A vector is placed into a longer vector type starting
+at index 0. This type of shuffle maps directly to the ``insert_subvector``
+SelectionDAG node with the ``index`` operand set to 0.
+
+* Extract subvector --- A vector is pulled from a longer vector type starting
+at index 0. This type of shuffle maps directly to the ``extract_subvector``
+SelectionDAG node with the ``index`` operand set to 0.
+
+* Splat --- All elements of the vector have identical scalar elements. This
+operation may also be known as a "broadcast" or "duplicate" in target assembly.
+The shufflevector IR instruction may change the vector length, so this operation
+may map to multiple SelectionDAG nodes including ``shuffle_vector``,
+``concat_vectors``, ``insert_subvector``, and ``extract_subvector``.
+
 Prior to the existence of the Legalize passes, we required that every target
 `selector`_ supported and handled every operator and type even if they are not
 natively supported.  The introduction of the Legalize phases allows all of the
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
index e76f41b..ecc4df1 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -1622,6 +1622,55 @@
   return nullptr;
 }
 
+static Instruction *foldIdentityPaddedShuffles(ShuffleVectorInst &Shuf) {
+  // Match the operands as identity with padding (also known as concatenation
+  // with undef) shuffles of the same source type. The backend is expected to
+  // recreate these concatenations from a shuffle of narrow operands.
+  auto *Shuffle0 = dyn_cast<ShuffleVectorInst>(Shuf.getOperand(0));
+  auto *Shuffle1 = dyn_cast<ShuffleVectorInst>(Shuf.getOperand(1));
+  if (!Shuffle0 || !Shuffle0->isIdentityWithPadding() ||
+      !Shuffle1 || !Shuffle1->isIdentityWithPadding())
+    return nullptr;
+
+  // We limit this transform to power-of-2 types because we expect that the
+  // backend can convert the simplified IR patterns to identical nodes as the
+  // original IR.
+  // TODO: If we can verify that behavior for arbitrary types, the power-of-2
+  // checks can be removed.
+  Value *X = Shuffle0->getOperand(0);
+  Value *Y = Shuffle1->getOperand(0);
+  if (X->getType() != Y->getType() ||
+      !isPowerOf2_32(Shuf.getType()->getVectorNumElements()) ||
+      !isPowerOf2_32(Shuffle0->getType()->getVectorNumElements()) ||
+      !isPowerOf2_32(X->getType()->getVectorNumElements()) ||
+      isa<UndefValue>(X) || isa<UndefValue>(Y))
+    return nullptr;
+  assert(isa<UndefValue>(Shuffle0->getOperand(1)) &&
+         isa<UndefValue>(Shuffle1->getOperand(1)) &&
+         "Unexpected operand for identity shuffle");
+
+  // This is a shuffle of 2 widening shuffles. We can shuffle the narrow source
+  // operands directly by adjusting the shuffle mask to account for the narrower
+  // types:
+  // shuf (widen X), (widen Y), Mask --> shuf X, Y, Mask'
+  int NarrowElts = X->getType()->getVectorNumElements();
+  int WideElts = Shuffle0->getType()->getVectorNumElements();
+  assert(WideElts > NarrowElts && "Unexpected types for identity with padding");
+
+  Type *I32Ty = IntegerType::getInt32Ty(Shuf.getContext());
+  SmallVector<int, 16> Mask = Shuf.getShuffleMask();
+  SmallVector<Constant *, 16> NewMask(Mask.size(), UndefValue::get(I32Ty));
+  for (int i = 0, e = Mask.size(); i != e; ++i) {
+    if (Mask[i] == -1)
+      continue;
+    if (Mask[i] < WideElts)
+      NewMask[i] = ConstantInt::get(I32Ty, Mask[i]);
+    else
+      NewMask[i] = ConstantInt::get(I32Ty, Mask[i] - (WideElts - NarrowElts));
+  }
+  return new ShuffleVectorInst(X, Y, ConstantVector::get(NewMask));
+}
+
 Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
   Value *LHS = SVI.getOperand(0);
   Value *RHS = SVI.getOperand(1);
@@ -1676,10 +1725,12 @@
   if (Instruction *I = foldIdentityExtractShuffle(SVI))
     return I;
 
-  // This transform has the potential to lose undef knowledge, so it is
+  // These transforms have the potential to lose undef knowledge, so they are
   // intentionally placed after SimplifyDemandedVectorElts().
   if (Instruction *I = foldShuffleWithInsert(SVI))
     return I;
+  if (Instruction *I = foldIdentityPaddedShuffles(SVI))
+    return I;
 
   if (VWidth == LHSWidth) {
     // Analyze the shuffle, are the LHS or RHS and identity shuffles?
diff --git a/llvm/test/Transforms/InstCombine/vec_shuffle.ll b/llvm/test/Transforms/InstCombine/vec_shuffle.ll
index 821c082..3dcd7d7 100644
--- a/llvm/test/Transforms/InstCombine/vec_shuffle.ll
+++ b/llvm/test/Transforms/InstCombine/vec_shuffle.ll
@@ -1140,6 +1140,8 @@
   ret <2 x i1> %r
 }
 
+; Negative test - do not transform non-power-of-2 unless we know the backend handles these sequences identically.
+
 define <7 x i8> @insert_subvector_shuffles(<3 x i8> %x, <3 x i8> %y) {
 ; CHECK-LABEL: @insert_subvector_shuffles(
 ; CHECK-NEXT:    [[S1:%.*]] = shufflevector <3 x i8> [[X:%.*]], <3 x i8> undef, <7 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -1155,9 +1157,7 @@
 
 define <8 x i8> @insert_subvector_shuffles_pow2elts(<2 x i8> %x, <2 x i8> %y) {
 ; CHECK-LABEL: @insert_subvector_shuffles_pow2elts(
-; CHECK-NEXT:    [[S1:%.*]] = shufflevector <2 x i8> [[X:%.*]], <2 x i8> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[S2:%.*]] = shufflevector <2 x i8> [[Y:%.*]], <2 x i8> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[S3:%.*]] = shufflevector <8 x i8> [[S1]], <8 x i8> [[S2]], <8 x i32> <i32 0, i32 8, i32 1, i32 undef, i32 8, i32 1, i32 9, i32 0>
+; CHECK-NEXT:    [[S3:%.*]] = shufflevector <2 x i8> [[X:%.*]], <2 x i8> [[Y:%.*]], <8 x i32> <i32 0, i32 2, i32 1, i32 undef, i32 2, i32 1, i32 3, i32 0>
 ; CHECK-NEXT:    ret <8 x i8> [[S3]]
 ;
   %s1 = shufflevector <2 x i8> %x, <2 x i8> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -1167,6 +1167,7 @@
 }
 
 ; The last shuffle may change the vector type.
+; Negative test - do not transform non-power-of-2 unless we know the backend handles these sequences identically.
 
 define <2 x i8> @insert_subvector_shuffles_narrowing(<3 x i8> %x, <3 x i8> %y) {
 ; CHECK-LABEL: @insert_subvector_shuffles_narrowing(
@@ -1183,9 +1184,7 @@
 
 define <2 x i8> @insert_subvector_shuffles_narrowing_pow2elts(<4 x i8> %x, <4 x i8> %y) {
 ; CHECK-LABEL: @insert_subvector_shuffles_narrowing_pow2elts(
-; CHECK-NEXT:    [[S1:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[S2:%.*]] = shufflevector <4 x i8> [[Y:%.*]], <4 x i8> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[S3:%.*]] = shufflevector <8 x i8> [[S1]], <8 x i8> [[S2]], <2 x i32> <i32 0, i32 8>
+; CHECK-NEXT:    [[S3:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> [[Y:%.*]], <2 x i32> <i32 0, i32 4>
 ; CHECK-NEXT:    ret <2 x i8> [[S3]]
 ;
   %s1 = shufflevector <4 x i8> %x, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -1198,9 +1197,7 @@
 
 define <4 x double> @insert_subvector_shuffles_identity(<2 x double> %x) {
 ; CHECK-LABEL: @insert_subvector_shuffles_identity(
-; CHECK-NEXT:    [[S1:%.*]] = shufflevector <2 x double> [[X:%.*]], <2 x double> undef, <4 x i32> <i32 undef, i32 1, i32 undef, i32 undef>
-; CHECK-NEXT:    [[S2:%.*]] = shufflevector <2 x double> [[X]], <2 x double> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[S3:%.*]] = shufflevector <4 x double> [[S2]], <4 x double> [[S1]], <4 x i32> <i32 0, i32 5, i32 undef, i32 undef>
+; CHECK-NEXT:    [[S3:%.*]] = shufflevector <2 x double> [[X:%.*]], <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
 ; CHECK-NEXT:    ret <4 x double> [[S3]]
 ;
   %s1 = shufflevector <2 x double> %x, <2 x double> undef, <4 x i32> <i32 undef, i32 1, i32 undef, i32 undef>