[X86][AVX] Decode constant bits from insert_subvector(c1, c2, c3)

This mostly arises when SimplifyDemandedVectorElts reduces a constant vector to insert_subvector(undef, c1, 0).
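
As a rough illustration of the merge the new INSERT_SUBVECTOR path performs, here is a minimal standalone C++ sketch (ConstVec and insertSubvector are hypothetical stand-ins, not the real APInt/getTargetConstantBitsFromNode machinery): the decoded subvector elements simply overwrite the decoded base elements starting at the insertion index, and the per-element undef mask is patched the same way.

  // Minimal standalone sketch; ConstVec/insertSubvector are hypothetical
  // stand-ins for the APInt-based bookkeeping in getTargetConstantBitsFromNode.
  #include <cstdint>
  #include <iostream>
  #include <string>
  #include <vector>

  struct ConstVec {
    std::vector<uint64_t> Elts; // per-element constant bits
    std::vector<bool> Undef;    // per-element undef flag
  };

  // Loosely mirrors the INSERT_SUBVECTOR handling: the subvector constants
  // overwrite the base constants starting at BaseIdx, as does the undef mask.
  static ConstVec insertSubvector(ConstVec Base, const ConstVec &Sub,
                                  unsigned BaseIdx) {
    for (unsigned i = 0, e = Sub.Elts.size(); i != e; ++i) {
      Base.Elts[BaseIdx + i] = Sub.Elts[i];
      Base.Undef[BaseIdx + i] = Sub.Undef[i];
    }
    return Base;
  }

  int main() {
    // insert_subvector(<u,2,1,3>, <0,2>, 0) decodes to the fully defined
    // constant <0,2,1,3>.
    ConstVec Base{{0, 2, 1, 3}, {true, false, false, false}};
    ConstVec Sub{{0, 2}, {false, false}};
    ConstVec Res = insertSubvector(Base, Sub, /*BaseIdx=*/0);
    for (size_t i = 0, e = Res.Elts.size(); i != e; ++i)
      std::cout << (Res.Undef[i] ? std::string("u") : std::to_string(Res.Elts[i]))
                << (i + 1 != e ? "," : "\n");
    return 0;
  }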

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@363499 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 16ddc7a..7863496 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -5967,6 +5967,29 @@
     return CastBitData(UndefSrcElts, SrcEltBits);
   }
 
+  // Insert constant bits from base and sub-vector sources.
+  if (Op.getOpcode() == ISD::INSERT_SUBVECTOR &&
+      isa<ConstantSDNode>(Op.getOperand(2))) {
+    // TODO - support insert_subvector through bitcasts.
+    if (EltSizeInBits != VT.getScalarSizeInBits())
+      return false;
+
+    APInt UndefSubElts;
+    SmallVector<APInt, 32> EltSubBits;
+    if (getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
+                                      UndefSubElts, EltSubBits,
+                                      AllowWholeUndefs, AllowPartialUndefs) &&
+        getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
+                                      UndefElts, EltBits, AllowWholeUndefs,
+                                      AllowPartialUndefs)) {
+      unsigned BaseIdx = Op.getConstantOperandVal(2);
+      UndefElts.insertBits(UndefSubElts, BaseIdx);
+      for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
+        EltBits[BaseIdx + i] = EltSubBits[i];
+      return true;
+    }
+  }
+
   // Extract constant bits from a subvector's source.
   if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
       isa<ConstantSDNode>(Op.getOperand(1))) {
diff --git a/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
index 57e333d..685efde 100644
--- a/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
+++ b/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
@@ -1805,10 +1805,8 @@
 define <4 x i32> @test_16xi32_to_4xi32_perm_mask9(<16 x i32> %vec) {
 ; CHECK-LABEL: test_16xi32_to_4xi32_perm_mask9:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,12,3]
-; CHECK-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,1,0,2]
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT:    vpermd %ymm3, %ymm1, %ymm1
+; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [4,1,12,2]
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
 ; CHECK-NEXT:    vpermt2d %ymm0, %ymm2, %ymm1
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
 ; CHECK-NEXT:    vzeroupper
diff --git a/test/CodeGen/X86/vector-shuffle-combining-avx.ll b/test/CodeGen/X86/vector-shuffle-combining-avx.ll
index f8dffbc..45c2abc 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-avx.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-avx.ll
@@ -207,23 +207,10 @@
   ret <8 x float> %1
 }
 define <8 x float> @demandedelts_vpermilvar_8f32_movsldup(<8 x float> %a0, i32 %a1) {
-; AVX1-LABEL: demandedelts_vpermilvar_8f32_movsldup:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm1 = <u,0,2,2,4,4,6,6>
-; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX1-NEXT:    vpermilps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,1,2,3,4,5,6,7]
-; AVX1-NEXT:    ret{{[l|q]}}
-;
-; AVX2-LABEL: demandedelts_vpermilvar_8f32_movsldup:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
-; AVX2-NEXT:    ret{{[l|q]}}
-;
-; AVX512-LABEL: demandedelts_vpermilvar_8f32_movsldup:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
-; AVX512-NEXT:    ret{{[l|q]}}
+; CHECK-LABEL: demandedelts_vpermilvar_8f32_movsldup:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
+; CHECK-NEXT:    ret{{[l|q]}}
   %1 = insertelement <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>, i32 %a1, i32 0
   %2 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> %1)
   %3 = shufflevector <8 x float> %2, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
diff --git a/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll b/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
index d796108..7fc295e 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
@@ -933,10 +933,7 @@
 ;
 ; X64-LABEL: combine_vpermi2var_8f64_as_permpd:
 ; X64:       # %bb.0:
-; X64-NEXT:    vmovapd {{.*#+}} zmm2 = <u,2,1,3,4,6,5,7>
-; X64-NEXT:    vinsertf32x4 $0, {{.*}}(%rip), %zmm2, %zmm2
-; X64-NEXT:    vpermi2pd %zmm1, %zmm0, %zmm2
-; X64-NEXT:    vpermpd {{.*#+}} zmm0 = zmm2[2,3,1,1,6,7,5,5]
+; X64-NEXT:    vpermpd {{.*#+}} zmm0 = zmm0[1,3,2,2,5,7,6,6]
 ; X64-NEXT:    retq
   %res0 = insertelement <8 x i64> <i64 0, i64 2, i64 1, i64 3, i64 4, i64 6, i64 5, i64 7>, i64 %a2, i32 0
   %res1 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %res0, <8 x double> %x1, i8 -1)
diff --git a/test/CodeGen/X86/vector-shuffle-combining-xop.ll b/test/CodeGen/X86/vector-shuffle-combining-xop.ll
index e785530..73a8cbe 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-xop.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-xop.ll
@@ -155,10 +155,7 @@
 ;
 ; X64-LABEL: demandedelts_vpermil2pd256_as_shufpd:
 ; X64:       # %bb.0:
-; X64-NEXT:    vmovapd {{.*#+}} xmm2 = <u,4,2,7>
-; X64-NEXT:    vblendpd {{.*#+}} ymm2 = ymm2[0,1],mem[2,3]
-; X64-NEXT:    vpermil2pd $0, %ymm2, %ymm1, %ymm0, %ymm0
-; X64-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,1,2,3]
+; X64-NEXT:    vpermil2pd {{.*#+}} ymm0 = ymm1[0,0],ymm0[3],ymm1[3]
 ; X64-NEXT:    retq
   %res0 = insertelement <4 x i64> <i64 0, i64 4, i64 2, i64 7>, i64 %a2, i32 0
   %res1 = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %a1, <4 x i64> %res0, i8 0)