blob: 2a1bf73bd3a55a52a943267ee59a70247e396286 [file] [log] [blame]
//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
// stores that can be put together into vector-stores. Next, it attempts to
// construct vectorizable tree using the use-def chains. If a profitable tree
// was found, the SLP vectorizer performs vectorization on the tree.
//
// The pass is inspired by the work described in the paper:
// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
//
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/iterator.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/NoFolder.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/DOTGraphTraits.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Vectorize.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iterator>
#include <memory>
#include <set>
#include <string>
#include <tuple>
#include <utility>
#include <vector>
using namespace llvm;
using namespace llvm::PatternMatch;
using namespace slpvectorizer;
#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"
STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
cl::opt<bool>
llvm::RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
cl::desc("Run the SLP vectorization passes"));
static cl::opt<int>
SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
cl::desc("Only vectorize if you gain more than this "
"number "));
static cl::opt<bool>
ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
cl::desc("Attempt to vectorize horizontal reductions"));
static cl::opt<bool> ShouldStartVectorizeHorAtStore(
"slp-vectorize-hor-store", cl::init(false), cl::Hidden,
cl::desc(
"Attempt to vectorize horizontal reductions feeding into a store"));
static cl::opt<int>
MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
cl::desc("Attempt to vectorize for this register size in bits"));
/// Limits the size of scheduling regions in a block.
/// It avoid long compile times for _very_ large blocks where vector
/// instructions are spread over a wide range.
/// This limit is way higher than needed by real-world functions.
static cl::opt<int>
ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
cl::desc("Limit the size of the SLP scheduling region per block"));
static cl::opt<int> MinVectorRegSizeOption(
"slp-min-reg-size", cl::init(128), cl::Hidden,
cl::desc("Attempt to vectorize for this register size in bits"));
static cl::opt<unsigned> RecursionMaxDepth(
"slp-recursion-max-depth", cl::init(12), cl::Hidden,
cl::desc("Limit the recursion depth when building a vectorizable tree"));
static cl::opt<unsigned> MinTreeSize(
"slp-min-tree-size", cl::init(3), cl::Hidden,
cl::desc("Only vectorize small trees if they are fully vectorizable"));
static cl::opt<bool>
ViewSLPTree("view-slp-tree", cl::Hidden,
cl::desc("Display the SLP trees with Graphviz"));
// Limit the number of alias checks. The limit is chosen so that
// it has no negative effect on the llvm benchmarks.
static const unsigned AliasedCheckLimit = 10;
// Another limit for the alias checks: The maximum distance between load/store
// instructions where alias checks are done.
// This limit is useful for very large basic blocks.
static const unsigned MaxMemDepDistance = 160;
/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
/// regions to be handled.
static const int MinScheduleRegionSize = 16;
/// Predicate for the element types that the SLP vectorizer supports.
///
/// The most important thing to filter here are types which are invalid in LLVM
/// vectors. We also filter target specific types which have absolutely no
/// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
/// avoids spending time checking the cost model and realizing that they will
/// be inevitably scalarized.
static bool isValidElementType(Type *Ty) {
return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
!Ty->isPPC_FP128Ty();
}
/// \returns true if all of the instructions in \p VL are in the same block or
/// false otherwise.
static bool allSameBlock(ArrayRef<Value *> VL) {
Instruction *I0 = dyn_cast<Instruction>(VL[0]);
if (!I0)
return false;
BasicBlock *BB = I0->getParent();
for (int i = 1, e = VL.size(); i < e; i++) {
Instruction *I = dyn_cast<Instruction>(VL[i]);
if (!I)
return false;
if (BB != I->getParent())
return false;
}
return true;
}
/// \returns True if all of the values in \p VL are constants.
static bool allConstant(ArrayRef<Value *> VL) {
for (Value *i : VL)
if (!isa<Constant>(i))
return false;
return true;
}
/// \returns True if all of the values in \p VL are identical.
static bool isSplat(ArrayRef<Value *> VL) {
for (unsigned i = 1, e = VL.size(); i < e; ++i)
if (VL[i] != VL[0])
return false;
return true;
}
/// \returns True if \p I is commutative, handles CmpInst as well as Instruction.
static bool isCommutative(Instruction *I) {
if (auto *IC = dyn_cast<CmpInst>(I))
return IC->isCommutative();
return I->isCommutative();
}
/// Checks if the vector of instructions can be represented as a shuffle, like:
/// %x0 = extractelement <4 x i8> %x, i32 0
/// %x3 = extractelement <4 x i8> %x, i32 3
/// %y1 = extractelement <4 x i8> %y, i32 1
/// %y2 = extractelement <4 x i8> %y, i32 2
/// %x0x0 = mul i8 %x0, %x0
/// %x3x3 = mul i8 %x3, %x3
/// %y1y1 = mul i8 %y1, %y1
/// %y2y2 = mul i8 %y2, %y2
/// %ins1 = insertelement <4 x i8> undef, i8 %x0x0, i32 0
/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
/// ret <4 x i8> %ins4
/// can be transformed into:
/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
/// i32 6>
/// %2 = mul <4 x i8> %1, %1
/// ret <4 x i8> %2
/// We convert this initially to something like:
/// %x0 = extractelement <4 x i8> %x, i32 0
/// %x3 = extractelement <4 x i8> %x, i32 3
/// %y1 = extractelement <4 x i8> %y, i32 1
/// %y2 = extractelement <4 x i8> %y, i32 2
/// %1 = insertelement <4 x i8> undef, i8 %x0, i32 0
/// %2 = insertelement <4 x i8> %1, i8 %x3, i32 1
/// %3 = insertelement <4 x i8> %2, i8 %y1, i32 2
/// %4 = insertelement <4 x i8> %3, i8 %y2, i32 3
/// %5 = mul <4 x i8> %4, %4
/// %6 = extractelement <4 x i8> %5, i32 0
/// %ins1 = insertelement <4 x i8> undef, i8 %6, i32 0
/// %7 = extractelement <4 x i8> %5, i32 1
/// %ins2 = insertelement <4 x i8> %ins1, i8 %7, i32 1
/// %8 = extractelement <4 x i8> %5, i32 2
/// %ins3 = insertelement <4 x i8> %ins2, i8 %8, i32 2
/// %9 = extractelement <4 x i8> %5, i32 3
/// %ins4 = insertelement <4 x i8> %ins3, i8 %9, i32 3
/// ret <4 x i8> %ins4
/// InstCombiner transforms this into a shuffle and vector mul
/// TODO: Can we split off and reuse the shuffle mask detection from
/// TargetTransformInfo::getInstructionThroughput?
static Optional<TargetTransformInfo::ShuffleKind>
isShuffle(ArrayRef<Value *> VL) {
auto *EI0 = cast<ExtractElementInst>(VL[0]);
unsigned Size = EI0->getVectorOperandType()->getVectorNumElements();
Value *Vec1 = nullptr;
Value *Vec2 = nullptr;
enum ShuffleMode { Unknown, Select, Permute };
ShuffleMode CommonShuffleMode = Unknown;
for (unsigned I = 0, E = VL.size(); I < E; ++I) {
auto *EI = cast<ExtractElementInst>(VL[I]);
auto *Vec = EI->getVectorOperand();
// All vector operands must have the same number of vector elements.
if (Vec->getType()->getVectorNumElements() != Size)
return None;
auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
if (!Idx)
return None;
// Undefined behavior if Idx is negative or >= Size.
if (Idx->getValue().uge(Size))
continue;
unsigned IntIdx = Idx->getValue().getZExtValue();
// We can extractelement from undef vector.
if (isa<UndefValue>(Vec))
continue;
// For correct shuffling we have to have at most 2 different vector operands
// in all extractelement instructions.
if (!Vec1 || Vec1 == Vec)
Vec1 = Vec;
else if (!Vec2 || Vec2 == Vec)
Vec2 = Vec;
else
return None;
if (CommonShuffleMode == Permute)
continue;
// If the extract index is not the same as the operation number, it is a
// permutation.
if (IntIdx != I) {
CommonShuffleMode = Permute;
continue;
}
CommonShuffleMode = Select;
}
// If we're not crossing lanes in different vectors, consider it as blending.
if (CommonShuffleMode == Select && Vec2)
return TargetTransformInfo::SK_Select;
// If Vec2 was never used, we have a permutation of a single vector, otherwise
// we have permutation of 2 vectors.
return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
: TargetTransformInfo::SK_PermuteSingleSrc;
}
namespace {
/// Main data required for vectorization of instructions.
struct InstructionsState {
/// The very first instruction in the list with the main opcode.
Value *OpValue = nullptr;
/// The main/alternate instruction.
Instruction *MainOp = nullptr;
Instruction *AltOp = nullptr;
/// The main/alternate opcodes for the list of instructions.
unsigned getOpcode() const {
return MainOp ? MainOp->getOpcode() : 0;
}
unsigned getAltOpcode() const {
return AltOp ? AltOp->getOpcode() : 0;
}
/// Some of the instructions in the list have alternate opcodes.
bool isAltShuffle() const { return getOpcode() != getAltOpcode(); }
bool isOpcodeOrAlt(Instruction *I) const {
unsigned CheckedOpcode = I->getOpcode();
return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
}
InstructionsState() = delete;
InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp)
: OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
};
} // end anonymous namespace
/// Chooses the correct key for scheduling data. If \p Op has the same (or
/// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p
/// OpValue.
static Value *isOneOf(const InstructionsState &S, Value *Op) {
auto *I = dyn_cast<Instruction>(Op);
if (I && S.isOpcodeOrAlt(I))
return Op;
return S.OpValue;
}
/// \returns analysis of the Instructions in \p VL described in
/// InstructionsState, the Opcode that we suppose the whole list
/// could be vectorized even if its structure is diverse.
static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
unsigned BaseIndex = 0) {
// Make sure these are all Instructions.
if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); }))
return InstructionsState(VL[BaseIndex], nullptr, nullptr);
bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
unsigned AltOpcode = Opcode;
unsigned AltIndex = BaseIndex;
// Check for one alternate opcode from another BinaryOperator.
// TODO - generalize to support all operators (types, calls etc.).
for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
unsigned InstOpcode = cast<Instruction>(VL[Cnt])->getOpcode();
if (IsBinOp && isa<BinaryOperator>(VL[Cnt])) {
if (InstOpcode == Opcode || InstOpcode == AltOpcode)
continue;
if (Opcode == AltOpcode) {
AltOpcode = InstOpcode;
AltIndex = Cnt;
continue;
}
} else if (IsCastOp && isa<CastInst>(VL[Cnt])) {
Type *Ty0 = cast<Instruction>(VL[BaseIndex])->getOperand(0)->getType();
Type *Ty1 = cast<Instruction>(VL[Cnt])->getOperand(0)->getType();
if (Ty0 == Ty1) {
if (InstOpcode == Opcode || InstOpcode == AltOpcode)
continue;
if (Opcode == AltOpcode) {
AltOpcode = InstOpcode;
AltIndex = Cnt;
continue;
}
}
} else if (InstOpcode == Opcode || InstOpcode == AltOpcode)
continue;
return InstructionsState(VL[BaseIndex], nullptr, nullptr);
}
return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),
cast<Instruction>(VL[AltIndex]));
}
/// \returns true if all of the values in \p VL have the same type or false
/// otherwise.
static bool allSameType(ArrayRef<Value *> VL) {
Type *Ty = VL[0]->getType();
for (int i = 1, e = VL.size(); i < e; i++)
if (VL[i]->getType() != Ty)
return false;
return true;
}
/// \returns True if Extract{Value,Element} instruction extracts element Idx.
static Optional<unsigned> getExtractIndex(Instruction *E) {
unsigned Opcode = E->getOpcode();
assert((Opcode == Instruction::ExtractElement ||
Opcode == Instruction::ExtractValue) &&
"Expected extractelement or extractvalue instruction.");
if (Opcode == Instruction::ExtractElement) {
auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
if (!CI)
return None;
return CI->getZExtValue();
}
ExtractValueInst *EI = cast<ExtractValueInst>(E);
if (EI->getNumIndices() != 1)
return None;
return *EI->idx_begin();
}
/// \returns True if in-tree use also needs extract. This refers to
/// possible scalar operand in vectorized instruction.
static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
TargetLibraryInfo *TLI) {
unsigned Opcode = UserInst->getOpcode();
switch (Opcode) {
case Instruction::Load: {
LoadInst *LI = cast<LoadInst>(UserInst);
return (LI->getPointerOperand() == Scalar);
}
case Instruction::Store: {
StoreInst *SI = cast<StoreInst>(UserInst);
return (SI->getPointerOperand() == Scalar);
}
case Instruction::Call: {
CallInst *CI = cast<CallInst>(UserInst);
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
if (hasVectorInstrinsicScalarOpd(ID, i))
return (CI->getArgOperand(i) == Scalar);
}
LLVM_FALLTHROUGH;
}
default:
return false;
}
}
/// \returns the AA location that is being access by the instruction.
static MemoryLocation getLocation(Instruction *I, AliasAnalysis *AA) {
if (StoreInst *SI = dyn_cast<StoreInst>(I))
return MemoryLocation::get(SI);
if (LoadInst *LI = dyn_cast<LoadInst>(I))
return MemoryLocation::get(LI);
return MemoryLocation();
}
/// \returns True if the instruction is not a volatile or atomic load/store.
static bool isSimple(Instruction *I) {
if (LoadInst *LI = dyn_cast<LoadInst>(I))
return LI->isSimple();
if (StoreInst *SI = dyn_cast<StoreInst>(I))
return SI->isSimple();
if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
return !MI->isVolatile();
return true;
}
namespace llvm {
namespace slpvectorizer {
/// Bottom Up SLP Vectorizer.
class BoUpSLP {
public:
using ValueList = SmallVector<Value *, 8>;
using InstrList = SmallVector<Instruction *, 16>;
using ValueSet = SmallPtrSet<Value *, 16>;
using StoreList = SmallVector<StoreInst *, 8>;
using ExtraValueToDebugLocsMap =
MapVector<Value *, SmallVector<Instruction *, 2>>;
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li,
DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
const DataLayout *DL, OptimizationRemarkEmitter *ORE)
: F(Func), SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC),
DB(DB), DL(DL), ORE(ORE), Builder(Se->getContext()) {
CodeMetrics::collectEphemeralValues(F, AC, EphValues);
// Use the vector register size specified by the target unless overridden
// by a command-line option.
// TODO: It would be better to limit the vectorization factor based on
// data type rather than just register size. For example, x86 AVX has
// 256-bit registers, but it does not support integer operations
// at that width (that requires AVX2).
if (MaxVectorRegSizeOption.getNumOccurrences())
MaxVecRegSize = MaxVectorRegSizeOption;
else
MaxVecRegSize = TTI->getRegisterBitWidth(true);
if (MinVectorRegSizeOption.getNumOccurrences())
MinVecRegSize = MinVectorRegSizeOption;
else
MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
}
/// Vectorize the tree that starts with the elements in \p VL.
/// Returns the vectorized root.
Value *vectorizeTree();
/// Vectorize the tree but with the list of externally used values \p
/// ExternallyUsedValues. Values in this MapVector can be replaced but the
/// generated extractvalue instructions.
Value *vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues);
/// \returns the cost incurred by unwanted spills and fills, caused by
/// holding live values over call sites.
int getSpillCost() const;
/// \returns the vectorization cost of the subtree that starts at \p VL.
/// A negative number means that this is profitable.
int getTreeCost();
/// Construct a vectorizable tree that starts at \p Roots, ignoring users for
/// the purpose of scheduling and extraction in the \p UserIgnoreLst.
void buildTree(ArrayRef<Value *> Roots,
ArrayRef<Value *> UserIgnoreLst = None);
/// Construct a vectorizable tree that starts at \p Roots, ignoring users for
/// the purpose of scheduling and extraction in the \p UserIgnoreLst taking
/// into account (anf updating it, if required) list of externally used
/// values stored in \p ExternallyUsedValues.
void buildTree(ArrayRef<Value *> Roots,
ExtraValueToDebugLocsMap &ExternallyUsedValues,
ArrayRef<Value *> UserIgnoreLst = None);
/// Clear the internal data structures that are created by 'buildTree'.
void deleteTree() {
VectorizableTree.clear();
ScalarToTreeEntry.clear();
MustGather.clear();
ExternalUses.clear();
NumOpsWantToKeepOrder.clear();
NumOpsWantToKeepOriginalOrder = 0;
for (auto &Iter : BlocksSchedules) {
BlockScheduling *BS = Iter.second.get();
BS->clear();
}
MinBWs.clear();
}
unsigned getTreeSize() const { return VectorizableTree.size(); }
/// Perform LICM and CSE on the newly generated gather sequences.
void optimizeGatherSequence();
/// \returns The best order of instructions for vectorization.
Optional<ArrayRef<unsigned>> bestOrder() const {
auto I = std::max_element(
NumOpsWantToKeepOrder.begin(), NumOpsWantToKeepOrder.end(),
[](const decltype(NumOpsWantToKeepOrder)::value_type &D1,
const decltype(NumOpsWantToKeepOrder)::value_type &D2) {
return D1.second < D2.second;
});
if (I == NumOpsWantToKeepOrder.end() ||
I->getSecond() <= NumOpsWantToKeepOriginalOrder)
return None;
return makeArrayRef(I->getFirst());
}
/// \return The vector element size in bits to use when vectorizing the
/// expression tree ending at \p V. If V is a store, the size is the width of
/// the stored value. Otherwise, the size is the width of the largest loaded
/// value reaching V. This method is used by the vectorizer to calculate
/// vectorization factors.
unsigned getVectorElementSize(Value *V) const;
/// Compute the minimum type sizes required to represent the entries in a
/// vectorizable tree.
void computeMinimumValueSizes();
// \returns maximum vector register size as set by TTI or overridden by cl::opt.
unsigned getMaxVecRegSize() const {
return MaxVecRegSize;
}
// \returns minimum vector register size as set by cl::opt.
unsigned getMinVecRegSize() const {
return MinVecRegSize;
}
/// Check if ArrayType or StructType is isomorphic to some VectorType.
///
/// \returns number of elements in vector if isomorphism exists, 0 otherwise.
unsigned canMapToVector(Type *T, const DataLayout &DL) const;
/// \returns True if the VectorizableTree is both tiny and not fully
/// vectorizable. We do not vectorize such trees.
bool isTreeTinyAndNotFullyVectorizable() const;
OptimizationRemarkEmitter *getORE() { return ORE; }
/// This structure holds any data we need about the edges being traversed
/// during buildTree_rec(). We keep track of:
/// (i) the user TreeEntry index, and
/// (ii) the index of the edge.
struct EdgeInfo {
EdgeInfo() = default;
/// The index of the user TreeEntry in VectorizableTree.
int Idx = -1;
/// The operand index of the use.
unsigned EdgeIdx = UINT_MAX;
#ifndef NDEBUG
friend inline raw_ostream &operator<<(raw_ostream &OS,
const BoUpSLP::EdgeInfo &EI) {
EI.dump(OS);
return OS;
}
/// Debug print.
void dump(raw_ostream &OS) const {
OS << "{User:" << Idx << " EdgeIdx:" << EdgeIdx << "}";
}
LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
#endif
};
/// A helper data structure to hold the operands of a vector of instructions.
/// This supports a fixed vector length for all operand vectors.
class VLOperands {
/// For each operand we need (i) the value, and (ii) the opcode that it
/// would be attached to if the expression was in a left-linearized form.
/// This is required to avoid illegal operand reordering.
/// For example:
/// \verbatim
/// 0 Op1
/// |/
/// Op1 Op2 Linearized + Op2
/// \ / ----------> |/
/// - -
///
/// Op1 - Op2 (0 + Op1) - Op2
/// \endverbatim
///
/// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
///
/// Another way to think of this is to track all the operations across the
/// path from the operand all the way to the root of the tree and to
/// calculate the operation that corresponds to this path. For example, the
/// path from Op2 to the root crosses the RHS of the '-', therefore the
/// corresponding operation is a '-' (which matches the one in the
/// linearized tree, as shown above).
///
/// For lack of a better term, we refer to this operation as Accumulated
/// Path Operation (APO).
struct OperandData {
OperandData() = default;
OperandData(Value *V, bool APO, bool IsUsed)
: V(V), APO(APO), IsUsed(IsUsed) {}
/// The operand value.
Value *V = nullptr;
/// TreeEntries only allow a single opcode, or an alternate sequence of
/// them (e.g, +, -). Therefore, we can safely use a boolean value for the
/// APO. It is set to 'true' if 'V' is attached to an inverse operation
/// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
/// (e.g., Add/Mul)
bool APO = false;
/// Helper data for the reordering function.
bool IsUsed = false;
};
/// During operand reordering, we are trying to select the operand at lane
/// that matches best with the operand at the neighboring lane. Our
/// selection is based on the type of value we are looking for. For example,
/// if the neighboring lane has a load, we need to look for a load that is
/// accessing a consecutive address. These strategies are summarized in the
/// 'ReorderingMode' enumerator.
enum class ReorderingMode {
Load, ///< Matching loads to consecutive memory addresses
Opcode, ///< Matching instructions based on opcode (same or alternate)
Constant, ///< Matching constants
Splat, ///< Matching the same instruction multiple times (broadcast)
Failed, ///< We failed to create a vectorizable group
};
using OperandDataVec = SmallVector<OperandData, 2>;
/// A vector of operand vectors.
SmallVector<OperandDataVec, 4> OpsVec;
const DataLayout &DL;
ScalarEvolution &SE;
/// \returns the operand data at \p OpIdx and \p Lane.
OperandData &getData(unsigned OpIdx, unsigned Lane) {
return OpsVec[OpIdx][Lane];
}
/// \returns the operand data at \p OpIdx and \p Lane. Const version.
const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
return OpsVec[OpIdx][Lane];
}
/// Clears the used flag for all entries.
void clearUsed() {
for (unsigned OpIdx = 0, NumOperands = getNumOperands();
OpIdx != NumOperands; ++OpIdx)
for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
++Lane)
OpsVec[OpIdx][Lane].IsUsed = false;
}
/// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
}
// Search all operands in Ops[*][Lane] for the one that matches best
// Ops[OpIdx][LastLane] and return its opreand index.
// If no good match can be found, return None.
Optional<unsigned>
getBestOperand(unsigned OpIdx, int Lane, int LastLane,
ArrayRef<ReorderingMode> ReorderingModes) {
unsigned NumOperands = getNumOperands();
// The operand of the previous lane at OpIdx.
Value *OpLastLane = getData(OpIdx, LastLane).V;
// Our strategy mode for OpIdx.
ReorderingMode RMode = ReorderingModes[OpIdx];
// The linearized opcode of the operand at OpIdx, Lane.
bool OpIdxAPO = getData(OpIdx, Lane).APO;
const unsigned BestScore = 2;
const unsigned GoodScore = 1;
// The best operand index and its score.
// Sometimes we have more than one option (e.g., Opcode and Undefs), so we
// are using the score to differentiate between the two.
struct BestOpData {
Optional<unsigned> Idx = None;
unsigned Score = 0;
} BestOp;
// Iterate through all unused operands and look for the best.
for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
// Get the operand at Idx and Lane.
OperandData &OpData = getData(Idx, Lane);
Value *Op = OpData.V;
bool OpAPO = OpData.APO;
// Skip already selected operands.
if (OpData.IsUsed)
continue;
// Skip if we are trying to move the operand to a position with a
// different opcode in the linearized tree form. This would break the
// semantics.
if (OpAPO != OpIdxAPO)
continue;
// Look for an operand that matches the current mode.
switch (RMode) {
case ReorderingMode::Load:
if (isa<LoadInst>(Op)) {
// Figure out which is left and right, so that we can check for
// consecutive loads
bool LeftToRight = Lane > LastLane;
Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
Value *OpRight = (LeftToRight) ? Op : OpLastLane;
if (isConsecutiveAccess(cast<LoadInst>(OpLeft),
cast<LoadInst>(OpRight), DL, SE))
BestOp.Idx = Idx;
}
break;
case ReorderingMode::Opcode:
// We accept both Instructions and Undefs, but with different scores.
if ((isa<Instruction>(Op) && isa<Instruction>(OpLastLane) &&
cast<Instruction>(Op)->getOpcode() ==
cast<Instruction>(OpLastLane)->getOpcode()) ||
(isa<UndefValue>(OpLastLane) && isa<Instruction>(Op)) ||
isa<UndefValue>(Op)) {
// An instruction has a higher score than an undef.
unsigned Score = (isa<UndefValue>(Op)) ? GoodScore : BestScore;
if (Score > BestOp.Score) {
BestOp.Idx = Idx;
BestOp.Score = Score;
}
}
break;
case ReorderingMode::Constant:
if (isa<Constant>(Op)) {
unsigned Score = (isa<UndefValue>(Op)) ? GoodScore : BestScore;
if (Score > BestOp.Score) {
BestOp.Idx = Idx;
BestOp.Score = Score;
}
}
break;
case ReorderingMode::Splat:
if (Op == OpLastLane)
BestOp.Idx = Idx;
break;
case ReorderingMode::Failed:
return None;
}
}
if (BestOp.Idx) {
getData(BestOp.Idx.getValue(), Lane).IsUsed = true;
return BestOp.Idx;
}
// If we could not find a good match return None.
return None;
}
/// Helper for reorderOperandVecs. \Returns the lane that we should start
/// reordering from. This is the one which has the least number of operands
/// that can freely move about.
unsigned getBestLaneToStartReordering() const {
unsigned BestLane = 0;
unsigned Min = UINT_MAX;
for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
++Lane) {
unsigned NumFreeOps = getMaxNumOperandsThatCanBeReordered(Lane);
if (NumFreeOps < Min) {
Min = NumFreeOps;
BestLane = Lane;
}
}
return BestLane;
}
/// \Returns the maximum number of operands that are allowed to be reordered
/// for \p Lane. This is used as a heuristic for selecting the first lane to
/// start operand reordering.
unsigned getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
unsigned CntTrue = 0;
unsigned NumOperands = getNumOperands();
// Operands with the same APO can be reordered. We therefore need to count
// how many of them we have for each APO, like this: Cnt[APO] = x.
// Since we only have two APOs, namely true and false, we can avoid using
// a map. Instead we can simply count the number of operands that
// correspond to one of them (in this case the 'true' APO), and calculate
// the other by subtracting it from the total number of operands.
for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx)
if (getData(OpIdx, Lane).APO)
++CntTrue;
unsigned CntFalse = NumOperands - CntTrue;
return std::max(CntTrue, CntFalse);
}
/// Go through the instructions in VL and append their operands.
void appendOperandsOfVL(ArrayRef<Value *> VL) {
assert(!VL.empty() && "Bad VL");
assert((empty() || VL.size() == getNumLanes()) &&
"Expected same number of lanes");
assert(isa<Instruction>(VL[0]) && "Expected instruction");
unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();
OpsVec.resize(NumOperands);
unsigned NumLanes = VL.size();
for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
OpsVec[OpIdx].resize(NumLanes);
for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
assert(isa<Instruction>(VL[Lane]) && "Expected instruction");
// Our tree has just 3 nodes: the root and two operands.
// It is therefore trivial to get the APO. We only need to check the
// opcode of VL[Lane] and whether the operand at OpIdx is the LHS or
// RHS operand. The LHS operand of both add and sub is never attached
// to an inversese operation in the linearized form, therefore its APO
// is false. The RHS is true only if VL[Lane] is an inverse operation.
// Since operand reordering is performed on groups of commutative
// operations or alternating sequences (e.g., +, -), we can safely
// tell the inverse operations by checking commutativity.
bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
bool APO = (OpIdx == 0) ? false : IsInverseOperation;
OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
APO, false};
}
}
}
/// \returns the number of operands.
unsigned getNumOperands() const { return OpsVec.size(); }
/// \returns the number of lanes.
unsigned getNumLanes() const { return OpsVec[0].size(); }
/// \returns the operand value at \p OpIdx and \p Lane.
Value *getValue(unsigned OpIdx, unsigned Lane) const {
return getData(OpIdx, Lane).V;
}
/// \returns true if the data structure is empty.
bool empty() const { return OpsVec.empty(); }
/// Clears the data.
void clear() { OpsVec.clear(); }
public:
/// Initialize with all the operands of the instruction vector \p RootVL.
VLOperands(ArrayRef<Value *> RootVL, const DataLayout &DL,
ScalarEvolution &SE)
: DL(DL), SE(SE) {
// Append all the operands of RootVL.
appendOperandsOfVL(RootVL);
}
/// \Returns a value vector with the operands across all lanes for the
/// opearnd at \p OpIdx.
ValueList getVL(unsigned OpIdx) const {
ValueList OpVL(OpsVec[OpIdx].size());
assert(OpsVec[OpIdx].size() == getNumLanes() &&
"Expected same num of lanes across all operands");
for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
OpVL[Lane] = OpsVec[OpIdx][Lane].V;
return OpVL;
}
// Performs operand reordering for 2 or more operands.
// The original operands are in OrigOps[OpIdx][Lane].
// The reordered operands are returned in 'SortedOps[OpIdx][Lane]'.
void reorder() {
unsigned NumOperands = getNumOperands();
unsigned NumLanes = getNumLanes();
// Each operand has its own mode. We are using this mode to help us select
// the instructions for each lane, so that they match best with the ones
// we have selected so far.
SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
// This is a greedy single-pass algorithm. We are going over each lane
// once and deciding on the best order right away with no back-tracking.
// However, in order to increase its effectiveness, we start with the lane
// that has operands that can move the least. For example, given the
// following lanes:
// Lane 0 : A[0] = B[0] + C[0] // Visited 3rd
// Lane 1 : A[1] = C[1] - B[1] // Visited 1st
// Lane 2 : A[2] = B[2] + C[2] // Visited 2nd
// Lane 3 : A[3] = C[3] - B[3] // Visited 4th
// we will start at Lane 1, since the operands of the subtraction cannot
// be reordered. Then we will visit the rest of the lanes in a circular
// fashion. That is, Lanes 2, then Lane 0, and finally Lane 3.
// Find the first lane that we will start our search from.
unsigned FirstLane = getBestLaneToStartReordering();
// Initialize the modes.
for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
Value *OpLane0 = getValue(OpIdx, FirstLane);
// Keep track if we have instructions with all the same opcode on one
// side.
if (isa<LoadInst>(OpLane0))
ReorderingModes[OpIdx] = ReorderingMode::Load;
else if (isa<Instruction>(OpLane0))
ReorderingModes[OpIdx] = ReorderingMode::Opcode;
else if (isa<Constant>(OpLane0))
ReorderingModes[OpIdx] = ReorderingMode::Constant;
else if (isa<Argument>(OpLane0))
// Our best hope is a Splat. It may save some cost in some cases.
ReorderingModes[OpIdx] = ReorderingMode::Splat;
else
// NOTE: This should be unreachable.
ReorderingModes[OpIdx] = ReorderingMode::Failed;
}
// If the initial strategy fails for any of the operand indexes, then we
// perform reordering again in a second pass. This helps avoid assigning
// high priority to the failed strategy, and should improve reordering for
// the non-failed operand indexes.
for (int Pass = 0; Pass != 2; ++Pass) {
// Skip the second pass if the first pass did not fail.
bool StrategyFailed = false;
// Mark the operand data as free to use for all but the first pass.
if (Pass > 0)
clearUsed();
// We keep the original operand order for the FirstLane, so reorder the
// rest of the lanes. We are visiting the nodes in a circular fashion,
// using FirstLane as the center point and increasing the radius
// distance.
for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
// Visit the lane on the right and then the lane on the left.
for (int Direction : {+1, -1}) {
int Lane = FirstLane + Direction * Distance;
if (Lane < 0 || Lane >= (int)NumLanes)
continue;
int LastLane = Lane - Direction;
assert(LastLane >= 0 && LastLane < (int)NumLanes &&
"Out of bounds");
// Look for a good match for each operand.
for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
// Search for the operand that matches SortedOps[OpIdx][Lane-1].
Optional<unsigned> BestIdx =
getBestOperand(OpIdx, Lane, LastLane, ReorderingModes);
// By not selecting a value, we allow the operands that follow to
// select a better matching value. We will get a non-null value in
// the next run of getBestOperand().
if (BestIdx) {
// Swap the current operand with the one returned by
// getBestOperand().
swap(OpIdx, BestIdx.getValue(), Lane);
} else {
// We failed to find a best operand, set mode to 'Failed'.
ReorderingModes[OpIdx] = ReorderingMode::Failed;
// Enable the second pass.
StrategyFailed = true;
}
}
}
}
// Skip second pass if the strategy did not fail.
if (!StrategyFailed)
break;
}
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
switch (RMode) {
case ReorderingMode::Load:
return "Load";
case ReorderingMode::Opcode:
return "Opcode";
case ReorderingMode::Constant:
return "Constant";
case ReorderingMode::Splat:
return "Splat";
case ReorderingMode::Failed:
return "Failed";
}
llvm_unreachable("Unimplemented Reordering Type");
}
LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
raw_ostream &OS) {
return OS << getModeStr(RMode);
}
/// Debug print.
LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
printMode(RMode, dbgs());
}
friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
return printMode(RMode, OS);
}
LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
const unsigned Indent = 2;
unsigned Cnt = 0;
for (const OperandDataVec &OpDataVec : OpsVec) {
OS << "Operand " << Cnt++ << "\n";
for (const OperandData &OpData : OpDataVec) {
OS.indent(Indent) << "{";
if (Value *V = OpData.V)
OS << *V;
else
OS << "null";
OS << ", APO:" << OpData.APO << "}\n";
}
OS << "\n";
}
return OS;
}
/// Debug print.
LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
#endif
};
private:
struct TreeEntry;
/// Checks if all users of \p I are the part of the vectorization tree.
bool areAllUsersVectorized(Instruction *I) const;
/// \returns the cost of the vectorizable entry.
int getEntryCost(TreeEntry *E);
/// This is the recursive part of buildTree.
void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth, EdgeInfo EI);
/// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
/// be vectorized to use the original vector (or aggregate "bitcast" to a
/// vector) and sets \p CurrentOrder to the identity permutation; otherwise
/// returns false, setting \p CurrentOrder to either an empty vector or a
/// non-identity permutation that allows to reuse extract instructions.
bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
SmallVectorImpl<unsigned> &CurrentOrder) const;
/// Vectorize a single entry in the tree.
Value *vectorizeTree(TreeEntry *E);
/// Vectorize a single entry in the tree, starting in \p VL.
Value *vectorizeTree(ArrayRef<Value *> VL);
/// \returns the scalarization cost for this type. Scalarization in this
/// context means the creation of vectors from a group of scalars.
int getGatherCost(Type *Ty, const DenseSet<unsigned> &ShuffledIndices) const;
/// \returns the scalarization cost for this list of values. Assuming that
/// this subtree gets vectorized, we may need to extract the values from the
/// roots. This method calculates the cost of extracting the values.
int getGatherCost(ArrayRef<Value *> VL) const;
/// Set the Builder insert point to one after the last instruction in
/// the bundle
void setInsertPointAfterBundle(ArrayRef<Value *> VL,
const InstructionsState &S);
/// \returns a vector from a collection of scalars in \p VL.
Value *Gather(ArrayRef<Value *> VL, VectorType *Ty);
/// \returns whether the VectorizableTree is fully vectorizable and will
/// be beneficial even the tree height is tiny.
bool isFullyVectorizableTinyTree() const;
/// Reorder commutative or alt operands to get better probability of
/// generating vectorized code.
static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
SmallVectorImpl<Value *> &Left,
SmallVectorImpl<Value *> &Right,
const DataLayout &DL,
ScalarEvolution &SE);
struct TreeEntry {
using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
TreeEntry(VecTreeTy &Container) : Container(Container) {}
/// \returns true if the scalars in VL are equal to this entry.
bool isSame(ArrayRef<Value *> VL) const {
if (VL.size() == Scalars.size())
return std::equal(VL.begin(), VL.end(), Scalars.begin());
return VL.size() == ReuseShuffleIndices.size() &&
std::equal(
VL.begin(), VL.end(), ReuseShuffleIndices.begin(),
[this](Value *V, unsigned Idx) { return V == Scalars[Idx]; });
}
/// A vector of scalars.
ValueList Scalars;
/// The Scalars are vectorized into this value. It is initialized to Null.
Value *VectorizedValue = nullptr;
/// Do we need to gather this sequence ?
bool NeedToGather = false;
/// Does this sequence require some shuffling?
SmallVector<unsigned, 4> ReuseShuffleIndices;
/// Does this entry require reordering?
ArrayRef<unsigned> ReorderIndices;
/// Points back to the VectorizableTree.
///
/// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
/// to be a pointer and needs to be able to initialize the child iterator.
/// Thus we need a reference back to the container to translate the indices
/// to entries.
VecTreeTy &Container;
/// The TreeEntry index containing the user of this entry. We can actually
/// have multiple users so the data structure is not truly a tree.
SmallVector<EdgeInfo, 1> UserTreeIndices;
private:
/// The operands of each instruction in each lane Operands[op_index][lane].
/// Note: This helps avoid the replication of the code that performs the
/// reordering of operands during buildTree_rec() and vectorizeTree().
SmallVector<ValueList, 2> Operands;
public:
/// Set this bundle's \p OpIdx'th operand to \p OpVL.
void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL,
ArrayRef<unsigned> ReuseShuffleIndices) {
if (Operands.size() < OpIdx + 1)
Operands.resize(OpIdx + 1);
assert(Operands[OpIdx].size() == 0 && "Already resized?");
Operands[OpIdx].resize(Scalars.size());
for (unsigned Lane = 0, E = Scalars.size(); Lane != E; ++Lane)
Operands[OpIdx][Lane] = (!ReuseShuffleIndices.empty())
? OpVL[ReuseShuffleIndices[Lane]]
: OpVL[Lane];
}
/// If there is a user TreeEntry, then set its operand.
void trySetUserTEOperand(const EdgeInfo &UserTreeIdx,
ArrayRef<Value *> OpVL,
ArrayRef<unsigned> ReuseShuffleIndices) {
if (UserTreeIdx.Idx >= 0) {
auto &VectorizableTree = Container;
VectorizableTree[UserTreeIdx.Idx]->setOperand(UserTreeIdx.EdgeIdx, OpVL,
ReuseShuffleIndices);
}
}
/// \returns the \p OpIdx operand of this TreeEntry.
ValueList &getOperand(unsigned OpIdx) {
assert(OpIdx < Operands.size() && "Off bounds");
return Operands[OpIdx];
}
/// \return the single \p OpIdx operand.
Value *getSingleOperand(unsigned OpIdx) const {
assert(OpIdx < Operands.size() && "Off bounds");
assert(!Operands[OpIdx].empty() && "No operand availabe");
return Operands[OpIdx][0];
}
#ifndef NDEBUG
/// Debug printer.
LLVM_DUMP_METHOD void dump() const {
for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
dbgs() << "Operand " << OpI << ":\n";
for (const Value *V : Operands[OpI])
dbgs().indent(2) << *V << "\n";
}
dbgs() << "Scalars: \n";
for (Value *V : Scalars)
dbgs().indent(2) << *V << "\n";
dbgs() << "NeedToGather: " << NeedToGather << "\n";
dbgs() << "VectorizedValue: ";
if (VectorizedValue)
dbgs() << *VectorizedValue;
else
dbgs() << "NULL";
dbgs() << "\n";
dbgs() << "ReuseShuffleIndices: ";
if (ReuseShuffleIndices.empty())
dbgs() << "Emtpy";
else
for (unsigned Idx : ReuseShuffleIndices)
dbgs() << Idx << ", ";
dbgs() << "\n";
dbgs() << "ReorderIndices: ";
for (unsigned Idx : ReorderIndices)
dbgs() << Idx << ", ";
dbgs() << "\n";
dbgs() << "UserTreeIndices: ";
for (const auto &EInfo : UserTreeIndices)
dbgs() << EInfo << ", ";
dbgs() << "\n";
}
#endif
};
/// Create a new VectorizableTree entry.
TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized,
EdgeInfo &UserTreeIdx,
ArrayRef<unsigned> ReuseShuffleIndices = None,
ArrayRef<unsigned> ReorderIndices = None) {
VectorizableTree.push_back(llvm::make_unique<TreeEntry>(VectorizableTree));
TreeEntry *Last = VectorizableTree.back().get();
int idx = VectorizableTree.size() - 1;
Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
Last->NeedToGather = !Vectorized;
Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
ReuseShuffleIndices.end());
Last->ReorderIndices = ReorderIndices;
if (Vectorized) {
for (int i = 0, e = VL.size(); i != e; ++i) {
assert(!getTreeEntry(VL[i]) && "Scalar already in tree!");
ScalarToTreeEntry[VL[i]] = idx;
}
} else {
MustGather.insert(VL.begin(), VL.end());
}
if (UserTreeIdx.Idx >= 0)
Last->UserTreeIndices.push_back(UserTreeIdx);
Last->trySetUserTEOperand(UserTreeIdx, VL, ReuseShuffleIndices);
UserTreeIdx.Idx = idx;
return Last;
}
/// -- Vectorization State --
/// Holds all of the tree entries.
TreeEntry::VecTreeTy VectorizableTree;
#ifndef NDEBUG
/// Debug printer.
LLVM_DUMP_METHOD void dumpVectorizableTree() const {
for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
dbgs() << Id << ".\n";
VectorizableTree[Id]->dump();
dbgs() << "\n";
}
}
#endif
TreeEntry *getTreeEntry(Value *V) {
auto I = ScalarToTreeEntry.find(V);
if (I != ScalarToTreeEntry.end())
return VectorizableTree[I->second].get();
return nullptr;
}
const TreeEntry *getTreeEntry(Value *V) const {
auto I = ScalarToTreeEntry.find(V);
if (I != ScalarToTreeEntry.end())
return VectorizableTree[I->second].get();
return nullptr;
}
/// Maps a specific scalar to its tree entry.
SmallDenseMap<Value*, int> ScalarToTreeEntry;
/// A list of scalars that we found that we need to keep as scalars.
ValueSet MustGather;
/// This POD struct describes one external user in the vectorized tree.
struct ExternalUser {
ExternalUser(Value *S, llvm::User *U, int L)
: Scalar(S), User(U), Lane(L) {}
// Which scalar in our function.
Value *Scalar;
// Which user that uses the scalar.
llvm::User *User;
// Which lane does the scalar belong to.
int Lane;
};
using UserList = SmallVector<ExternalUser, 16>;
/// Checks if two instructions may access the same memory.
///
/// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
/// is invariant in the calling loop.
bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
Instruction *Inst2) {
// First check if the result is already in the cache.
AliasCacheKey key = std::make_pair(Inst1, Inst2);
Optional<bool> &result = AliasCache[key];
if (result.hasValue()) {
return result.getValue();
}
MemoryLocation Loc2 = getLocation(Inst2, AA);
bool aliased = true;
if (Loc1.Ptr && Loc2.Ptr && isSimple(Inst1) && isSimple(Inst2)) {
// Do the alias check.
aliased = AA->alias(Loc1, Loc2);
}
// Store the result in the cache.
result = aliased;
return aliased;
}
using AliasCacheKey = std::pair<Instruction *, Instruction *>;
/// Cache for alias results.
/// TODO: consider moving this to the AliasAnalysis itself.
DenseMap<AliasCacheKey, Optional<bool>> AliasCache;
/// Removes an instruction from its block and eventually deletes it.
/// It's like Instruction::eraseFromParent() except that the actual deletion
/// is delayed until BoUpSLP is destructed.
/// This is required to ensure that there are no incorrect collisions in the
/// AliasCache, which can happen if a new instruction is allocated at the
/// same address as a previously deleted instruction.
void eraseInstruction(Instruction *I) {
I->removeFromParent();
I->dropAllReferences();
DeletedInstructions.emplace_back(I);
}
/// Temporary store for deleted instructions. Instructions will be deleted
/// eventually when the BoUpSLP is destructed.
SmallVector<unique_value, 8> DeletedInstructions;
/// A list of values that need to extracted out of the tree.
/// This list holds pairs of (Internal Scalar : External User). External User
/// can be nullptr, it means that this Internal Scalar will be used later,
/// after vectorization.
UserList ExternalUses;
/// Values used only by @llvm.assume calls.
SmallPtrSet<const Value *, 32> EphValues;
/// Holds all of the instructions that we gathered.
SetVector<Instruction *> GatherSeq;
/// A list of blocks that we are going to CSE.
SetVector<BasicBlock *> CSEBlocks;
/// Contains all scheduling relevant data for an instruction.
/// A ScheduleData either represents a single instruction or a member of an
/// instruction bundle (= a group of instructions which is combined into a
/// vector instruction).
struct ScheduleData {
// The initial value for the dependency counters. It means that the
// dependencies are not calculated yet.
enum { InvalidDeps = -1 };
ScheduleData() = default;
void init(int BlockSchedulingRegionID, Value *OpVal) {
FirstInBundle = this;
NextInBundle = nullptr;
NextLoadStore = nullptr;
IsScheduled = false;
SchedulingRegionID = BlockSchedulingRegionID;
UnscheduledDepsInBundle = UnscheduledDeps;
clearDependencies();
OpValue = OpVal;
}
/// Returns true if the dependency information has been calculated.
bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
/// Returns true for single instructions and for bundle representatives
/// (= the head of a bundle).
bool isSchedulingEntity() const { return FirstInBundle == this; }
/// Returns true if it represents an instruction bundle and not only a
/// single instruction.
bool isPartOfBundle() const {
return NextInBundle != nullptr || FirstInBundle != this;
}
/// Returns true if it is ready for scheduling, i.e. it has no more
/// unscheduled depending instructions/bundles.
bool isReady() const {
assert(isSchedulingEntity() &&
"can't consider non-scheduling entity for ready list");
return UnscheduledDepsInBundle == 0 && !IsScheduled;
}
/// Modifies the number of unscheduled dependencies, also updating it for
/// the whole bundle.
int incrementUnscheduledDeps(int Incr) {
UnscheduledDeps += Incr;
return FirstInBundle->UnscheduledDepsInBundle += Incr;
}
/// Sets the number of unscheduled dependencies to the number of
/// dependencies.
void resetUnscheduledDeps() {
incrementUnscheduledDeps(Dependencies - UnscheduledDeps);
}
/// Clears all dependency information.
void clearDependencies() {
Dependencies = InvalidDeps;
resetUnscheduledDeps();
MemoryDependencies.clear();
}
void dump(raw_ostream &os) const {
if (!isSchedulingEntity()) {
os << "/ " << *Inst;
} else if (NextInBundle) {
os << '[' << *Inst;
ScheduleData *SD = NextInBundle;
while (SD) {
os << ';' << *SD->Inst;
SD = SD->NextInBundle;
}
os << ']';
} else {
os << *Inst;
}
}
Instruction *Inst = nullptr;
/// Points to the head in an instruction bundle (and always to this for
/// single instructions).
ScheduleData *FirstInBundle = nullptr;
/// Single linked list of all instructions in a bundle. Null if it is a
/// single instruction.
ScheduleData *NextInBundle = nullptr;
/// Single linked list of all memory instructions (e.g. load, store, call)
/// in the block - until the end of the scheduling region.
ScheduleData *NextLoadStore = nullptr;
/// The dependent memory instructions.
/// This list is derived on demand in calculateDependencies().
SmallVector<ScheduleData *, 4> MemoryDependencies;
/// This ScheduleData is in the current scheduling region if this matches
/// the current SchedulingRegionID of BlockScheduling.
int SchedulingRegionID = 0;
/// Used for getting a "good" final ordering of instructions.
int SchedulingPriority = 0;
/// The number of dependencies. Constitutes of the number of users of the
/// instruction plus the number of dependent memory instructions (if any).
/// This value is calculated on demand.
/// If InvalidDeps, the number of dependencies is not calculated yet.
int Dependencies = InvalidDeps;
/// The number of dependencies minus the number of dependencies of scheduled
/// instructions. As soon as this is zero, the instruction/bundle gets ready
/// for scheduling.
/// Note that this is negative as long as Dependencies is not calculated.
int UnscheduledDeps = InvalidDeps;
/// The sum of UnscheduledDeps in a bundle. Equals to UnscheduledDeps for
/// single instructions.
int UnscheduledDepsInBundle = InvalidDeps;
/// True if this instruction is scheduled (or considered as scheduled in the
/// dry-run).
bool IsScheduled = false;
/// Opcode of the current instruction in the schedule data.
Value *OpValue = nullptr;
};
#ifndef NDEBUG
friend inline raw_ostream &operator<<(raw_ostream &os,
const BoUpSLP::ScheduleData &SD) {
SD.dump(os);
return os;
}
#endif
friend struct GraphTraits<BoUpSLP *>;
friend struct DOTGraphTraits<BoUpSLP *>;
/// Contains all scheduling data for a basic block.
struct BlockScheduling {
BlockScheduling(BasicBlock *BB)
: BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
void clear() {
ReadyInsts.clear();
ScheduleStart = nullptr;
ScheduleEnd = nullptr;
FirstLoadStoreInRegion = nullptr;
LastLoadStoreInRegion = nullptr;
// Reduce the maximum schedule region size by the size of the
// previous scheduling run.
ScheduleRegionSizeLimit -= ScheduleRegionSize;
if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
ScheduleRegionSizeLimit = MinScheduleRegionSize;
ScheduleRegionSize = 0;
// Make a new scheduling region, i.e. all existing ScheduleData is not
// in the new region yet.
++SchedulingRegionID;
}
ScheduleData *getScheduleData(Value *V) {
ScheduleData *SD = ScheduleDataMap[V];
if (SD && SD->SchedulingRegionID == SchedulingRegionID)
return SD;
return nullptr;
}
ScheduleData *getScheduleData(Value *V, Value *Key) {
if (V == Key)
return getScheduleData(V);
auto I = ExtraScheduleDataMap.find(V);
if (I != ExtraScheduleDataMap.end()) {
ScheduleData *SD = I->second[Key];
if (SD && SD->SchedulingRegionID == SchedulingRegionID)
return SD;
}
return nullptr;
}
bool isInSchedulingRegion(ScheduleData *SD) {
return SD->SchedulingRegionID == SchedulingRegionID;
}
/// Marks an instruction as scheduled and puts all dependent ready
/// instructions into the ready-list.
template <typename ReadyListType>
void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
SD->IsScheduled = true;
LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
ScheduleData *BundleMember = SD;
while (BundleMember) {
if (BundleMember->Inst != BundleMember->OpValue) {
BundleMember = BundleMember->NextInBundle;
continue;
}
// Handle the def-use chain dependencies.
for (Use &U : BundleMember->Inst->operands()) {
auto *I = dyn_cast<Instruction>(U.get());
if (!I)
continue;
doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) {
if (OpDef && OpDef->hasValidDependencies() &&
OpDef->incrementUnscheduledDeps(-1) == 0) {
// There are no more unscheduled dependencies after
// decrementing, so we can put the dependent instruction
// into the ready list.
ScheduleData *DepBundle = OpDef->FirstInBundle;
assert(!DepBundle->IsScheduled &&
"already scheduled bundle gets ready");
ReadyList.insert(DepBundle);
LLVM_DEBUG(dbgs()
<< "SLP: gets ready (def): " << *DepBundle << "\n");
}
});
}
// Handle the memory dependencies.
for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
if (MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
// There are no more unscheduled dependencies after decrementing,
// so we can put the dependent instruction into the ready list.
ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
assert(!DepBundle->IsScheduled &&
"already scheduled bundle gets ready");
ReadyList.insert(DepBundle);
LLVM_DEBUG(dbgs()
<< "SLP: gets ready (mem): " << *DepBundle << "\n");
}
}
BundleMember = BundleMember->NextInBundle;
}
}
void doForAllOpcodes(Value *V,
function_ref<void(ScheduleData *SD)> Action) {
if (ScheduleData *SD = getScheduleData(V))
Action(SD);
auto I = ExtraScheduleDataMap.find(V);
if (I != ExtraScheduleDataMap.end())
for (auto &P : I->second)
if (P.second->SchedulingRegionID == SchedulingRegionID)
Action(P.second);
}
/// Put all instructions into the ReadyList which are ready for scheduling.
template <typename ReadyListType>
void initialFillReadyList(ReadyListType &ReadyList) {
for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
doForAllOpcodes(I, [&](ScheduleData *SD) {
if (SD->isSchedulingEntity() && SD->isReady()) {
ReadyList.insert(SD);
LLVM_DEBUG(dbgs()
<< "SLP: initially in ready list: " << *I << "\n");
}
});
}
}
/// Checks if a bundle of instructions can be scheduled, i.e. has no
/// cyclic dependencies. This is only a dry-run, no instructions are
/// actually moved at this stage.
bool tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
const InstructionsState &S);
/// Un-bundles a group of instructions.
void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
/// Allocates schedule data chunk.
ScheduleData *allocateScheduleDataChunks();
/// Extends the scheduling region so that V is inside the region.
/// \returns true if the region size is within the limit.
bool extendSchedulingRegion(Value *V, const InstructionsState &S);
/// Initialize the ScheduleData structures for new instructions in the
/// scheduling region.
void initScheduleData(Instruction *FromI, Instruction *ToI,
ScheduleData *PrevLoadStore,
ScheduleData *NextLoadStore);
/// Updates the dependency information of a bundle and of all instructions/
/// bundles which depend on the original bundle.
void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
BoUpSLP *SLP);
/// Sets all instruction in the scheduling region to un-scheduled.
void resetSchedule();
BasicBlock *BB;
/// Simple memory allocation for ScheduleData.
std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
/// The size of a ScheduleData array in ScheduleDataChunks.
int ChunkSize;
/// The allocator position in the current chunk, which is the last entry
/// of ScheduleDataChunks.
int ChunkPos;
/// Attaches ScheduleData to Instruction.
/// Note that the mapping survives during all vectorization iterations, i.e.
/// ScheduleData structures are recycled.
DenseMap<Value *, ScheduleData *> ScheduleDataMap;
/// Attaches ScheduleData to Instruction with the leading key.
DenseMap<Value *, SmallDenseMap<Value *, ScheduleData *>>
ExtraScheduleDataMap;
struct ReadyList : SmallVector<ScheduleData *, 8> {
void insert(ScheduleData *SD) { push_back(SD); }
};
/// The ready-list for scheduling (only used for the dry-run).
ReadyList ReadyInsts;
/// The first instruction of the scheduling region.
Instruction *ScheduleStart = nullptr;
/// The first instruction _after_ the scheduling region.
Instruction *ScheduleEnd = nullptr;
/// The first memory accessing instruction in the scheduling region
/// (can be null).
ScheduleData *FirstLoadStoreInRegion = nullptr;
/// The last memory accessing instruction in the scheduling region
/// (can be null).
ScheduleData *LastLoadStoreInRegion = nullptr;
/// The current size of the scheduling region.
int ScheduleRegionSize = 0;
/// The maximum size allowed for the scheduling region.
int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
/// The ID of the scheduling region. For a new vectorization iteration this
/// is incremented which "removes" all ScheduleData from the region.
// Make sure that the initial SchedulingRegionID is greater than the
// initial SchedulingRegionID in ScheduleData (which is 0).
int SchedulingRegionID = 1;
};
/// Attaches the BlockScheduling structures to basic blocks.
MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
/// Performs the "real" scheduling. Done before vectorization is actually
/// performed in a basic block.
void scheduleBlock(BlockScheduling *BS);
/// List of users to ignore during scheduling and that don't need extracting.
ArrayRef<Value *> UserIgnoreList;
using OrdersType = SmallVector<unsigned, 4>;
/// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
/// sorted SmallVectors of unsigned.
struct OrdersTypeDenseMapInfo {
static OrdersType getEmptyKey() {
OrdersType V;
V.push_back(~1U);
return V;
}
static OrdersType getTombstoneKey() {
OrdersType V;
V.push_back(~2U);
return V;
}
static unsigned getHashValue(const OrdersType &V) {
return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
}
static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
return LHS == RHS;
}
};
/// Contains orders of operations along with the number of bundles that have
/// operations in this order. It stores only those orders that require
/// reordering, if reordering is not required it is counted using \a
/// NumOpsWantToKeepOriginalOrder.
DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo> NumOpsWantToKeepOrder;
/// Number of bundles that do not require reordering.
unsigned NumOpsWantToKeepOriginalOrder = 0;
// Analysis and block reference.
Function *F;
ScalarEvolution *SE;
TargetTransformInfo *TTI;
TargetLibraryInfo *TLI;
AliasAnalysis *AA;
LoopInfo *LI;
DominatorTree *DT;
AssumptionCache *AC;
DemandedBits *DB;
const DataLayout *DL;
OptimizationRemarkEmitter *ORE;
unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
unsigned MinVecRegSize; // Set by cl::opt (default: 128).
/// Instruction builder to construct the vectorized tree.
IRBuilder<> Builder;
/// A map of scalar integer values to the smallest bit width with which they
/// can legally be represented. The values map to (width, signed) pairs,
/// where "width" indicates the minimum bit width and "signed" is True if the
/// value must be signed-extended, rather than zero-extended, back to its
/// original width.
MapVector<Value *, std::pair<uint64_t, bool>> MinBWs;
};
} // end namespace slpvectorizer
template <> struct GraphTraits<BoUpSLP *> {
using TreeEntry = BoUpSLP::TreeEntry;
/// NodeRef has to be a pointer per the GraphWriter.
using NodeRef = TreeEntry *;
using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;
/// Add the VectorizableTree to the index iterator to be able to return
/// TreeEntry pointers.
struct ChildIteratorType
: public iterator_adaptor_base<
ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
ContainerTy &VectorizableTree;
ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,
ContainerTy &VT)
: ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
NodeRef operator*() { return VectorizableTree[I->Idx].get(); }
};
static NodeRef getEntryNode(BoUpSLP &R) {
return R.VectorizableTree[0].get();
}
static ChildIteratorType child_begin(NodeRef N) {
return {N->UserTreeIndices.begin(), N->Container};
}
static ChildIteratorType child_end(NodeRef N) {
return {N->UserTreeIndices.end(), N->Container};
}
/// For the node iterator we just need to turn the TreeEntry iterator into a
/// TreeEntry* iterator so that it dereferences to NodeRef.
class nodes_iterator {
using ItTy = ContainerTy::iterator;
ItTy It;
public:
nodes_iterator(const ItTy &It2) : It(It2) {}
NodeRef operator*() { return It->get(); }
nodes_iterator operator++() {
++It;
return *this;
}
bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
};
static nodes_iterator nodes_begin(BoUpSLP *R) {
return nodes_iterator(R->VectorizableTree.begin());
}
static nodes_iterator nodes_end(BoUpSLP *R) {
return nodes_iterator(R->VectorizableTree.end());
}
static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
};
template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
using TreeEntry = BoUpSLP::TreeEntry;
DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
std::string Str;
raw_string_ostream OS(Str);
if (isSplat(Entry->Scalars)) {
OS << "<splat> " << *Entry->Scalars[0];
return Str;
}
for (auto V : Entry->Scalars) {
OS << *V;
if (std::any_of(
R->ExternalUses.begin(), R->ExternalUses.end(),
[&](const BoUpSLP::ExternalUser &EU) { return EU.Scalar == V; }))
OS << " <extract>";
OS << "\n";
}
return Str;
}
static std::string getNodeAttributes(const TreeEntry *Entry,
const BoUpSLP *) {
if (Entry->NeedToGather)
return "color=red";
return "";
}
};
} // end namespace llvm
void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
ArrayRef<Value *> UserIgnoreLst) {
ExtraValueToDebugLocsMap ExternallyUsedValues;
buildTree(Roots, ExternallyUsedValues, UserIgnoreLst);
}
void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
ExtraValueToDebugLocsMap &ExternallyUsedValues,
ArrayRef<Value *> UserIgnoreLst) {
deleteTree();
UserIgnoreList = UserIgnoreLst;
if (!allSameType(Roots))
return;
buildTree_rec(Roots, 0, EdgeInfo());
// Collect the values that we need to extract from the tree.
for (auto &TEPtr : VectorizableTree) {
TreeEntry *Entry = TEPtr.get();
// No need to handle users of gathered values.
if (Entry->NeedToGather)
continue;
// For each lane:
for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
Value *Scalar = Entry->Scalars[Lane];
int FoundLane = Lane;
if (!Entry->ReuseShuffleIndices.empty()) {
FoundLane =
std::distance(Entry->ReuseShuffleIndices.begin(),
llvm::find(Entry->ReuseShuffleIndices, FoundLane));
}
// Check if the scalar is externally used as an extra arg.
auto ExtI = ExternallyUsedValues.find(Scalar);
if (ExtI != ExternallyUsedValues.end()) {
LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
<< Lane << " from " << *Scalar << ".\n");
ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
}
for (User *U : Scalar->users()) {
LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
Instruction *UserInst = dyn_cast<Instruction>(U);
if (!UserInst)
continue;
// Skip in-tree scalars that become vectors
if (TreeEntry *UseEntry = getTreeEntry(U)) {
Value *UseScalar = UseEntry->Scalars[0];
// Some in-tree scalars will remain as scalar in vectorized
// instructions. If that is the case, the one in Lane 0 will
// be used.
if (UseScalar != U ||
!InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
<< ".\n");
assert(!UseEntry->NeedToGather && "Bad state");
continue;
}
}
// Ignore users in the user ignore list.
if (is_contained(UserIgnoreList, UserInst))
continue;
LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane "
<< Lane << " from " << *Scalar << ".\n");
ExternalUses.push_back(ExternalUser(Scalar, U, FoundLane));
}
}
}
}
void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
EdgeInfo UserTreeIdx) {
assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
InstructionsState S = getSameOpcode(VL);
if (Depth == RecursionMaxDepth) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
newTreeEntry(VL, false, UserTreeIdx);
return;
}
// Don't handle vectors.
if (S.OpValue->getType()->isVectorTy()) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
newTreeEntry(VL, false, UserTreeIdx);
return;
}
if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
if (SI->getValueOperand()->getType()->isVectorTy()) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
newTreeEntry(VL, false, UserTreeIdx);
return;
}
// If all of the operands are identical or constant we have a simple solution.
if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.getOpcode()) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
newTreeEntry(VL, false, UserTreeIdx);
return;
}
// We now know that this is a vector of instructions of the same type from
// the same block.
// Don't vectorize ephemeral values.
for (unsigned i = 0, e = VL.size(); i != e; ++i) {
if (EphValues.count(VL[i])) {
LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *VL[i]
<< ") is ephemeral.\n");
newTreeEntry(VL, false, UserTreeIdx);
return;
}
}
// Check if this is a duplicate of another entry.
if (TreeEntry *E = getTreeEntry(S.OpValue)) {
LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n");
if (!E->isSame(VL)) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
newTreeEntry(VL, false, UserTreeIdx);
return;
}
// Record the reuse of the tree node. FIXME, currently this is only used to
// properly draw the graph rather than for the actual vectorization.
E->UserTreeIndices.push_back(UserTreeIdx);
LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue
<< ".\n");
E->trySetUserTEOperand(UserTreeIdx, VL, None);
return;
}
// Check that none of the instructions in the bundle are already in the tree.
for (unsigned i = 0, e = VL.size(); i != e; ++i) {
auto *I = dyn_cast<Instruction>(VL[i]);
if (!I)
continue;
if (getTreeEntry(I)) {
LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *VL[i]
<< ") is already in tree.\n");
newTreeEntry(VL, false, UserTreeIdx);
return;
}
}
// If any of the scalars is marked as a value that needs to stay scalar, then
// we need to gather the scalars.
// The reduction nodes (stored in UserIgnoreList) also should stay scalar.
for (unsigned i = 0, e = VL.size(); i != e; ++i) {
if (MustGather.count(VL[i]) || is_contained(UserIgnoreList, VL[i])) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
newTreeEntry(VL, false, UserTreeIdx);
return;
}
}
// Check that all of the users of the scalars that we want to vectorize are
// schedulable.
auto *VL0 = cast<Instruction>(S.OpValue);
BasicBlock *BB = VL0->getParent();
if (!DT->isReachableFromEntry(BB)) {
// Don't go into unreachable blocks. They may contain instructions with
// dependency cycles which confuse the final scheduling.
LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
newTreeEntry(VL, false, UserTreeIdx);
return;
}
// Check that every instruction appears once in this bundle.
SmallVector<unsigned, 4> ReuseShuffleIndicies;
SmallVector<Value *, 4> UniqueValues;
DenseMap<Value *, unsigned> UniquePositions;
for (Value *V : VL) {
auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
ReuseShuffleIndicies.emplace_back(Res.first->second);
if (Res.second)
UniqueValues.emplace_back(V);
}
if (UniqueValues.size() == VL.size()) {
ReuseShuffleIndicies.clear();
} else {
LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
if (UniqueValues.size() <= 1 || !llvm::isPowerOf2_32(UniqueValues.size())) {
LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
newTreeEntry(VL, false, UserTreeIdx);
return;
}
VL = UniqueValues;
}
auto &BSRef = BlocksSchedules[BB];
if (!BSRef)
BSRef = llvm::make_unique<BlockScheduling>(BB);
BlockScheduling &BS = *BSRef.get();
if (!BS.tryScheduleBundle(VL, this, S)) {
LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
assert((!BS.getScheduleData(VL0) ||
!BS.getScheduleData(VL0)->isPartOfBundle()) &&
"tryScheduleBundle should cancelScheduling on failure");
newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
return;
}
LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
unsigned ShuffleOrOp = S.isAltShuffle() ?
(unsigned) Instruction::ShuffleVector : S.getOpcode();
switch (ShuffleOrOp) {
case Instruction::PHI: {
PHINode *PH = dyn_cast<PHINode>(VL0);
// Check for terminator values (e.g. invoke).
for (unsigned j = 0; j < VL.size(); ++j)
for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
Instruction *Term = dyn_cast<Instruction>(
cast<PHINode>(VL[j])->getIncomingValueForBlock(
PH->getIncomingBlock(i)));
if (Term && Term->isTerminator()) {
LLVM_DEBUG(dbgs()
<< "SLP: Need to swizzle PHINodes (terminator use).\n");
BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
return;
}
}
newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
ValueList Operands;
// Prepare the operand vector.
for (Value *j : VL)
Operands.push_back(cast<PHINode>(j)->getIncomingValueForBlock(
PH->getIncomingBlock(i)));
UserTreeIdx.EdgeIdx = i;
buildTree_rec(Operands, Depth + 1, UserTreeIdx);
}
return;
}
case Instruction::ExtractValue:
case Instruction::ExtractElement: {
OrdersType CurrentOrder;
bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
if (Reuse) {
LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
++NumOpsWantToKeepOriginalOrder;
newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx,
ReuseShuffleIndicies);
// This is a special case, as it does not gather, but at the same time
// we are not extending buildTree_rec() towards the operands.
ValueList Op0;
Op0.assign(VL.size(), VL0->getOperand(0));
VectorizableTree.back()->setOperand(0, Op0, ReuseShuffleIndicies);
return;
}
if (!CurrentOrder.empty()) {
LLVM_DEBUG({
dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
"with order";
for (unsigned Idx : CurrentOrder)
dbgs() << " " << Idx;
dbgs() << "\n";
});
// Insert new order with initial value 0, if it does not exist,
// otherwise return the iterator to the existing one.
auto StoredCurrentOrderAndNum =
NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first;
++StoredCurrentOrderAndNum->getSecond();
newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, ReuseShuffleIndicies,
StoredCurrentOrderAndNum->getFirst());
// This is a special case, as it does not gather, but at the same time
// we are not extending buildTree_rec() towards the operands.
ValueList Op0;
Op0.assign(VL.size(), VL0->getOperand(0));
VectorizableTree.back()->setOperand(0, Op0, ReuseShuffleIndicies);
return;
}
LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
newTreeEntry(VL, /*Vectorized=*/false, UserTreeIdx, ReuseShuffleIndicies);
BS.cancelScheduling(VL, VL0);
return;
}
case Instruction::Load: {
// Check that a vectorized load would load the same memory as a scalar
// load. For example, we don't want to vectorize loads that are smaller
// than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
// treats loading/storing it as an i8 struct. If we vectorize loads/stores
// from such a struct, we read/write packed bits disagreeing with the
// unvectorized version.
Type *ScalarTy = VL0->getType();
if (DL->getTypeSizeInBits(ScalarTy) !=
DL->getTypeAllocSizeInBits(ScalarTy)) {
BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
return;
}
// Make sure all loads in the bundle are simple - we can't vectorize
// atomic or volatile loads.
SmallVector<Value *, 4> PointerOps(VL.size());
auto POIter = PointerOps.begin();
for (Value *V : VL) {
auto *L = cast<LoadInst>(V);
if (!L->isSimple()) {
BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
return;
}
*POIter = L->getPointerOperand();
++POIter;
}
OrdersType CurrentOrder;
// Check the order of pointer operands.
if (llvm::sortPtrAccesses(PointerOps, *DL, *SE, CurrentOrder)) {
Value *Ptr0;
Value *PtrN;
if (CurrentOrder.empty()) {
Ptr0 = PointerOps.front();
PtrN = PointerOps.back();
} else {
Ptr0 = PointerOps[CurrentOrder.front()];
PtrN = PointerOps[CurrentOrder.back()];
}
const SCEV *Scev0 = SE->getSCEV(Ptr0);
const SCEV *ScevN = SE->getSCEV(PtrN);
const auto *Diff =
dyn_cast<SCEVConstant>(SE->getMinusSCEV(ScevN, Scev0));
uint64_t Size = DL->getTypeAllocSize(ScalarTy);
// Check that the sorted loads are consecutive.
if (Diff && Diff->getAPInt().getZExtValue() == (VL.size() - 1) * Size) {
if (CurrentOrder.empty()) {
// Original loads are consecutive and does not require reordering.
++NumOpsWantToKeepOriginalOrder;
newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx,
ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
} else {
// Need to reorder.
auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first;
++I->getSecond();
newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx,
ReuseShuffleIndicies, I->getFirst());
LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n");
}
return;
}
}
LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
return;
}
case Instruction::ZExt:
case Instruction::SExt:
case Instruction::FPToUI:
case Instruction::FPToSI:
case Instruction::FPExt:
case Instruction::PtrToInt:
case Instruction::IntToPtr:
case Instruction::SIToFP:
case Instruction::UIToFP:
case Instruction::Trunc:
case Instruction::FPTrunc:
case Instruction::BitCast: {
Type *SrcTy = VL0->getOperand(0)->getType();
for (unsigned i = 0; i < VL.size(); ++i) {
Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();
if (Ty != SrcTy || !isValidElementType(Ty)) {
BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
LLVM_DEBUG(dbgs()
<< "SLP: Gathering casts with different src types.\n");
return;
}
}
newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n");
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
ValueList Operands;
// Prepare the operand vector.
for (Value *j : VL)
Operands.push_back(cast<Instruction>(j)->getOperand(i));
UserTreeIdx.EdgeIdx = i;
buildTree_rec(Operands, Depth + 1, UserTreeIdx);
}
return;
}
case Instruction::ICmp:
case Instruction::FCmp: {
// Check that all of the compares have the same predicate.
CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
Type *ComparedTy = VL0->getOperand(0)->getType();
for (unsigned i = 1, e = VL.size(); i < e; ++i) {
CmpInst *Cmp = cast<CmpInst>(VL[i]);
if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
Cmp->getOperand(0)->getType() != ComparedTy) {
BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
LLVM_DEBUG(dbgs()
<< "SLP: Gathering cmp with different predicate.\n");
return;
}
}
newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n");
ValueList Left, Right;
if (cast<CmpInst>(VL0)->isCommutative()) {
// Commutative predicate - collect + sort operands of the instructions
// so that each side is more likely to have the same opcode.
assert(P0 == SwapP0 && "Commutative Predicate mismatch");
reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE);
} else {
// Collect operands - commute if it uses the swapped predicate.
for (Value *V : VL) {
auto *Cmp = cast<CmpInst>(V);
Value *LHS = Cmp->getOperand(0);
Value *RHS = Cmp->getOperand(1);
if (Cmp->getPredicate() != P0)
std::swap(LHS, RHS);
Left.push_back(LHS);
Right.push_back(RHS);
}
}
UserTreeIdx.EdgeIdx = 0;
buildTree_rec(Left, Depth + 1, UserTreeIdx);
UserTreeIdx.EdgeIdx = 1;
buildTree_rec(Right, Depth + 1, UserTreeIdx);
return;
}
case Instruction::Select:
case Instruction::Add:
case Instruction::FAdd:
case Instruction::Sub:
case Instruction::FSub:
case Instruction::Mul:
case Instruction::FMul:
case Instruction::UDiv:
case Instruction::SDiv:
case Instruction::FDiv:
case Instruction::URem:
case Instruction::SRem:
case Instruction::FRem:
case Instruction::Shl:
case Instruction::LShr:
case Instruction::AShr:
case Instruction::And:
case Instruction::Or:
case Instruction::Xor:
newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: added a vector of bin op.\n");
// Sort operands of the instructions so that each side is more likely to
// have the same opcode.
if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
ValueList Left, Right;
reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE);
UserTreeIdx.EdgeIdx = 0;
buildTree_rec(Left, Depth + 1, UserTreeIdx);
UserTreeIdx.EdgeIdx = 1;
buildTree_rec(Right, Depth + 1, UserTreeIdx);
return;
}
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
ValueList Operands;
// Prepare the operand vector.
for (Value *j : VL)
Operands.push_back(cast<Instruction>(j)->getOperand(i));
UserTreeIdx.EdgeIdx = i;
buildTree_rec(Operands, Depth + 1, UserTreeIdx);
}
return;
case Instruction::GetElementPtr: {
// We don't combine GEPs with complicated (nested) indexing.
for (unsigned j = 0; j < VL.size(); ++j) {
if (cast<Instruction>(VL[j])->getNumOperands() != 2) {
LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
return;
}
}
// We can't combine several GEPs into one vector if they operate on
// different types.
Type *Ty0 = VL0->getOperand(0)->getType();
for (unsigned j = 0; j < VL.size(); ++j) {
Type *CurTy = cast<Instruction>(VL[j])->getOperand(0)->getType();
if (Ty0 != CurTy) {
LLVM_DEBUG(dbgs()
<< "SLP: not-vectorizable GEP (different types).\n");
BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
return;
}
}
// We don't combine GEPs with non-constant indexes.
for (unsigned j = 0; j < VL.size(); ++j) {
auto Op = cast<Instruction>(VL[j])->getOperand(1);
if (!isa<ConstantInt>(Op)) {
LLVM_DEBUG(dbgs()
<< "SLP: not-vectorizable GEP (non-constant indexes).\n");
BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
return;
}
}
newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
for (unsigned i = 0, e = 2; i < e; ++i) {
ValueList Operands;
// Prepare the operand vector.
for (Value *j : VL)
Operands.push_back(cast<Instruction>(j)->getOperand(i));
UserTreeIdx.EdgeIdx = i;
buildTree_rec(Operands, Depth + 1, UserTreeIdx);
}
return;
}
case Instruction::Store: {
// Check if the stores are consecutive or of we need to swizzle them.
for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
return;
}
newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
ValueList Operands;
for (Value *j : VL)
Operands.push_back(cast<Instruction>(j)->getOperand(0));
UserTreeIdx.EdgeIdx = 0;
buildTree_rec(Operands, Depth + 1, UserTreeIdx);
return;
}
case Instruction::Call: {
// Check if the calls are all to the same vectorizable intrinsic.
CallInst *CI = cast<CallInst>(VL0);
// Check if this is an Intrinsic call or something that can be
// represented by an intrinsic call