blob: 5bcf0c0a7ba6364bd0bbae1a20d5ac09d1336853 [file] [log] [blame]
//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has three parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
// of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
// widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
// of vectorization. It decides on the optimal vector width, which
// can be one, if vectorization is not profitable.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
// Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
// Data for SIMD
//
// Other ideas/concepts are from:
// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
// Vectorizing Compilers.
//
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "VPlan.h"
#include "VPlanBuilder.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SCCIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <tuple>
#include <utility>
#include <vector>
using namespace llvm;
#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME
STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
static cl::opt<bool>
EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
cl::desc("Enable if-conversion during vectorization."));
/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
"vectorizer-min-trip-count", cl::init(16), cl::Hidden,
cl::desc("Loops with a constant trip count that is smaller than this "
"value are vectorized only if no scalar iteration overheads "
"are incurred."));
static cl::opt<bool> MaximizeBandwidth(
"vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
cl::desc("Maximize bandwidth when selecting vectorization factor which "
"will be determined by the smallest type in loop."));
static cl::opt<bool> EnableInterleavedMemAccesses(
"enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
/// Maximum factor for an interleaved memory access.
static cl::opt<unsigned> MaxInterleaveGroupFactor(
"max-interleave-group-factor", cl::Hidden,
cl::desc("Maximum factor for an interleaved access group (default = 8)"),
cl::init(8));
/// We don't interleave loops with a known constant trip count below this
/// number.
static const unsigned TinyTripCountInterleaveThreshold = 128;
static cl::opt<unsigned> ForceTargetNumScalarRegs(
"force-target-num-scalar-regs", cl::init(0), cl::Hidden,
cl::desc("A flag that overrides the target's number of scalar registers."));
static cl::opt<unsigned> ForceTargetNumVectorRegs(
"force-target-num-vector-regs", cl::init(0), cl::Hidden,
cl::desc("A flag that overrides the target's number of vector registers."));
/// Maximum vectorization interleave count.
static const unsigned MaxInterleaveFactor = 16;
static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
"force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
cl::desc("A flag that overrides the target's max interleave factor for "
"scalar loops."));
static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
"force-target-max-vector-interleave", cl::init(0), cl::Hidden,
cl::desc("A flag that overrides the target's max interleave factor for "
"vectorized loops."));
static cl::opt<unsigned> ForceTargetInstructionCost(
"force-target-instruction-cost", cl::init(0), cl::Hidden,
cl::desc("A flag that overrides the target's expected cost for "
"an instruction to a single constant value. Mostly "
"useful for getting consistent testing."));
static cl::opt<unsigned> SmallLoopCost(
"small-loop-cost", cl::init(20), cl::Hidden,
cl::desc(
"The cost of a loop that is considered 'small' by the interleaver."));
static cl::opt<bool> LoopVectorizeWithBlockFrequency(
"loop-vectorize-with-block-frequency", cl::init(false), cl::Hidden,
cl::desc("Enable the use of the block frequency analysis to access PGO "
"heuristics minimizing code growth in cold regions and being more "
"aggressive in hot regions."));
// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
"enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
cl::desc(
"Enable runtime interleaving until load/store ports are saturated"));
/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
"vectorize-num-stores-pred", cl::init(1), cl::Hidden,
cl::desc("Max number of stores to be predicated behind an if."));
static cl::opt<bool> EnableIndVarRegisterHeur(
"enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
cl::desc("Count the induction variable only once when interleaving"));
static cl::opt<bool> EnableCondStoresVectorization(
"enable-cond-stores-vec", cl::init(true), cl::Hidden,
cl::desc("Enable if predication of stores during vectorization."));
static cl::opt<unsigned> MaxNestedScalarReductionIC(
"max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
cl::desc("The maximum interleave count to use when interleaving a scalar "
"reduction in a nested loop."));
static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
"pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
cl::desc("The maximum allowed number of runtime memory checks with a "
"vectorize(enable) pragma."));
static cl::opt<unsigned> VectorizeSCEVCheckThreshold(
"vectorize-scev-check-threshold", cl::init(16), cl::Hidden,
cl::desc("The maximum number of SCEV checks allowed."));
static cl::opt<unsigned> PragmaVectorizeSCEVCheckThreshold(
"pragma-vectorize-scev-check-threshold", cl::init(128), cl::Hidden,
cl::desc("The maximum number of SCEV checks allowed with a "
"vectorize(enable) pragma"));
/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis
createMissedAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop,
Instruction *I = nullptr) {
Value *CodeRegion = TheLoop->getHeader();
DebugLoc DL = TheLoop->getStartLoc();
if (I) {
CodeRegion = I->getParent();
// If there is no debug location attached to the instruction, revert back to
// using the loop's.
if (I->getDebugLoc())
DL = I->getDebugLoc();
}
OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
R << "loop not vectorized: ";
return R;
}
namespace {
class LoopVectorizationLegality;
class LoopVectorizationCostModel;
class LoopVectorizationRequirements;
} // end anonymous namespace
/// Returns true if the given loop body has a cycle, excluding the loop
/// itself.
static bool hasCyclesInLoopBody(const Loop &L) {
if (!L.empty())
return true;
for (const auto &SCC :
make_range(scc_iterator<Loop, LoopBodyTraits>::begin(L),
scc_iterator<Loop, LoopBodyTraits>::end(L))) {
if (SCC.size() > 1) {
DEBUG(dbgs() << "LVL: Detected a cycle in the loop body:\n");
DEBUG(L.dump());
return true;
}
}
return false;
}
/// A helper function for converting Scalar types to vector types.
/// If the incoming type is void, we return void. If the VF is 1, we return
/// the scalar type.
static Type *ToVectorTy(Type *Scalar, unsigned VF) {
if (Scalar->isVoidTy() || VF == 1)
return Scalar;
return VectorType::get(Scalar, VF);
}
// FIXME: The following helper functions have multiple implementations
// in the project. They can be effectively organized in a common Load/Store
// utilities unit.
/// A helper function that returns the pointer operand of a load or store
/// instruction.
static Value *getPointerOperand(Value *I) {
if (auto *LI = dyn_cast<LoadInst>(I))
return LI->getPointerOperand();
if (auto *SI = dyn_cast<StoreInst>(I))
return SI->getPointerOperand();
return nullptr;
}
/// A helper function that returns the type of loaded or stored value.
static Type *getMemInstValueType(Value *I) {
assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
"Expected Load or Store instruction");
if (auto *LI = dyn_cast<LoadInst>(I))
return LI->getType();
return cast<StoreInst>(I)->getValueOperand()->getType();
}
/// A helper function that returns the alignment of load or store instruction.
static unsigned getMemInstAlignment(Value *I) {
assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
"Expected Load or Store instruction");
if (auto *LI = dyn_cast<LoadInst>(I))
return LI->getAlignment();
return cast<StoreInst>(I)->getAlignment();
}
/// A helper function that returns the address space of the pointer operand of
/// load or store instruction.
static unsigned getMemInstAddressSpace(Value *I) {
assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
"Expected Load or Store instruction");
if (auto *LI = dyn_cast<LoadInst>(I))
return LI->getPointerAddressSpace();
return cast<StoreInst>(I)->getPointerAddressSpace();
}
/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type at the given vectorization factor.
static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
// Determine if an array of VF elements of type Ty is "bitcast compatible"
// with a <VF x Ty> vector.
if (VF > 1) {
auto *VectorTy = VectorType::get(Ty, VF);
return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
}
// If the vectorization factor is one, we just check if an array of type Ty
// requires padding between elements.
return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}
/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }
/// A helper function that adds a 'fast' flag to floating-point operations.
static Value *addFastMathFlag(Value *V) {
if (isa<FPMathOperator>(V)) {
FastMathFlags Flags;
Flags.setFast();
cast<Instruction>(V)->setFastMathFlags(Flags);
}
return V;
}
/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
: ConstantFP::get(Ty, C);
}
namespace llvm {
/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
/// counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
/// instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found to a given vectorization factor.
class InnerLoopVectorizer {
public:
InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
LoopInfo *LI, DominatorTree *DT,
const TargetLibraryInfo *TLI,
const TargetTransformInfo *TTI, AssumptionCache *AC,
OptimizationRemarkEmitter *ORE, unsigned VecWidth,
unsigned UnrollFactor, LoopVectorizationLegality *LVL,
LoopVectorizationCostModel *CM)
: OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
Builder(PSE.getSE()->getContext()),
VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
virtual ~InnerLoopVectorizer() = default;
/// Create a new empty loop. Unlink the old loop and connect the new one.
/// Return the pre-header block of the new loop.
BasicBlock *createVectorizedLoopSkeleton();
/// Widen a single instruction within the innermost loop.
void widenInstruction(Instruction &I);
/// Fix the vectorized code, taking care of header phi's, live-outs, and more.
void fixVectorizedLoop();
// Return true if any runtime check is added.
bool areSafetyChecksAdded() { return AddedSafetyChecks; }
/// A type for vectorized values in the new loop. Each value from the
/// original loop, when vectorized, is represented by UF vector values in the
/// new unrolled loop, where UF is the unroll factor.
using VectorParts = SmallVector<Value *, 2>;
/// Vectorize a single PHINode in a block. This method handles the induction
/// variable canonicalization. It supports both VF = 1 for unrolled loops and
/// arbitrary length vectors.
void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);
/// A helper function to scalarize a single Instruction in the innermost loop.
/// Generates a sequence of scalar instances for each lane between \p MinLane
/// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
/// inclusive..
void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
bool IfPredicateInstr);
/// Widen an integer or floating-point induction variable \p IV. If \p Trunc
/// is provided, the integer induction variable will first be truncated to
/// the corresponding type.
void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
/// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
/// vector or scalar value on-demand if one is not yet available. When
/// vectorizing a loop, we visit the definition of an instruction before its
/// uses. When visiting the definition, we either vectorize or scalarize the
/// instruction, creating an entry for it in the corresponding map. (In some
/// cases, such as induction variables, we will create both vector and scalar
/// entries.) Then, as we encounter uses of the definition, we derive values
/// for each scalar or vector use unless such a value is already available.
/// For example, if we scalarize a definition and one of its uses is vector,
/// we build the required vector on-demand with an insertelement sequence
/// when visiting the use. Otherwise, if the use is scalar, we can use the
/// existing scalar definition.
///
/// Return a value in the new loop corresponding to \p V from the original
/// loop at unroll index \p Part. If the value has already been vectorized,
/// the corresponding vector entry in VectorLoopValueMap is returned. If,
/// however, the value has a scalar entry in VectorLoopValueMap, we construct
/// a new vector value on-demand by inserting the scalar values into a vector
/// with an insertelement sequence. If the value has been neither vectorized
/// nor scalarized, it must be loop invariant, so we simply broadcast the
/// value into a vector.
Value *getOrCreateVectorValue(Value *V, unsigned Part);
/// Return a value in the new loop corresponding to \p V from the original
/// loop at unroll and vector indices \p Instance. If the value has been
/// vectorized but not scalarized, the necessary extractelement instruction
/// will be generated.
Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
/// Construct the vector value of a scalarized value \p V one lane at a time.
void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
/// Try to vectorize the interleaved access group that \p Instr belongs to.
void vectorizeInterleaveGroup(Instruction *Instr);
/// Vectorize Load and Store instructions, optionally masking the vector
/// operations if \p BlockInMask is non-null.
void vectorizeMemoryInstruction(Instruction *Instr,
VectorParts *BlockInMask = nullptr);
/// \brief Set the debug location in the builder using the debug location in
/// the instruction.
void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
protected:
friend class LoopVectorizationPlanner;
/// A small list of PHINodes.
using PhiVector = SmallVector<PHINode *, 4>;
/// A type for scalarized values in the new loop. Each value from the
/// original loop, when scalarized, is represented by UF x VF scalar values
/// in the new unrolled loop, where UF is the unroll factor and VF is the
/// vectorization factor.
using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
/// Set up the values of the IVs correctly when exiting the vector loop.
void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
Value *CountRoundDown, Value *EndValue,
BasicBlock *MiddleBlock);
/// Create a new induction variable inside L.
PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
Value *Step, Instruction *DL);
/// Handle all cross-iteration phis in the header.
void fixCrossIterationPHIs();
/// Fix a first-order recurrence. This is the second phase of vectorizing
/// this phi node.
void fixFirstOrderRecurrence(PHINode *Phi);
/// Fix a reduction cross-iteration phi. This is the second phase of
/// vectorizing this phi node.
void fixReduction(PHINode *Phi);
/// \brief The Loop exit block may have single value PHI nodes with some
/// incoming value. While vectorizing we only handled real values
/// that were defined inside the loop and we should have one value for
/// each predecessor of its parent basic block. See PR14725.
void fixLCSSAPHIs();
/// Iteratively sink the scalarized operands of a predicated instruction into
/// the block that was created for it.
void sinkScalarOperands(Instruction *PredInst);
/// Shrinks vector element sizes to the smallest bitwidth they can be legally
/// represented as.
void truncateToMinimalBitwidths();
/// Insert the new loop to the loop hierarchy and pass manager
/// and update the analysis passes.
void updateAnalysis();
/// Create a broadcast instruction. This method generates a broadcast
/// instruction (shuffle) for loop invariant values and for the induction
/// value. If this is the induction variable then we extend it to N, N+1, ...
/// this is needed because each iteration in the loop corresponds to a SIMD
/// element.
virtual Value *getBroadcastInstrs(Value *V);
/// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
/// to each vector element of Val. The sequence starts at StartIndex.
/// \p Opcode is relevant for FP induction variable.
virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
Instruction::BinaryOps Opcode =
Instruction::BinaryOpsEnd);
/// Compute scalar induction steps. \p ScalarIV is the scalar induction
/// variable on which to base the steps, \p Step is the size of the step, and
/// \p EntryVal is the value from the original loop that maps to the steps.
/// Note that \p EntryVal doesn't have to be an induction variable (e.g., it
/// can be a truncate instruction).
void buildScalarSteps(Value *ScalarIV, Value *Step, Value *EntryVal,
const InductionDescriptor &ID);
/// Create a vector induction phi node based on an existing scalar one. \p
/// EntryVal is the value from the original loop that maps to the vector phi
/// node, and \p Step is the loop-invariant step. If \p EntryVal is a
/// truncate instruction, instead of widening the original IV, we widen a
/// version of the IV truncated to \p EntryVal's type.
void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
Value *Step, Instruction *EntryVal);
/// Returns true if an instruction \p I should be scalarized instead of
/// vectorized for the chosen vectorization factor.
bool shouldScalarizeInstruction(Instruction *I) const;
/// Returns true if we should generate a scalar version of \p IV.
bool needsScalarInduction(Instruction *IV) const;
/// If there is a cast involved in the induction variable \p ID, which should
/// be ignored in the vectorized loop body, this function records the
/// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
/// cast. We had already proved that the casted Phi is equal to the uncasted
/// Phi in the vectorized loop (under a runtime guard), and therefore
/// there is no need to vectorize the cast - the same value can be used in the
/// vector loop for both the Phi and the cast.
/// If \p VectorLoopValue is a scalarized value, \p Lane is also specified,
/// Otherwise, \p VectorLoopValue is a widened/vectorized value.
void recordVectorLoopValueForInductionCast (const InductionDescriptor &ID,
Value *VectorLoopValue,
unsigned Part,
unsigned Lane = UINT_MAX);
/// Generate a shuffle sequence that will reverse the vector Vec.
virtual Value *reverseVector(Value *Vec);
/// Returns (and creates if needed) the original loop trip count.
Value *getOrCreateTripCount(Loop *NewLoop);
/// Returns (and creates if needed) the trip count of the widened loop.
Value *getOrCreateVectorTripCount(Loop *NewLoop);
/// Returns a bitcasted value to the requested vector type.
/// Also handles bitcasts of vector<float> <-> vector<pointer> types.
Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
const DataLayout &DL);
/// Emit a bypass check to see if the vector trip count is zero, including if
/// it overflows.
void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
/// Emit a bypass check to see if all of the SCEV assumptions we've
/// had to make are correct.
void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
/// Emit bypass checks to check any memory assumptions we may have made.
void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
/// Add additional metadata to \p To that was not present on \p Orig.
///
/// Currently this is used to add the noalias annotations based on the
/// inserted memchecks. Use this for instructions that are *cloned* into the
/// vector loop.
void addNewMetadata(Instruction *To, const Instruction *Orig);
/// Add metadata from one instruction to another.
///
/// This includes both the original MDs from \p From and additional ones (\see
/// addNewMetadata). Use this for *newly created* instructions in the vector
/// loop.
void addMetadata(Instruction *To, Instruction *From);
/// \brief Similar to the previous function but it adds the metadata to a
/// vector of instructions.
void addMetadata(ArrayRef<Value *> To, Instruction *From);
/// The original loop.
Loop *OrigLoop;
/// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
/// dynamic knowledge to simplify SCEV expressions and converts them to a
/// more usable form.
PredicatedScalarEvolution &PSE;
/// Loop Info.
LoopInfo *LI;
/// Dominator Tree.
DominatorTree *DT;
/// Alias Analysis.
AliasAnalysis *AA;
/// Target Library Info.
const TargetLibraryInfo *TLI;
/// Target Transform Info.
const TargetTransformInfo *TTI;
/// Assumption Cache.
AssumptionCache *AC;
/// Interface to emit optimization remarks.
OptimizationRemarkEmitter *ORE;
/// \brief LoopVersioning. It's only set up (non-null) if memchecks were
/// used.
///
/// This is currently only used to add no-alias metadata based on the
/// memchecks. The actually versioning is performed manually.
std::unique_ptr<LoopVersioning> LVer;
/// The vectorization SIMD factor to use. Each vector will have this many
/// vector elements.
unsigned VF;
/// The vectorization unroll factor to use. Each scalar is vectorized to this
/// many different vector instructions.
unsigned UF;
/// The builder that we use
IRBuilder<> Builder;
// --- Vectorization state ---
/// The vector-loop preheader.
BasicBlock *LoopVectorPreHeader;
/// The scalar-loop preheader.
BasicBlock *LoopScalarPreHeader;
/// Middle Block between the vector and the scalar.
BasicBlock *LoopMiddleBlock;
/// The ExitBlock of the scalar loop.
BasicBlock *LoopExitBlock;
/// The vector loop body.
BasicBlock *LoopVectorBody;
/// The scalar loop body.
BasicBlock *LoopScalarBody;
/// A list of all bypass blocks. The first block is the entry of the loop.
SmallVector<BasicBlock *, 4> LoopBypassBlocks;
/// The new Induction variable which was added to the new block.
PHINode *Induction = nullptr;
/// The induction variable of the old basic block.
PHINode *OldInduction = nullptr;
/// Maps values from the original loop to their corresponding values in the
/// vectorized loop. A key value can map to either vector values, scalar
/// values or both kinds of values, depending on whether the key was
/// vectorized and scalarized.
VectorizerValueMap VectorLoopValueMap;
/// Store instructions that were predicated.
SmallVector<Instruction *, 4> PredicatedInstructions;
/// Trip count of the original loop.
Value *TripCount = nullptr;
/// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
Value *VectorTripCount = nullptr;
/// The legality analysis.
LoopVectorizationLegality *Legal;
/// The profitablity analysis.
LoopVectorizationCostModel *Cost;
// Record whether runtime checks are added.
bool AddedSafetyChecks = false;
// Holds the end values for each induction variable. We save the end values
// so we can later fix-up the external users of the induction variables.
DenseMap<PHINode *, Value *> IVEndValues;
};
class InnerLoopUnroller : public InnerLoopVectorizer {
public:
InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
LoopInfo *LI, DominatorTree *DT,
const TargetLibraryInfo *TLI,
const TargetTransformInfo *TTI, AssumptionCache *AC,
OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
LoopVectorizationLegality *LVL,
LoopVectorizationCostModel *CM)
: InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
UnrollFactor, LVL, CM) {}
private:
Value *getBroadcastInstrs(Value *V) override;
Value *getStepVector(Value *Val, int StartIdx, Value *Step,
Instruction::BinaryOps Opcode =
Instruction::BinaryOpsEnd) override;
Value *reverseVector(Value *Vec) override;
};
} // end namespace llvm
/// \brief Look for a meaningful debug location on the instruction or it's
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
if (!I)
return I;
DebugLoc Empty;
if (I->getDebugLoc() != Empty)
return I;
for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
if (OpInst->getDebugLoc() != Empty)
return OpInst;
}
return I;
}
void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
const DILocation *DIL = Inst->getDebugLoc();
if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
!isa<DbgInfoIntrinsic>(Inst))
B.SetCurrentDebugLocation(DIL->cloneWithDuplicationFactor(UF * VF));
else
B.SetCurrentDebugLocation(DIL);
} else
B.SetCurrentDebugLocation(DebugLoc());
}
#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
std::string Result;
if (L) {
raw_string_ostream OS(Result);
if (const DebugLoc LoopDbgLoc = L->getStartLoc())
LoopDbgLoc.print(OS);
else
// Just print the module name.
OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
OS.flush();
}
return Result;
}
#endif
void InnerLoopVectorizer::addNewMetadata(Instruction *To,
const Instruction *Orig) {
// If the loop was versioned with memchecks, add the corresponding no-alias
// metadata.
if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
LVer->annotateInstWithNoAlias(To, Orig);
}
void InnerLoopVectorizer::addMetadata(Instruction *To,
Instruction *From) {
propagateMetadata(To, From);
addNewMetadata(To, From);
}
void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
Instruction *From) {
for (Value *V : To) {
if (Instruction *I = dyn_cast<Instruction>(V))
addMetadata(I, From);
}
}
namespace llvm {
/// \brief The group of interleaved loads/stores sharing the same stride and
/// close to each other.
///
/// Each member in this group has an index starting from 0, and the largest
/// index should be less than interleaved factor, which is equal to the absolute
/// value of the access's stride.
///
/// E.g. An interleaved load group of factor 4:
/// for (unsigned i = 0; i < 1024; i+=4) {
/// a = A[i]; // Member of index 0
/// b = A[i+1]; // Member of index 1
/// d = A[i+3]; // Member of index 3
/// ...
/// }
///
/// An interleaved store group of factor 4:
/// for (unsigned i = 0; i < 1024; i+=4) {
/// ...
/// A[i] = a; // Member of index 0
/// A[i+1] = b; // Member of index 1
/// A[i+2] = c; // Member of index 2
/// A[i+3] = d; // Member of index 3
/// }
///
/// Note: the interleaved load group could have gaps (missing members), but
/// the interleaved store group doesn't allow gaps.
class InterleaveGroup {
public:
InterleaveGroup(Instruction *Instr, int Stride, unsigned Align)
: Align(Align), InsertPos(Instr) {
assert(Align && "The alignment should be non-zero");
Factor = std::abs(Stride);
assert(Factor > 1 && "Invalid interleave factor");
Reverse = Stride < 0;
Members[0] = Instr;
}
bool isReverse() const { return Reverse; }
unsigned getFactor() const { return Factor; }
unsigned getAlignment() const { return Align; }
unsigned getNumMembers() const { return Members.size(); }
/// \brief Try to insert a new member \p Instr with index \p Index and
/// alignment \p NewAlign. The index is related to the leader and it could be
/// negative if it is the new leader.
///
/// \returns false if the instruction doesn't belong to the group.
bool insertMember(Instruction *Instr, int Index, unsigned NewAlign) {
assert(NewAlign && "The new member's alignment should be non-zero");
int Key = Index + SmallestKey;
// Skip if there is already a member with the same index.
if (Members.count(Key))
return false;
if (Key > LargestKey) {
// The largest index is always less than the interleave factor.
if (Index >= static_cast<int>(Factor))
return false;
LargestKey = Key;
} else if (Key < SmallestKey) {
// The largest index is always less than the interleave factor.
if (LargestKey - Key >= static_cast<int>(Factor))
return false;
SmallestKey = Key;
}
// It's always safe to select the minimum alignment.
Align = std::min(Align, NewAlign);
Members[Key] = Instr;
return true;
}
/// \brief Get the member with the given index \p Index
///
/// \returns nullptr if contains no such member.
Instruction *getMember(unsigned Index) const {
int Key = SmallestKey + Index;
if (!Members.count(Key))
return nullptr;
return Members.find(Key)->second;
}
/// \brief Get the index for the given member. Unlike the key in the member
/// map, the index starts from 0.
unsigned getIndex(Instruction *Instr) const {
for (auto I : Members)
if (I.second == Instr)
return I.first - SmallestKey;
llvm_unreachable("InterleaveGroup contains no such member");
}
Instruction *getInsertPos() const { return InsertPos; }
void setInsertPos(Instruction *Inst) { InsertPos = Inst; }
/// Add metadata (e.g. alias info) from the instructions in this group to \p
/// NewInst.
///
/// FIXME: this function currently does not add noalias metadata a'la
/// addNewMedata. To do that we need to compute the intersection of the
/// noalias info from all members.
void addMetadata(Instruction *NewInst) const {
SmallVector<Value *, 4> VL;
std::transform(Members.begin(), Members.end(), std::back_inserter(VL),
[](std::pair<int, Instruction *> p) { return p.second; });
propagateMetadata(NewInst, VL);
}
private:
unsigned Factor; // Interleave Factor.
bool Reverse;
unsigned Align;
DenseMap<int, Instruction *> Members;
int SmallestKey = 0;
int LargestKey = 0;
// To avoid breaking dependences, vectorized instructions of an interleave
// group should be inserted at either the first load or the last store in
// program order.
//
// E.g. %even = load i32 // Insert Position
// %add = add i32 %even // Use of %even
// %odd = load i32
//
// store i32 %even
// %odd = add i32 // Def of %odd
// store i32 %odd // Insert Position
Instruction *InsertPos;
};
} // end namespace llvm
namespace {
/// \brief Drive the analysis of interleaved memory accesses in the loop.
///
/// Use this class to analyze interleaved accesses only when we can vectorize
/// a loop. Otherwise it's meaningless to do analysis as the vectorization
/// on interleaved accesses is unsafe.
///
/// The analysis collects interleave groups and records the relationships
/// between the member and the group in a map.
class InterleavedAccessInfo {
public:
InterleavedAccessInfo(PredicatedScalarEvolution &PSE, Loop *L,
DominatorTree *DT, LoopInfo *LI)
: PSE(PSE), TheLoop(L), DT(DT), LI(LI) {}
~InterleavedAccessInfo() {
SmallSet<InterleaveGroup *, 4> DelSet;
// Avoid releasing a pointer twice.
for (auto &I : InterleaveGroupMap)
DelSet.insert(I.second);
for (auto *Ptr : DelSet)
delete Ptr;
}
/// \brief Analyze the interleaved accesses and collect them in interleave
/// groups. Substitute symbolic strides using \p Strides.
void analyzeInterleaving(const ValueToValueMap &Strides);
/// \brief Check if \p Instr belongs to any interleave group.
bool isInterleaved(Instruction *Instr) const {
return InterleaveGroupMap.count(Instr);
}
/// \brief Get the interleave group that \p Instr belongs to.
///
/// \returns nullptr if doesn't have such group.
InterleaveGroup *getInterleaveGroup(Instruction *Instr) const {
if (InterleaveGroupMap.count(Instr))
return InterleaveGroupMap.find(Instr)->second;
return nullptr;
}
/// \brief Returns true if an interleaved group that may access memory
/// out-of-bounds requires a scalar epilogue iteration for correctness.
bool requiresScalarEpilogue() const { return RequiresScalarEpilogue; }
/// \brief Initialize the LoopAccessInfo used for dependence checking.
void setLAI(const LoopAccessInfo *Info) { LAI = Info; }
private:
/// A wrapper around ScalarEvolution, used to add runtime SCEV checks.
/// Simplifies SCEV expressions in the context of existing SCEV assumptions.
/// The interleaved access analysis can also add new predicates (for example
/// by versioning strides of pointers).
PredicatedScalarEvolution &PSE;
Loop *TheLoop;
DominatorTree *DT;
LoopInfo *LI;
const LoopAccessInfo *LAI = nullptr;
/// True if the loop may contain non-reversed interleaved groups with
/// out-of-bounds accesses. We ensure we don't speculatively access memory
/// out-of-bounds by executing at least one scalar epilogue iteration.
bool RequiresScalarEpilogue = false;
/// Holds the relationships between the members and the interleave group.
DenseMap<Instruction *, InterleaveGroup *> InterleaveGroupMap;
/// Holds dependences among the memory accesses in the loop. It maps a source
/// access to a set of dependent sink accesses.
DenseMap<Instruction *, SmallPtrSet<Instruction *, 2>> Dependences;
/// \brief The descriptor for a strided memory access.
struct StrideDescriptor {
StrideDescriptor() = default;
StrideDescriptor(int64_t Stride, const SCEV *Scev, uint64_t Size,
unsigned Align)
: Stride(Stride), Scev(Scev), Size(Size), Align(Align) {}
// The access's stride. It is negative for a reverse access.
int64_t Stride = 0;
// The scalar expression of this access.
const SCEV *Scev = nullptr;
// The size of the memory object.
uint64_t Size = 0;
// The alignment of this access.
unsigned Align = 0;
};
/// \brief A type for holding instructions and their stride descriptors.
using StrideEntry = std::pair<Instruction *, StrideDescriptor>;
/// \brief Create a new interleave group with the given instruction \p Instr,
/// stride \p Stride and alignment \p Align.
///
/// \returns the newly created interleave group.
InterleaveGroup *createInterleaveGroup(Instruction *Instr, int Stride,
unsigned Align) {
assert(!InterleaveGroupMap.count(Instr) &&
"Already in an interleaved access group");
InterleaveGroupMap[Instr] = new InterleaveGroup(Instr, Stride, Align);
return InterleaveGroupMap[Instr];
}
/// \brief Release the group and remove all the relationships.
void releaseGroup(InterleaveGroup *Group) {
for (unsigned i = 0; i < Group->getFactor(); i++)
if (Instruction *Member = Group->getMember(i))
InterleaveGroupMap.erase(Member);
delete Group;
}
/// \brief Collect all the accesses with a constant stride in program order.
void collectConstStrideAccesses(
MapVector<Instruction *, StrideDescriptor> &AccessStrideInfo,
const ValueToValueMap &Strides);
/// \brief Returns true if \p Stride is allowed in an interleaved group.
static bool isStrided(int Stride) {
unsigned Factor = std::abs(Stride);
return Factor >= 2 && Factor <= MaxInterleaveGroupFactor;
}
/// \brief Returns true if \p BB is a predicated block.
bool isPredicated(BasicBlock *BB) const {
return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
}
/// \brief Returns true if LoopAccessInfo can be used for dependence queries.
bool areDependencesValid() const {
return LAI && LAI->getDepChecker().getDependences();
}
/// \brief Returns true if memory accesses \p A and \p B can be reordered, if
/// necessary, when constructing interleaved groups.
///
/// \p A must precede \p B in program order. We return false if reordering is
/// not necessary or is prevented because \p A and \p B may be dependent.
bool canReorderMemAccessesForInterleavedGroups(StrideEntry *A,
StrideEntry *B) const {
// Code motion for interleaved accesses can potentially hoist strided loads
// and sink strided stores. The code below checks the legality of the
// following two conditions:
//
// 1. Potentially moving a strided load (B) before any store (A) that
// precedes B, or
//
// 2. Potentially moving a strided store (A) after any load or store (B)
// that A precedes.
//
// It's legal to reorder A and B if we know there isn't a dependence from A
// to B. Note that this determination is conservative since some
// dependences could potentially be reordered safely.
// A is potentially the source of a dependence.
auto *Src = A->first;
auto SrcDes = A->second;
// B is potentially the sink of a dependence.
auto *Sink = B->first;
auto SinkDes = B->second;
// Code motion for interleaved accesses can't violate WAR dependences.
// Thus, reordering is legal if the source isn't a write.
if (!Src->mayWriteToMemory())
return true;
// At least one of the accesses must be strided.
if (!isStrided(SrcDes.Stride) && !isStrided(SinkDes.Stride))
return true;
// If dependence information is not available from LoopAccessInfo,
// conservatively assume the instructions can't be reordered.
if (!areDependencesValid())
return false;
// If we know there is a dependence from source to sink, assume the
// instructions can't be reordered. Otherwise, reordering is legal.
return !Dependences.count(Src) || !Dependences.lookup(Src).count(Sink);
}
/// \brief Collect the dependences from LoopAccessInfo.
///
/// We process the dependences once during the interleaved access analysis to
/// enable constant-time dependence queries.
void collectDependences() {
if (!areDependencesValid())
return;
auto *Deps = LAI->getDepChecker().getDependences();
for (auto Dep : *Deps)
Dependences[Dep.getSource(*LAI)].insert(Dep.getDestination(*LAI));
}
};
/// Utility class for getting and setting loop vectorizer hints in the form
/// of loop metadata.
/// This class keeps a number of loop annotations locally (as member variables)
/// and can, upon request, write them back as metadata on the loop. It will
/// initially scan the loop for existing metadata, and will update the local
/// values based on information in the loop.
/// We cannot write all values to metadata, as the mere presence of some info,
/// for example 'force', means a decision has been made. So, we need to be
/// careful NOT to add them if the user hasn't specifically asked so.
class LoopVectorizeHints {
enum HintKind { HK_WIDTH, HK_UNROLL, HK_FORCE, HK_ISVECTORIZED };
/// Hint - associates name and validation with the hint value.
struct Hint {
const char *Name;
unsigned Value; // This may have to change for non-numeric values.
HintKind Kind;
Hint(const char *Name, unsigned Value, HintKind Kind)
: Name(Name), Value(Value), Kind(Kind) {}
bool validate(unsigned Val) {
switch (Kind) {
case HK_WIDTH:
return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth;
case HK_UNROLL:
return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor;
case HK_FORCE:
return (Val <= 1);
case HK_ISVECTORIZED:
return (Val==0 || Val==1);
}
return false;
}
};
/// Vectorization width.
Hint Width;
/// Vectorization interleave factor.
Hint Interleave;
/// Vectorization forced
Hint Force;
/// Already Vectorized
Hint IsVectorized;
/// Return the loop metadata prefix.
static StringRef Prefix() { return "llvm.loop."; }
/// True if there is any unsafe math in the loop.
bool PotentiallyUnsafe = false;
public:
enum ForceKind {
FK_Undefined = -1, ///< Not selected.
FK_Disabled = 0, ///< Forcing disabled.
FK_Enabled = 1, ///< Forcing enabled.
};
LoopVectorizeHints(const Loop *L, bool DisableInterleaving,
OptimizationRemarkEmitter &ORE)
: Width("vectorize.width", VectorizerParams::VectorizationFactor,
HK_WIDTH),
Interleave("interleave.count", DisableInterleaving, HK_UNROLL),
Force("vectorize.enable", FK_Undefined, HK_FORCE),
IsVectorized("isvectorized", 0, HK_ISVECTORIZED), TheLoop(L), ORE(ORE) {
// Populate values with existing loop metadata.
getHintsFromMetadata();
// force-vector-interleave overrides DisableInterleaving.
if (VectorizerParams::isInterleaveForced())
Interleave.Value = VectorizerParams::VectorizationInterleave;
if (IsVectorized.Value != 1)
// If the vectorization width and interleaving count are both 1 then
// consider the loop to have been already vectorized because there's
// nothing more that we can do.
IsVectorized.Value = Width.Value == 1 && Interleave.Value == 1;
DEBUG(if (DisableInterleaving && Interleave.Value == 1) dbgs()
<< "LV: Interleaving disabled by the pass manager\n");
}
/// Mark the loop L as already vectorized by setting the width to 1.
void setAlreadyVectorized() {
IsVectorized.Value = 1;
Hint Hints[] = {IsVectorized};
writeHintsToMetadata(Hints);
}
bool allowVectorization(Function *F, Loop *L, bool AlwaysVectorize) const {
if (getForce() == LoopVectorizeHints::FK_Disabled) {
DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n");
emitRemarkWithHints();
return false;
}
if (!AlwaysVectorize && getForce() != LoopVectorizeHints::FK_Enabled) {
DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n");
emitRemarkWithHints();
return false;
}
if (getIsVectorized() == 1) {
DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n");
// FIXME: Add interleave.disable metadata. This will allow
// vectorize.disable to be used without disabling the pass and errors
// to differentiate between disabled vectorization and a width of 1.
ORE.emit([&]() {
return OptimizationRemarkAnalysis(vectorizeAnalysisPassName(),
"AllDisabled", L->getStartLoc(),
L->getHeader())
<< "loop not vectorized: vectorization and interleaving are "
"explicitly disabled, or the loop has already been "
"vectorized";
});
return false;
}
return true;
}
/// Dumps all the hint information.
void emitRemarkWithHints() const {
using namespace ore;
ORE.emit([&]() {
if (Force.Value == LoopVectorizeHints::FK_Disabled)
return OptimizationRemarkMissed(LV_NAME, "MissedExplicitlyDisabled",
TheLoop->getStartLoc(),
TheLoop->getHeader())
<< "loop not vectorized: vectorization is explicitly disabled";
else {
OptimizationRemarkMissed R(LV_NAME, "MissedDetails",
TheLoop->getStartLoc(),
TheLoop->getHeader());
R << "loop not vectorized";
if (Force.Value == LoopVectorizeHints::FK_Enabled) {
R << " (Force=" << NV("Force", true);
if (Width.Value != 0)
R << ", Vector Width=" << NV("VectorWidth", Width.Value);
if (Interleave.Value != 0)
R << ", Interleave Count="
<< NV("InterleaveCount", Interleave.Value);
R << ")";
}
return R;
}
});
}
unsigned getWidth() const { return Width.Value; }
unsigned getInterleave() const { return Interleave.Value; }
unsigned getIsVectorized() const { return IsVectorized.Value; }
enum ForceKind getForce() const { return (ForceKind)Force.Value; }
/// \brief If hints are provided that force vectorization, use the AlwaysPrint
/// pass name to force the frontend to print the diagnostic.
const char *vectorizeAnalysisPassName() const {
if (getWidth() == 1)
return LV_NAME;
if (getForce() == LoopVectorizeHints::FK_Disabled)
return LV_NAME;
if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth() == 0)
return LV_NAME;
return OptimizationRemarkAnalysis::AlwaysPrint;
}
bool allowReordering() const {
// When enabling loop hints are provided we allow the vectorizer to change
// the order of operations that is given by the scalar loop. This is not
// enabled by default because can be unsafe or inefficient. For example,
// reordering floating-point operations will change the way round-off
// error accumulates in the loop.
return getForce() == LoopVectorizeHints::FK_Enabled || getWidth() > 1;
}
bool isPotentiallyUnsafe() const {
// Avoid FP vectorization if the target is unsure about proper support.
// This may be related to the SIMD unit in the target not handling
// IEEE 754 FP ops properly, or bad single-to-double promotions.
// Otherwise, a sequence of vectorized loops, even without reduction,
// could lead to different end results on the destination vectors.
return getForce() != LoopVectorizeHints::FK_Enabled && PotentiallyUnsafe;
}
void setPotentiallyUnsafe() { PotentiallyUnsafe = true; }
private:
/// Find hints specified in the loop metadata and update local values.
void getHintsFromMetadata() {
MDNode *LoopID = TheLoop->getLoopID();
if (!LoopID)
return;
// First operand should refer to the loop id itself.
assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
const MDString *S = nullptr;
SmallVector<Metadata *, 4> Args;
// The expected hint is either a MDString or a MDNode with the first
// operand a MDString.
if (const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i))) {
if (!MD || MD->getNumOperands() == 0)
continue;
S = dyn_cast<MDString>(MD->getOperand(0));
for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i)
Args.push_back(MD->getOperand(i));
} else {
S = dyn_cast<MDString>(LoopID->getOperand(i));
assert(Args.size() == 0 && "too many arguments for MDString");
}
if (!S)
continue;
// Check if the hint starts with the loop metadata prefix.
StringRef Name = S->getString();
if (Args.size() == 1)
setHint(Name, Args[0]);
}
}
/// Checks string hint with one operand and set value if valid.
void setHint(StringRef Name, Metadata *Arg) {
if (!Name.startswith(Prefix()))
return;
Name = Name.substr(Prefix().size(), StringRef::npos);
const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg);
if (!C)
return;
unsigned Val = C->getZExtValue();
Hint *Hints[] = {&Width, &Interleave, &Force, &IsVectorized};
for (auto H : Hints) {
if (Name == H->Name) {
if (H->validate(Val))
H->Value = Val;
else
DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n");
break;
}
}
}
/// Create a new hint from name / value pair.
MDNode *createHintMetadata(StringRef Name, unsigned V) const {
LLVMContext &Context = TheLoop->getHeader()->getContext();
Metadata *MDs[] = {MDString::get(Context, Name),
ConstantAsMetadata::get(
ConstantInt::get(Type::getInt32Ty(Context), V))};
return MDNode::get(Context, MDs);
}
/// Matches metadata with hint name.
bool matchesHintMetadataName(MDNode *Node, ArrayRef<Hint> HintTypes) {
MDString *Name = dyn_cast<MDString>(Node->getOperand(0));
if (!Name)
return false;
for (auto H : HintTypes)
if (Name->getString().endswith(H.Name))
return true;
return false;
}
/// Sets current hints into loop metadata, keeping other values intact.
void writeHintsToMetadata(ArrayRef<Hint> HintTypes) {
if (HintTypes.empty())
return;
// Reserve the first element to LoopID (see below).
SmallVector<Metadata *, 4> MDs(1);
// If the loop already has metadata, then ignore the existing operands.
MDNode *LoopID = TheLoop->getLoopID();
if (LoopID) {
for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
MDNode *Node = cast<MDNode>(LoopID->getOperand(i));
// If node in update list, ignore old value.
if (!matchesHintMetadataName(Node, HintTypes))
MDs.push_back(Node);
}
}
// Now, add the missing hints.
for (auto H : HintTypes)
MDs.push_back(createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value));
// Replace current metadata node with new one.
LLVMContext &Context = TheLoop->getHeader()->getContext();
MDNode *NewLoopID = MDNode::get(Context, MDs);
// Set operand 0 to refer to the loop id itself.
NewLoopID->replaceOperandWith(0, NewLoopID);
TheLoop->setLoopID(NewLoopID);
}
/// The loop these hints belong to.
const Loop *TheLoop;
/// Interface to emit optimization remarks.
OptimizationRemarkEmitter &ORE;
};
} // end anonymous namespace
static void emitMissedWarning(Function *F, Loop *L,
const LoopVectorizeHints &LH,
OptimizationRemarkEmitter *ORE) {
LH.emitRemarkWithHints();
if (LH.getForce() == LoopVectorizeHints::FK_Enabled) {
if (LH.getWidth() != 1)
ORE->emit(DiagnosticInfoOptimizationFailure(
DEBUG_TYPE, "FailedRequestedVectorization",
L->getStartLoc(), L->getHeader())
<< "loop not vectorized: "
<< "failed explicitly specified loop vectorization");
else if (LH.getInterleave() != 1)
ORE->emit(DiagnosticInfoOptimizationFailure(
DEBUG_TYPE, "FailedRequestedInterleaving", L->getStartLoc(),
L->getHeader())
<< "loop not interleaved: "
<< "failed explicitly specified loop interleaving");
}
}
namespace {
/// LoopVectorizationLegality checks if it is legal to vectorize a loop, and
/// to what vectorization factor.
/// This class does not look at the profitability of vectorization, only the
/// legality. This class has two main kinds of checks:
/// * Memory checks - The code in canVectorizeMemory checks if vectorization
/// will change the order of memory accesses in a way that will change the
/// correctness of the program.
/// * Scalars checks - The code in canVectorizeInstrs and canVectorizeMemory
/// checks for a number of different conditions, such as the availability of a
/// single induction variable, that all types are supported and vectorize-able,
/// etc. This code reflects the capabilities of InnerLoopVectorizer.
/// This class is also used by InnerLoopVectorizer for identifying
/// induction variable and the different reduction variables.
class LoopVectorizationLegality {
public:
LoopVectorizationLegality(
Loop *L, PredicatedScalarEvolution &PSE, DominatorTree *DT,
TargetLibraryInfo *TLI, AliasAnalysis *AA, Function *F,
const TargetTransformInfo *TTI,
std::function<const LoopAccessInfo &(Loop &)> *GetLAA, LoopInfo *LI,
OptimizationRemarkEmitter *ORE, LoopVectorizationRequirements *R,
LoopVectorizeHints *H, DemandedBits *DB, AssumptionCache *AC)
: TheLoop(L), PSE(PSE), TLI(TLI), TTI(TTI), DT(DT), GetLAA(GetLAA),
ORE(ORE), InterleaveInfo(PSE, L, DT, LI), Requirements(R), Hints(H),
DB(DB), AC(AC) {}
/// ReductionList contains the reduction descriptors for all
/// of the reductions that were found in the loop.
using ReductionList = DenseMap<PHINode *, RecurrenceDescriptor>;
/// InductionList saves induction variables and maps them to the
/// induction descriptor.
using InductionList = MapVector<PHINode *, InductionDescriptor>;
/// RecurrenceSet contains the phi nodes that are recurrences other than
/// inductions and reductions.
using RecurrenceSet = SmallPtrSet<const PHINode *, 8>;
/// Returns true if it is legal to vectorize this loop.
/// This does not mean that it is profitable to vectorize this
/// loop, only that it is legal to do so.
bool canVectorize();
/// Returns the primary induction variable.
PHINode *getPrimaryInduction() { return PrimaryInduction; }
/// Returns the reduction variables found in the loop.
ReductionList *getReductionVars() { return &Reductions; }
/// Returns the induction variables found in the loop.
InductionList *getInductionVars() { return &Inductions; }
/// Return the first-order recurrences found in the loop.
RecurrenceSet *getFirstOrderRecurrences() { return &FirstOrderRecurrences; }
/// Return the set of instructions to sink to handle first-order recurrences.
DenseMap<Instruction *, Instruction *> &getSinkAfter() { return SinkAfter; }
/// Returns the widest induction type.
Type *getWidestInductionType() { return WidestIndTy; }
/// Returns True if V is a Phi node of an induction variable in this loop.
bool isInductionPhi(const Value *V);
/// Returns True if V is a cast that is part of an induction def-use chain,
/// and had been proven to be redundant under a runtime guard (in other
/// words, the cast has the same SCEV expression as the induction phi).
bool isCastedInductionVariable(const Value *V);
/// Returns True if V can be considered as an induction variable in this
/// loop. V can be the induction phi, or some redundant cast in the def-use
/// chain of the inducion phi.
bool isInductionVariable(const Value *V);
/// Returns True if PN is a reduction variable in this loop.
bool isReductionVariable(PHINode *PN) { return Reductions.count(PN); }
/// Returns True if Phi is a first-order recurrence in this loop.
bool isFirstOrderRecurrence(const PHINode *Phi);
/// Return true if the block BB needs to be predicated in order for the loop
/// to be vectorized.
bool blockNeedsPredication(BasicBlock *BB);
/// Check if this pointer is consecutive when vectorizing. This happens
/// when the last index of the GEP is the induction variable, or that the
/// pointer itself is an induction variable.
/// This check allows us to vectorize A[idx] into a wide load/store.
/// Returns:
/// 0 - Stride is unknown or non-consecutive.
/// 1 - Address is consecutive.
/// -1 - Address is consecutive, and decreasing.
/// NOTE: This method must only be used before modifying the original scalar
/// loop. Do not use after invoking 'createVectorizedLoopSkeleton' (PR34965).
int isConsecutivePtr(Value *Ptr);
/// Returns true if the value V is uniform within the loop.
bool isUniform(Value *V);
/// Returns the information that we collected about runtime memory check.
const RuntimePointerChecking *getRuntimePointerChecking() const {
return LAI->getRuntimePointerChecking();
}
const LoopAccessInfo *getLAI() const { return LAI; }
/// \brief Check if \p Instr belongs to any interleaved access group.
bool isAccessInterleaved(Instruction *Instr) {
return InterleaveInfo.isInterleaved(Instr);
}
/// \brief Get the interleaved access group that \p Instr belongs to.
const InterleaveGroup *getInterleavedAccessGroup(Instruction *Instr) {
return InterleaveInfo.getInterleaveGroup(Instr);
}
/// \brief Returns true if an interleaved group requires a scalar iteration
/// to handle accesses with gaps.
bool requiresScalarEpilogue() const {
return InterleaveInfo.requiresScalarEpilogue();
}
unsigned getMaxSafeDepDistBytes() { return LAI->getMaxSafeDepDistBytes(); }
uint64_t getMaxSafeRegisterWidth() const {
return LAI->getDepChecker().getMaxSafeRegisterWidth();
}
bool hasStride(Value *V) { return LAI->hasStride(V); }
/// Returns true if the target machine supports masked store operation
/// for the given \p DataType and kind of access to \p Ptr.
bool isLegalMaskedStore(Type *DataType, Value *Ptr) {
return isConsecutivePtr(Ptr) && TTI->isLegalMaskedStore(DataType);
}
/// Returns true if the target machine supports masked load operation
/// for the given \p DataType and kind of access to \p Ptr.
bool isLegalMaskedLoad(Type *DataType, Value *Ptr) {
return isConsecutivePtr(Ptr) && TTI->isLegalMaskedLoad(DataType);
}
/// Returns true if the target machine supports masked scatter operation
/// for the given \p DataType.
bool isLegalMaskedScatter(Type *DataType) {
return TTI->isLegalMaskedScatter(DataType);
}
/// Returns true if the target machine supports masked gather operation
/// for the given \p DataType.
bool isLegalMaskedGather(Type *DataType) {
return TTI->isLegalMaskedGather(DataType);
}
/// Returns true if the target machine can represent \p V as a masked gather
/// or scatter operation.
bool isLegalGatherOrScatter(Value *V) {
auto *LI = dyn_cast<LoadInst>(V);
auto *SI = dyn_cast<StoreInst>(V);
if (!LI && !SI)
return false;
auto *Ptr = getPointerOperand(V);
auto *Ty = cast<PointerType>(Ptr->getType())->getElementType();
return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty));
}
/// Returns true if vector representation of the instruction \p I
/// requires mask.
bool isMaskRequired(const Instruction *I) { return (MaskedOp.count(I) != 0); }
unsigned getNumStores() const { return LAI->getNumStores(); }
unsigned getNumLoads() const { return LAI->getNumLoads(); }
unsigned getNumPredStores() const { return NumPredStores; }
/// Returns true if \p I is an instruction that will be scalarized with
/// predication. Such instructions include conditional stores and
/// instructions that may divide by zero.
bool isScalarWithPredication(Instruction *I);
/// Returns true if \p I is a memory instruction with consecutive memory
/// access that can be widened.
bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
// Returns true if the NoNaN attribute is set on the function.
bool hasFunNoNaNAttr() const { return HasFunNoNaNAttr; }
private:
/// Check if a single basic block loop is vectorizable.
/// At this point we know that this is a loop with a constant trip count
/// and we only need to check individual instructions.
bool canVectorizeInstrs();
/// When we vectorize loops we may change the order in which
/// we read and write from memory. This method checks if it is
/// legal to vectorize the code, considering only memory constrains.
/// Returns true if the loop is vectorizable
bool canVectorizeMemory();
/// Return true if we can vectorize this loop using the IF-conversion
/// transformation.
bool canVectorizeWithIfConvert();
/// Return true if all of the instructions in the block can be speculatively
/// executed. \p SafePtrs is a list of addresses that are known to be legal
/// and we know that we can read from them without segfault.
bool blockCanBePredicated(BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs);
/// Updates the vectorization state by adding \p Phi to the inductions list.
/// This can set \p Phi as the main induction of the loop if \p Phi is a
/// better choice for the main induction than the existing one.
void addInductionPhi(PHINode *Phi, const InductionDescriptor &ID,
SmallPtrSetImpl<Value *> &AllowedExit);
/// Create an analysis remark that explains why vectorization failed
///
/// \p RemarkName is the identifier for the remark. If \p I is passed it is
/// an instruction that prevents vectorization. Otherwise the loop is used
/// for the location of the remark. \return the remark object that can be
/// streamed to.
OptimizationRemarkAnalysis
createMissedAnalysis(StringRef RemarkName, Instruction *I = nullptr) const {
return ::createMissedAnalysis(Hints->vectorizeAnalysisPassName(),
RemarkName, TheLoop, I);
}
/// \brief If an access has a symbolic strides, this maps the pointer value to
/// the stride symbol.
const ValueToValueMap *getSymbolicStrides() {
// FIXME: Currently, the set of symbolic strides is sometimes queried before
// it's collected. This happens from canVectorizeWithIfConvert, when the
// pointer is checked to reference consecutive elements suitable for a
// masked access.
return LAI ? &LAI->getSymbolicStrides() : nullptr;
}
unsigned NumPredStores = 0;
/// The loop that we evaluate.
Loop *TheLoop;
/// A wrapper around ScalarEvolution used to add runtime SCEV checks.
/// Applies dynamic knowledge to simplify SCEV expressions in the context
/// of existing SCEV assumptions. The analysis will also add a minimal set
/// of new predicates if this is required to enable vectorization and
/// unrolling.
PredicatedScalarEvolution &PSE;
/// Target Library Info.
TargetLibraryInfo *TLI;
/// Target Transform Info
const TargetTransformInfo *TTI;
/// Dominator Tree.
DominatorTree *DT;
// LoopAccess analysis.
std::function<const LoopAccessInfo &(Loop &)> *GetLAA;
// And the loop-accesses info corresponding to this loop. This pointer is
// null until canVectorizeMemory sets it up.
const LoopAccessInfo *LAI = nullptr;
/// Interface to emit optimization remarks.
OptimizationRemarkEmitter *ORE;
/// The interleave access information contains groups of interleaved accesses
/// with the same stride and close to each other.
InterleavedAccessInfo InterleaveInfo;
// --- vectorization state --- //
/// Holds the primary induction variable. This is the counter of the
/// loop.
PHINode *PrimaryInduction = nullptr;
/// Holds the reduction variables.
ReductionList Reductions;
/// Holds all of the induction variables that we found in the loop.
/// Notice that inductions don't need to start at zero and that induction
/// variables can be pointers.
InductionList Inductions;
/// Holds all the casts that participate in the update chain of the induction
/// variables, and that have been proven to be redundant (possibly under a
/// runtime guard). These casts can be ignored when creating the vectorized
/// loop body.
SmallPtrSet<Instruction *, 4> InductionCastsToIgnore;
/// Holds the phi nodes that are first-order recurrences.
RecurrenceSet FirstOrderRecurrences;
/// Holds instructions that need to sink past other instructions to handle
/// first-order recurrences.
DenseMap<Instruction *, Instruction *> SinkAfter;
/// Holds the widest induction type encountered.
Type *WidestIndTy = nullptr;
/// Allowed outside users. This holds the induction and reduction
/// vars which can be accessed from outside the loop.
SmallPtrSet<Value *, 4> AllowedExit;
/// Can we assume the absence of NaNs.
bool HasFunNoNaNAttr = false;
/// Vectorization requirements that will go through late-evaluation.
LoopVectorizationRequirements *Requirements;
/// Used to emit an analysis of any legality issues.
LoopVectorizeHints *Hints;
/// The demanded bits analsyis is used to compute the minimum type size in
/// which a reduction can be computed.
DemandedBits *DB;
/// The assumption cache analysis is used to compute the minimum type size in
/// which a reduction can be computed.
AssumptionCache *AC;
/// While vectorizing these instructions we have to generate a
/// call to the appropriate masked intrinsic
SmallPtrSet<const Instruction *, 8> MaskedOp;
};
/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
LoopVectorizationCostModel(Loop *L, PredicatedScalarEvolution &PSE,
LoopInfo *LI, LoopVectorizationLegality *Legal,
const TargetTransformInfo &TTI,
const TargetLibraryInfo *TLI, DemandedBits *DB,
AssumptionCache *AC,
OptimizationRemarkEmitter *ORE, const Function *F,
const LoopVectorizeHints *Hints)
: TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB),
AC(AC), ORE(ORE), TheFunction(F), Hints(Hints) {}
/// \return An upper bound for the vectorization factor, or None if
/// vectorization should be avoided up front.
Optional<unsigned> computeMaxVF(bool OptForSize);
/// Information about vectorization costs
struct VectorizationFactor {
// Vector width with best cost
unsigned Width;
// Cost of the loop with that width
unsigned Cost;
};
/// \return The most profitable vectorization factor and the cost of that VF.
/// This method checks every power of two up to MaxVF. If UserVF is not ZERO
/// then this vectorization factor will be selected if vectorization is
/// possible.
VectorizationFactor selectVectorizationFactor(unsigned MaxVF);
/// Setup cost-based decisions for user vectorization factor.
void selectUserVectorizationFactor(unsigned UserVF) {
collectUniformsAndScalars(UserVF);
collectInstsToScalarize(UserVF);
}
/// \return The size (in bits) of the smallest and widest types in the code
/// that needs to be vectorized. We ignore values that remain scalar such as
/// 64 bit loop indices.
std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
/// \return The desired interleave count.
/// If interleave count has been specified by metadata it will be returned.
/// Otherwise, the interleave count is computed and returned. VF and LoopCost
/// are the selected vectorization factor and the cost of the selected VF.
unsigned selectInterleaveCount(bool OptForSize, unsigned VF,
unsigned LoopCost);
/// Memory access instruction may be vectorized in more than one way.
/// Form of instruction after vectorization depends on cost.
/// This function takes cost-based decisions for Load/Store instructions
/// and collects them in a map. This decisions map is used for building
/// the lists of loop-uniform and loop-scalar instructions.
/// The calculated cost is saved with widening decision in order to
/// avoid redundant calculations.
void setCostBasedWideningDecision(unsigned VF);
/// \brief A struct that represents some properties of the register usage
/// of a loop.
struct RegisterUsage {
/// Holds the number of loop invariant values that are used in the loop.
unsigned LoopInvariantRegs;
/// Holds the maximum number of concurrent live intervals in the loop.
unsigned MaxLocalUsers;
/// Holds the number of instructions in the loop.
unsigned NumInstructions;
};
/// \return Returns information about the register usages of the loop for the
/// given vectorization factors.
SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);
/// Collect values we want to ignore in the cost model.
void collectValuesToIgnore();
/// \returns The smallest bitwidth each instruction can be represented with.
/// The vector equivalents of these instructions should be truncated to this
/// type.
const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
return MinBWs;
}
/// \returns True if it is more profitable to scalarize instruction \p I for
/// vectorization factor \p VF.
bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");
auto Scalars = InstsToScalarize.find(VF);
assert(Scalars != InstsToScalarize.end() &&
"VF not yet analyzed for scalarization profitability");
return Scalars->second.count(I);
}
/// Returns true if \p I is known to be uniform after vectorization.
bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
if (VF == 1)
return true;
assert(Uniforms.count(VF) && "VF not yet analyzed for uniformity");
auto UniformsPerVF = Uniforms.find(VF);
return UniformsPerVF->second.count(I);
}
/// Returns true if \p I is known to be scalar after vectorization.
bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
if (VF == 1)
return true;
assert(Scalars.count(VF) && "Scalar values are not calculated for VF");
auto ScalarsPerVF = Scalars.find(VF);
return ScalarsPerVF->second.count(I);
}
/// \returns True if instruction \p I can be truncated to a smaller bitwidth
/// for vectorization factor \p VF.
bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
return VF > 1 && MinBWs.count(I) && !isProfitableToScalarize(I, VF) &&
!isScalarAfterVectorization(I, VF);
}
/// Decision that was taken during cost calculation for memory instruction.
enum InstWidening {
CM_Unknown,
CM_Widen, // For consecutive accesses with stride +1.
CM_Widen_Reverse, // For consecutive accesses with stride -1.
CM_Interleave,
CM_GatherScatter,
CM_Scalarize
};
/// Save vectorization decision \p W and \p Cost taken by the cost model for
/// instruction \p I and vector width \p VF.
void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
unsigned Cost) {
assert(VF >= 2 && "Expected VF >=2");
WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
}
/// Save vectorization decision \p W and \p Cost taken by the cost model for
/// interleaving group \p Grp and vector width \p VF.
void setWideningDecision(const InterleaveGroup *Grp, unsigned VF,
InstWidening W, unsigned Cost) {
assert(VF >= 2 && "Expected VF >=2");
/// Broadcast this decicion to all instructions inside the group.
/// But the cost will be assigned to one instruction only.
for (unsigned i = 0; i < Grp->getFactor(); ++i) {
if (auto *I = Grp->getMember(i)) {
if (Grp->getInsertPos() == I)
WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
else
WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
}
}
}
/// Return the cost model decision for the given instruction \p I and vector
/// width \p VF. Return CM_Unknown if this instruction did not pass
/// through the cost modeling.
InstWidening getWideningDecision(Instruction *I, unsigned VF) {
assert(VF >= 2 && "Expected VF >=2");
std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
auto Itr = WideningDecisions.find(InstOnVF);
if (Itr == WideningDecisions.end())
return CM_Unknown;
return Itr->second.first;
}
/// Return the vectorization cost for the given instruction \p I and vector
/// width \p VF.
unsigned getWideningCost(Instruction *I, unsigned VF) {
assert(VF >= 2 && "Expected VF >=2");
std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
assert(WideningDecisions.count(InstOnVF) && "The cost is not calculated");
return WideningDecisions[InstOnVF].second;
}
/// Return True if instruction \p I is an optimizable truncate whose operand
/// is an induction variable. Such a truncate will be removed by adding a new
/// induction variable with the destination type.
bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
// If the instruction is not a truncate, return false.
auto *Trunc = dyn_cast<TruncInst>(I);
if (!Trunc)
return false;
// Get the source and destination types of the truncate.
Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
// If the truncate is free for the given types, return false. Replacing a
// free truncate with an induction variable would add an induction variable
// update instruction to each iteration of the loop. We exclude from this
// check the primary induction variable since it will need an update
// instruction regardless.
Value *Op = Trunc->getOperand(0);
if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
return false;
// If the truncated value is not an induction variable, return false.
return Legal->isInductionPhi(Op);
}
/// Collects the instructions to scalarize for each predicated instruction in
/// the loop.
void collectInstsToScalarize(unsigned VF);
/// Collect Uniform and Scalar values for the given \p VF.
/// The sets depend on CM decision for Load/Store instructions
/// that may be vectorized as interleave, gather-scatter or scalarized.
void collectUniformsAndScalars(unsigned VF) {
// Do the analysis once.
if (VF == 1 || Uniforms.count(VF))
return;
setCostBasedWideningDecision(VF);
collectLoopUniforms(VF);
collectLoopScalars(VF);
}
private:
/// \return An upper bound for the vectorization factor, larger than zero.
/// One is returned if vectorization should best be avoided due to cost.
unsigned computeFeasibleMaxVF(bool OptForSize, unsigned ConstTripCount);
/// The vectorization cost is a combination of the cost itself and a boolean
/// indicating whether any of the contributing operations will actually
/// operate on
/// vector values after type legalization in the backend. If this latter value
/// is
/// false, then all operations will be scalarized (i.e. no vectorization has
/// actually taken place).
using VectorizationCostTy = std::pair<unsigned, bool>;
/// Returns the expected execution cost. The unit of the cost does
/// not matter because we use the 'cost' units to compare different
/// vector widths. The cost that is returned is *not* normalized by
/// the factor width.
VectorizationCostTy expectedCost(unsigned VF);
/// Returns the execution time cost of an instruction for a given vector
/// width. Vector width of one means scalar.
VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);
/// The cost-computation logic from getInstructionCost which provides
/// the vector type as an output parameter.
unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
/// Calculate vectorization cost of memory instruction \p I.
unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);
/// The cost computation for scalarized memory instruction.
unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);
/// The cost computation for interleaving group of memory instructions.
unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
/// The cost computation for Gather/Scatter instruction.
unsigned getGatherScatterCost(Instruction *I, unsigned VF);
/// The cost computation for widening instruction \p I with consecutive
/// memory access.
unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
/// The cost calculation for Load instruction \p I with uniform pointer -
/// scalar load + broadcast.
unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
/// Returns whether the instruction is a load or store and will be a emitted
/// as a vector operation.
bool isConsecutiveLoadOrStore(Instruction *I);
/// Create an analysis remark that explains why vectorization failed
///
/// \p RemarkName is the identifier for the remark. \return the remark object
/// that can be streamed to.
OptimizationRemarkAnalysis createMissedAnalysis(StringRef RemarkName) {
return ::createMissedAnalysis(Hints->vectorizeAnalysisPassName(),
RemarkName, TheLoop);
}
/// Map of scalar integer values to the smallest bitwidth they can be legally
/// represented as. The vector equivalents of these values should be truncated
/// to this type.
MapVector<Instruction *, uint64_t> MinBWs;
/// A type representing the costs for instructions if they were to be
/// scalarized rather than vectorized. The entries are Instruction-Cost
/// pairs.
using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
/// A set containing all BasicBlocks that are known to present after
/// vectorization as a predicated block.
SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
/// A map holding scalar costs for different vectorization factors. The
/// presence of a cost for an instruction in the mapping indicates that the
/// instruction will be scalarized when vectorizing with the associated
/// vectorization factor. The entries are VF-ScalarCostTy pairs.
DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
/// Holds the instructions known to be uniform after vectorization.
/// The data is collected per VF.
DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;
/// Holds the instructions known to be scalar after vectorization.
/// The data is collected per VF.
DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;
/// Holds the instructions (address computations) that are forced to be
/// scalarized.
DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars;
/// Returns the expected difference in cost from scalarizing the expression
/// feeding a predicated instruction \p PredInst. The instructions to
/// scalarize and their scalar costs are collected in \p ScalarCosts. A
/// non-negative return value implies the expression will be scalarized.
/// Currently, only single-use chains are considered for scalarization.
int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
unsigned VF);
/// Collect the instructions that are uniform after vectorization. An
/// instruction is uniform if we represent it with a single scalar value in
/// the vectorized loop corresponding to each vector iteration. Examples of
/// uniform instructions include pointer operands of consecutive or
/// interleaved memory accesses. Note that although uniformity implies an
/// instruction will be scalar, the reverse is not true. In general, a
/// scalarized instruction will be represented by VF scalar values in the
/// vectorized loop, each corresponding to an iteration of the original
/// scalar loop.
void collectLoopUniforms(unsigned VF);
/// Collect the instructions that are scalar after vectorization. An
/// instruction is scalar if it is known to be uniform or will be scalarized
/// during vectorization. Non-uniform scalarized instructions will be
/// represented by VF values in the vectorized loop, each corresponding to an
/// iteration of the original scalar loop.
void collectLoopScalars(unsigned VF);
/// Keeps cost model vectorization decision and cost for instructions.
/// Right now it is used for memory instructions only.
using DecisionList = DenseMap<std::pair<Instruction *, unsigned>,
std::pair<InstWidening, unsigned>>;
DecisionList WideningDecisions;
public:
/// The loop that we evaluate.
Loop *TheLoop;
/// Predicated scalar evolution analysis.
PredicatedScalarEvolution &PSE;
/// Loop Info analysis.
LoopInfo *LI;
/// Vectorization legality.
LoopVectorizationLegality *Legal;
/// Vector target information.
const TargetTransformInfo &TTI;
/// Target Library Info.
const TargetLibraryInfo *TLI;
/// Demanded bits analysis.
DemandedBits *DB;
/// Assumption cache.
AssumptionCache *AC;
/// Interface to emit optimization remarks.
OptimizationRemarkEmitter *ORE;
const Function *TheFunction;
/// Loop Vectorize Hint.
const LoopVectorizeHints *Hints;
/// Values to ignore in the cost model.
SmallPtrSet<const Value *, 16> ValuesToIgnore;
/// Values to ignore in the cost model when VF > 1.
SmallPtrSet<const Value *, 16> VecValuesToIgnore;
};
} // end anonymous namespace
namespace llvm {
/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// LoopVectorizationPlanner - drives the vectorization process after having
/// passed Legality checks.
/// The planner builds and optimizes the Vectorization Plans which record the
/// decisions how to vectorize the given loop. In particular, represent the
/// control-flow of the vectorized version, the replication of instructions that
/// are to be scalarized, and interleave access groups.
class LoopVectorizationPlanner {
/// The loop that we evaluate.
Loop *OrigLoop;
/// Loop Info analysis.
LoopInfo *LI;
/// Target Library Info.
const TargetLibraryInfo *TLI;
/// Target Transform Info.
const TargetTransformInfo *TTI;
/// The legality analysis.
LoopVectorizationLegality *Legal;
/// The profitablity analysis.
LoopVectorizationCostModel &CM;
using VPlanPtr = std::unique_ptr<VPlan>;
SmallVector<VPlanPtr, 4> VPlans;
/// This class is used to enable the VPlan to invoke a method of ILV. This is
/// needed until the method is refactored out of ILV and becomes reusable.
struct VPCallbackILV : public VPCallback {
InnerLoopVectorizer &ILV;
VPCallbackILV(InnerLoopVectorizer &ILV) : ILV(ILV) {}
Value *getOrCreateVectorValues(Value *V, unsigned Part) override {
return ILV.getOrCreateVectorValue(V, Part);
}
};
/// A builder used to construct the current plan.
VPBuilder Builder;
/// When we if-convert we need to create edge masks. We have to cache values
/// so that we don't end up with exponential recursion/IR. Note that
/// if-conversion currently takes place during VPlan-construction, so these
/// caches are only used at that stage.
using EdgeMaskCacheTy =
DenseMap<std::pair<BasicBlock *, BasicBlock *>, VPValue *>;
using BlockMaskCacheTy = DenseMap<BasicBlock *, VPValue *>;
EdgeMaskCacheTy EdgeMaskCache;
BlockMaskCacheTy BlockMaskCache;
unsigned BestVF = 0;
unsigned BestUF = 0;
public:
LoopVectorizationPlanner(Loop *L, LoopInfo *LI, const TargetLibraryInfo *TLI,
const TargetTransformInfo *TTI,
LoopVectorizationLegality *Legal,
LoopVectorizationCostModel &CM)
: OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM) {}
/// Plan how to best vectorize, return the best VF and its cost.
LoopVectorizationCostModel::VectorizationFactor plan(bool OptForSize,
unsigned UserVF);
/// Finalize the best decision and dispose of all other VPlans.
void setBestPlan(unsigned VF, unsigned UF);
/// Generate the IR code for the body of the vectorized loop according to the
/// best selected VPlan.
void executePlan(InnerLoopVectorizer &LB, DominatorTree *DT);
void printPlans(raw_ostream &O) {
for (const auto &Plan : VPlans)
O << *Plan;
}
protected:
/// Collect the instructions from the original loop that would be trivially
/// dead in the vectorized loop if generated.
void collectTriviallyDeadInstructions(
SmallPtrSetImpl<Instruction *> &DeadInstructions);
/// A range of powers-of-2 vectorization factors with fixed start and
/// adjustable end. The range includes start and excludes end, e.g.,:
/// [1, 9) = {1, 2, 4, 8}
struct VFRange {
// A power of 2.
const unsigned Start;
// Need not be a power of 2. If End <= Start range is empty.
unsigned End;
};
/// Test a \p Predicate on a \p Range of VF's. Return the value of applying
/// \p Predicate on Range.Start, possibly decreasing Range.End such that the
/// returned value holds for the entire \p Range.
bool getDecisionAndClampRange(const std::function<bool(unsigned)> &Predicate,
VFRange &Range);
/// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
/// according to the information gathered by Legal when it checked if it is
/// legal to vectorize the loop.
void buildVPlans(unsigned MinVF, unsigned MaxVF);
private:
/// A helper function that computes the predicate of the block BB, assuming
/// that the header block of the loop is set to True. It returns the *entry*
/// mask for the block BB.
VPValue *createBlockInMask(BasicBlock *BB, VPlanPtr &Plan);
/// A helper function that computes the predicate of the edge between SRC
/// and DST.
VPValue *createEdgeMask(BasicBlock *Src, BasicBlock *Dst, VPlanPtr &Plan);
/// Check if \I belongs to an Interleave Group within the given VF \p Range,
/// \return true in the first returned value if so and false otherwise.
/// Build a new VPInterleaveGroup Recipe if \I is the primary member of an IG
/// for \p Range.Start, and provide it as the second returned value.
/// Note that if \I is an adjunct member of an IG for \p Range.Start, the
/// \return value is <true, nullptr>, as it is handled by another recipe.
/// \p Range.End may be decreased to ensure same decision from \p Range.Start
/// to \p Range.End.
VPInterleaveRecipe *tryToInterleaveMemory(Instruction *I, VFRange &Range);
// Check if \I is a memory instruction to be widened for \p Range.Start and
// potentially masked. Such instructions are handled by a recipe that takes an
// additional VPInstruction for the mask.
VPWidenMemoryInstructionRecipe *tryToWidenMemory(Instruction *I,
VFRange &Range,
VPlanPtr &Plan);
/// Check if an induction recipe should be constructed for \I within the given
/// VF \p Range. If so build and return it. If not, return null. \p Range.End
/// may be decreased to ensure same decision from \p Range.Start to
/// \p Range.End.
VPWidenIntOrFpInductionRecipe *tryToOptimizeInduction(Instruction *I,
VFRange &Range);
/// Handle non-loop phi nodes. Currently all such phi nodes are turned into
/// a sequence of select instructions as the vectorizer currently performs
/// full if-conversion.
VPBlendRecipe *tryToBlend(Instruction *I, VPlanPtr &Plan);
/// Check if \p I can be widened within the given VF \p Range. If \p I can be
/// widened for \p Range.Start, check if the last recipe of \p VPBB can be
/// extended to include \p I or else build a new VPWidenRecipe for it and
/// append it to \p VPBB. Return true if \p I can be widened for Range.Start,
/// false otherwise. Range.End may be decreased to ensure same decision from
/// \p Range.Start to \p Range.End.
bool tryToWiden(Instruction *I, VPBasicBlock *VPBB, VFRange &Range);
/// Build a VPReplicationRecipe for \p I and enclose it within a Region if it
/// is predicated. \return \p VPBB augmented with this new recipe if \p I is
/// not predicated, otherwise \return a new VPBasicBlock that succeeds the new
/// Region. Update the packing decision of predicated instructions if they
/// feed \p I. Range.End may be decreased to ensure same recipe behavior from
/// \p Range.Start to \p Range.End.
VPBasicBlock *handleReplication(
Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
VPlanPtr &Plan);
/// Create a replicating region for instruction \p I that requires
/// predication. \p PredRecipe is a VPReplicateRecipe holding \p I.
VPRegionBlock *createReplicateRegion(Instruction *I, VPRecipeBase *PredRecipe,
VPlanPtr &Plan);
/// Build a VPlan according to the information gathered by Legal. \return a
/// VPlan for vectorization factors \p Range.Start and up to \p Range.End
/// exclusive, possibly decreasing \p Range.End.
VPlanPtr buildVPlan(VFRange &Range,
const SmallPtrSetImpl<Value *> &NeedDef);
};
} // end namespace llvm
namespace {
/// \brief This holds vectorization requirements that must be verified late in
/// the process. The requirements are set by legalize and costmodel. Once
/// vectorization has been determined to be possible and profitable the
/// requirements can be verified by looking for metadata or compiler options.
/// For example, some loops require FP commutativity which is only allowed if
/// vectorization is explicitly specified or if the fast-math compiler option
/// has been provided.
/// Late evaluation of these requirements allows helpful diagnostics to be
/// composed that tells the user what need to be done to vectorize the loop. For
/// example, by specifying #pragma clang loop vectorize or -ffast-math. Late
/// evaluation should be used only when diagnostics can generated that can be
/// followed by a non-expert user.
class LoopVectorizationRequirements {
public:
LoopVectorizationRequirements(OptimizationRemarkEmitter &ORE) : ORE(ORE) {}
void addUnsafeAlgebraInst(Instruction *I) {
// First unsafe algebra instruction.
if (!UnsafeAlgebraInst)
UnsafeAlgebraInst = I;
}
void addRuntimePointerChecks(unsigned Num) { NumRuntimePointerChecks = Num; }
bool doesNotMeet(Function *F, Loop *L, const LoopVectorizeHints &Hints) {
const char *PassName = Hints.vectorizeAnalysisPassName();
bool Failed = false;
if (UnsafeAlgebraInst && !Hints.allowReordering()) {
ORE.emit([&]() {
return OptimizationRemarkAnalysisFPCommute(
PassName, "CantReorderFPOps",
UnsafeAlgebraInst->getDebugLoc(),
UnsafeAlgebraInst->getParent())
<< "loop not vectorized: cannot prove it is safe to reorder "
"floating-point operations";
});
Failed = true;
}
// Test if runtime memcheck thresholds are exceeded.
bool PragmaThresholdReached =
NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
bool ThresholdReached =
NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
if ((ThresholdReached && !Hints.allowReordering()) ||
PragmaThresholdReached) {
ORE.emit([&]() {
return OptimizationRemarkAnalysisAliasing(PassName, "CantReorderMemOps",
L->getStartLoc(),
L->getHeader())
<< "loop not vectorized: cannot prove it is safe to reorder "
"memory operations";
});
DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
Failed = true;
}
return Failed;
}
private:
unsigned NumRuntimePointerChecks = 0;
Instruction *UnsafeAlgebraInst = nullptr;
/// Interface to emit optimization remarks.
OptimizationRemarkEmitter &ORE;
};
} // end anonymous namespace
static void addAcyclicInnerLoop(Loop &L, SmallVectorImpl<Loop *> &V) {
if (L.empty()) {
if (!hasCyclesInLoopBody(L))
V.push_back(&L);
return;
}
for (Loop *InnerL : L)
addAcyclicInnerLoop(*InnerL, V);
}
namespace {
/// The LoopVectorize Pass.
struct LoopVectorize : public FunctionPass {
/// Pass identification, replacement for typeid
static char ID;
LoopVectorizePass Impl;
explicit LoopVectorize(bool NoUnrolling = false, bool AlwaysVectorize = true)
: FunctionPass(ID) {
Impl.DisableUnrolling = NoUnrolling;
Impl.AlwaysVectorize = AlwaysVectorize;
initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
}
bool runOnFunction(Function &F) override {
if (skipFunction(F))
return false;
auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
auto *TLI = TLIP ? &TLIP->getTLI() : nullptr;
auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
std::function<const LoopAccessInfo &(Loop &)> GetLAA =
[&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
GetLAA, *ORE);
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<BlockFrequencyInfoWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<LoopInfoWrapperPass>();
AU.addRequired<ScalarEvolutionWrapperPass>();
AU.addRequired<TargetTransformInfoWrapperPass>();
AU.addRequired<AAResultsWrapperPass>();
AU.addRequired<LoopAccessLegacyAnalysis>();
AU.addRequired<DemandedBitsWrapperPass>();
AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
AU.addPreserved<LoopInfoWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
AU.addPreserved<BasicAAWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
}
};
} // end anonymous namespace
//===----------------------------------------------------------------------===//
// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
// LoopVectorizationCostModel and LoopVectorizationPlanner.
//===----------------------------------------------------------------------===//
Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
// We need to place the broadcast of invariant variables outside the loop.
Instruction *Instr = dyn_cast<Instruction>(V);
bool NewInstr = (Instr && Instr->getParent() == LoopVectorBody);
bool Invariant = OrigLoop->isLoopInvariant(V) && !NewInstr;
// Place the code for broadcasting invariant variables in the new preheader.
IRBuilder<>::InsertPointGuard Guard(Builder);
if (Invariant)
Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
// Broadcast the scalar into all locations in the vector.
Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
return Shuf;
}
void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
Value *Start = II.getStartValue();
// Construct the initial value of the vector IV in the vector loop preheader
auto CurrIP = Builder.saveIP();
Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
if (isa<TruncInst>(EntryVal)) {
assert(Start->getType()->isIntegerTy() &&
"Truncation requires an integer type");
auto *TruncType = cast<IntegerType>(EntryVal->getType());
Step = Builder.CreateTrunc(Step, TruncType);
Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
}
Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
Value *SteppedStart =
getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
// We create vector phi nodes for both integer and floating-point induction
// variables. Here, we determine the kind of arithmetic we will perform.
Instruction::BinaryOps AddOp;
Instruction::BinaryOps MulOp;
if (Step->getType()->isIntegerTy()) {
AddOp = Instruction::Add;
MulOp = Instruction::Mul;
} else {
AddOp = II.getInductionOpcode();
MulOp = Instruction::FMul;
}
// Multiply the vectorization factor by the step using integer or
// floating-point arithmetic as appropriate.
Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF);
Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
// Create a vector splat to use in the induction update.
//
// FIXME: If the step is non-constant, we create the vector splat with
// IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
// handle a constant vector splat.
Value *SplatVF = isa<Constant>(Mul)
? ConstantVector::getSplat(VF, cast<Constant>(Mul))
: Builder.CreateVectorSplat(VF, Mul);
Builder.restoreIP(CurrIP);
// We may need to add the step a number of times, depending on the unroll
// factor. The last of those goes into the PHI.
PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
&*LoopVectorBody->getFirstInsertionPt());
Instruction *LastInduction = VecInd;
for (unsigned Part = 0; Part < UF; ++Part) {
VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
if (isa<TruncInst>(EntryVal))
addMetadata(LastInduction, EntryVal);
else
recordVectorLoopValueForInductionCast(II, LastInduction, Part);
LastInduction = cast<Instruction>(addFastMathFlag(
Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
}
// Move the last step to the end of the latch block. This ensures consistent
// placement of all induction updates.
auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
auto *ICmp = cast<Instruction>(Br->getCondition());
LastInduction->moveBefore(ICmp);
LastInduction->setName("vec.ind.next");
VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
VecInd->addIncoming(LastInduction, LoopVectorLatch);
}
bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
return Cost->isScalarAfterVectorization(I, VF) ||
Cost->isProfitableToScalarize(I, VF);
}
bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
if (shouldScalarizeInstruction(IV))
return true;
auto isScalarInst = [&](User *U) -> bool {
auto *I = cast<Instruction>(U);
return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
};
return llvm::any_of(IV->users(), isScalarInst);
}
void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
const InductionDescriptor &ID, Value *VectorLoopVal, unsigned Part,
unsigned Lane) {
const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
if (Casts.empty())
return;
// Only the first Cast instruction in the Casts vector is of interest.
// The rest of the Casts (if exist) have no uses outside the
// induction update chain itself.
Instruction *CastInst = *Casts.begin();
if (Lane < UINT_MAX)
VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
else
VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
}
void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
"Primary induction variable must have an integer type");
auto II = Legal->getInductionVars()->find(IV);
assert(II != Legal->getInductionVars()->end() && "IV is not an induction");
auto ID = II->second;
assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
// The scalar value to broadcast. This will be derived from the canonical
// induction variable.
Value *ScalarIV = nullptr;
// The value from the original loop to which we are mapping the new induction
// variable.
Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
// True if we have vectorized the induction variable.
auto VectorizedIV = false;
// Determine if we want a scalar version of the induction variable. This is
// true if the induction variable itself is not widened, or if it has at
// least one user in the loop that is not widened.
auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal);
// Generate code for the induction step. Note that induction steps are
// required to be loop-invariant
assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) &&
"Induction step should be loop invariant");
auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
Value *Step = nullptr;
if (PSE.getSE()->isSCEVable(IV->getType())) {
SCEVExpander Exp(*PSE.getSE(), DL, "induction");
Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
LoopVectorPreHeader->getTerminator());
} else {
Step = cast<SCEVUnknown>(ID.getStep())->getValue();
}
// Try to create a new independent vector induction variable. If we can't
// create the phi node, we will splat the scalar induction variable in each
// loop iteration.
if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) {
createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
VectorizedIV = true;
}
// If we haven't yet vectorized the induction variable, or if we will create
// a scalar one, we need to define the scalar induction variable and step
// values. If we were given a truncation type, truncate the canonical
// induction variable and step. Otherwise, derive these values from the
// induction descriptor.
if (!VectorizedIV || NeedsScalarIV) {
ScalarIV = Induction;
if (IV != OldInduction) {
ScalarIV = IV->getType()->isIntegerTy()
? Builder.CreateSExtOrTrunc(Induction, IV->getType())
: Builder.CreateCast(Instruction::SIToFP, Induction,
IV->getType());
ScalarIV = ID.transform(Builder, ScalarIV, PSE.getSE(), DL);
ScalarIV->setName("offset.idx");
}
if (Trunc) {
auto *TruncType = cast<IntegerType>(Trunc->getType());
assert(Step->getType()->isIntegerTy() &&
"Truncation requires an integer step");
ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
Step = Builder.CreateTrunc(Step, TruncType);
}
}
// If we haven't yet vectorized the induction variable, splat the scalar
// induction variable, and build the necessary step vectors.
// TODO: Don't do it unless the vectorized IV is really required.
if (!VectorizedIV) {
Value *Broadcasted = getBroadcastInstrs(ScalarIV);
for (unsigned Part = 0; Part < UF; ++Part) {
Value *EntryPart =
getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
if (Trunc)
addMetadata(EntryPart, Trunc);
else
recordVectorLoopValueForInductionCast(ID, EntryPart, Part);
}
}
// If an induction variable is only used for counting loop iterations or
// calculating addresses, it doesn't need to be widened. Create scalar steps
// that can be used by instructions we will later scalarize. Note that the
// addition of the scalar steps will not increase the number of instructions
// in the loop in the common case prior to InstCombine. We will be trading
// one vector extract for each scalar step.
if (NeedsScalarIV)
buildScalarSteps(ScalarIV, Step, EntryVal, ID);
}
Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
Instruction::BinaryOps BinOp) {
// Create and check the types.
assert(Val->getType()->isVectorTy() && "Must be a vector");
int VLen = Val->getType()->getVectorNumElements();
Type *STy = Val->getType()->getScalarType();
assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
"Induction Step must be an integer or FP");
assert(Step->getType() == STy && "Step has wrong type");
SmallVector<Constant *, 8> Indices;
if (STy->isIntegerTy()) {
// Create a vector of consecutive numbers from zero to VF.
for (int i = 0; i < VLen; ++i)
Indices.push_back(ConstantInt::get(STy, StartIdx + i));
// Add the consecutive indices to the vector value.
Constant *Cv = ConstantVector::get(Indices);
assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
Step = Builder.CreateVectorSplat(VLen, Step);
assert(Step->getType() == Val->getType() && "Invalid step vec");
// FIXME: The newly created binary instructions should contain nsw/nuw flags,
// which can be found from the original scalar operations.
Step = Builder.CreateMul(Cv, Step);
return Builder.CreateAdd(Val, Step, "induction");
}
// Floating point induction.
assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
"Binary Opcode should be specified for FP induction");
// Create a vector of consecutive numbers from zero to VF.
for (int i = 0; i < VLen; ++i)
Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
// Add the consecutive indices to the vector value.
Constant *Cv = ConstantVector::get(Indices);
Step = Builder.CreateVectorSplat(VLen, Step);
// Floating point operations had to be 'fast' to enable the induction.
FastMathFlags Flags;
Flags.setFast();
Value *MulOp = Builder.CreateFMul(Cv, Step);
if (isa<Instruction>(MulOp))
// Have to check, MulOp may be a constant
cast<Instruction>(MulOp)->setFastMathFlags(Flags);
Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
if (isa<Instruction>(BOp))
cast<Instruction>(BOp)->setFastMathFlags(Flags);
return BOp;
}
void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
Value *EntryVal,
const InductionDescriptor &ID) {
// We shouldn't have to build scalar steps if we aren't vectorizing.
assert(VF > 1 && "VF should be greater than one");
// Get the value type and ensure it and the step have the same integer type.
Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
assert(ScalarIVTy == Step->getType() &&
"Val and Step should have the same type");
// We build scalar steps for both integer and floating-point induction
// variables. Here, we determine the kind of arithmetic we will perform.
Instruction::BinaryOps AddOp;
Instruction::BinaryOps MulOp;
if (ScalarIVTy->isIntegerTy()) {
AddOp = Instruction::Add;
MulOp = Instruction::Mul;
} else {
AddOp = ID.getInductionOpcode();
MulOp = Instruction::FMul;
}
// Determine the number of scalars we need to generate for each unroll
// iteration. If EntryVal is uniform, we only need to generate the first
// lane. Otherwise, we generate all VF values.
unsigned Lanes =
Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1
: VF;
// Compute the scalar steps and save the results in VectorLoopValueMap.
for (unsigned Part = 0; Part < UF; ++Part) {
for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
recordVectorLoopValueForInductionCast(ID, Add, Part, Lane);
}
}
}
int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
const ValueToValueMap &Strides = getSymbolicStrides() ? *getSymbolicStrides() :
ValueToValueMap();
int Stride = getPtrStride(PSE, Ptr, TheLoop, Strides, true, false);
if (Stride == 1 || Stride == -1)
return Stride;
return 0;
}
bool LoopVectorizationLegality::isUniform(Value *V) {
return LAI->isUniform(V);
}
Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {