| //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===// |
| // |
| // The LLVM Compiler Infrastructure |
| // |
| // This file is distributed under the University of Illinois Open Source |
| // License. See LICENSE.TXT for details. |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops |
| // and generates target-independent LLVM-IR. |
| // The vectorizer uses the TargetTransformInfo analysis to estimate the costs |
| // of instructions in order to estimate the profitability of vectorization. |
| // |
| // The loop vectorizer combines consecutive loop iterations into a single |
| // 'wide' iteration. After this transformation the index is incremented |
| // by the SIMD vector width, and not by one. |
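//
// For example (illustrative), with a SIMD width of 4 the scalar loop
//
//   for (i = 0; i < n; ++i)
//     A[i] = B[i] + 42;
//
// is transformed so that each wide iteration computes A[i..i+3] from
// B[i..i+3] with vector instructions and the induction variable advances
// by 4.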
| // |
// This pass has four parts:
| // 1. The main loop pass that drives the different parts. |
| // 2. LoopVectorizationLegality - A unit that checks for the legality |
| // of the vectorization. |
| // 3. InnerLoopVectorizer - A unit that performs the actual |
| // widening of instructions. |
| // 4. LoopVectorizationCostModel - A unit that checks for the profitability |
| // of vectorization. It decides on the optimal vector width, which |
| // can be one, if vectorization is not profitable. |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // The reduction-variable vectorization is based on the paper: |
| // D. Nuzman and R. Henderson. Multi-platform Auto-vectorization. |
| // |
| // Variable uniformity checks are inspired by: |
| // Karrenberg, R. and Hack, S. Whole Function Vectorization. |
| // |
| // The interleaved access vectorization is based on the paper: |
| // Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved |
// Data for SIMD.
| // |
| // Other ideas/concepts are from: |
| // A. Zaks and D. Nuzman. Autovectorization in GCC-two years later. |
| // |
| // S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of |
| // Vectorizing Compilers. |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "llvm/Transforms/Vectorize/LoopVectorize.h" |
| #include "VPlan.h" |
| #include "VPlanBuilder.h" |
| #include "llvm/ADT/APInt.h" |
| #include "llvm/ADT/ArrayRef.h" |
| #include "llvm/ADT/DenseMap.h" |
| #include "llvm/ADT/DenseMapInfo.h" |
| #include "llvm/ADT/Hashing.h" |
| #include "llvm/ADT/MapVector.h" |
| #include "llvm/ADT/None.h" |
| #include "llvm/ADT/Optional.h" |
| #include "llvm/ADT/SCCIterator.h" |
| #include "llvm/ADT/STLExtras.h" |
| #include "llvm/ADT/SetVector.h" |
| #include "llvm/ADT/SmallPtrSet.h" |
| #include "llvm/ADT/SmallSet.h" |
| #include "llvm/ADT/SmallVector.h" |
| #include "llvm/ADT/Statistic.h" |
| #include "llvm/ADT/StringRef.h" |
| #include "llvm/ADT/Twine.h" |
| #include "llvm/ADT/iterator_range.h" |
| #include "llvm/Analysis/AssumptionCache.h" |
| #include "llvm/Analysis/BasicAliasAnalysis.h" |
| #include "llvm/Analysis/BlockFrequencyInfo.h" |
| #include "llvm/Analysis/CodeMetrics.h" |
| #include "llvm/Analysis/DemandedBits.h" |
| #include "llvm/Analysis/GlobalsModRef.h" |
| #include "llvm/Analysis/LoopAccessAnalysis.h" |
| #include "llvm/Analysis/LoopAnalysisManager.h" |
| #include "llvm/Analysis/LoopInfo.h" |
| #include "llvm/Analysis/LoopIterator.h" |
| #include "llvm/Analysis/OptimizationRemarkEmitter.h" |
| #include "llvm/Analysis/ScalarEvolution.h" |
| #include "llvm/Analysis/ScalarEvolutionExpander.h" |
| #include "llvm/Analysis/ScalarEvolutionExpressions.h" |
| #include "llvm/Analysis/TargetLibraryInfo.h" |
| #include "llvm/Analysis/TargetTransformInfo.h" |
| #include "llvm/Analysis/VectorUtils.h" |
| #include "llvm/IR/Attributes.h" |
| #include "llvm/IR/BasicBlock.h" |
| #include "llvm/IR/CFG.h" |
| #include "llvm/IR/Constant.h" |
| #include "llvm/IR/Constants.h" |
| #include "llvm/IR/DataLayout.h" |
| #include "llvm/IR/DebugInfoMetadata.h" |
| #include "llvm/IR/DebugLoc.h" |
| #include "llvm/IR/DerivedTypes.h" |
| #include "llvm/IR/DiagnosticInfo.h" |
| #include "llvm/IR/Dominators.h" |
| #include "llvm/IR/Function.h" |
| #include "llvm/IR/IRBuilder.h" |
| #include "llvm/IR/InstrTypes.h" |
| #include "llvm/IR/Instruction.h" |
| #include "llvm/IR/Instructions.h" |
| #include "llvm/IR/IntrinsicInst.h" |
| #include "llvm/IR/Intrinsics.h" |
| #include "llvm/IR/LLVMContext.h" |
| #include "llvm/IR/Metadata.h" |
| #include "llvm/IR/Module.h" |
| #include "llvm/IR/Operator.h" |
| #include "llvm/IR/Type.h" |
| #include "llvm/IR/Use.h" |
| #include "llvm/IR/User.h" |
| #include "llvm/IR/Value.h" |
| #include "llvm/IR/ValueHandle.h" |
| #include "llvm/IR/Verifier.h" |
| #include "llvm/Pass.h" |
| #include "llvm/Support/Casting.h" |
| #include "llvm/Support/CommandLine.h" |
| #include "llvm/Support/Compiler.h" |
| #include "llvm/Support/Debug.h" |
| #include "llvm/Support/ErrorHandling.h" |
| #include "llvm/Support/MathExtras.h" |
| #include "llvm/Support/raw_ostream.h" |
| #include "llvm/Transforms/Utils/BasicBlockUtils.h" |
| #include "llvm/Transforms/Utils/LoopSimplify.h" |
| #include "llvm/Transforms/Utils/LoopUtils.h" |
| #include "llvm/Transforms/Utils/LoopVersioning.h" |
| #include <algorithm> |
| #include <cassert> |
| #include <cstdint> |
| #include <cstdlib> |
| #include <functional> |
| #include <iterator> |
| #include <limits> |
| #include <memory> |
| #include <string> |
| #include <tuple> |
| #include <utility> |
| #include <vector> |
| |
| using namespace llvm; |
| |
| #define LV_NAME "loop-vectorize" |
| #define DEBUG_TYPE LV_NAME |
| |
| STATISTIC(LoopsVectorized, "Number of loops vectorized"); |
| STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); |
| |
| static cl::opt<bool> |
| EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden, |
| cl::desc("Enable if-conversion during vectorization.")); |
| |
| /// Loops with a known constant trip count below this number are vectorized only |
| /// if no scalar iteration overheads are incurred. |
| static cl::opt<unsigned> TinyTripCountVectorThreshold( |
| "vectorizer-min-trip-count", cl::init(16), cl::Hidden, |
| cl::desc("Loops with a constant trip count that is smaller than this " |
| "value are vectorized only if no scalar iteration overheads " |
| "are incurred.")); |
| |
| static cl::opt<bool> MaximizeBandwidth( |
| "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, |
| cl::desc("Maximize bandwidth when selecting vectorization factor which " |
| "will be determined by the smallest type in loop.")); |
| |
| static cl::opt<bool> EnableInterleavedMemAccesses( |
| "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, |
| cl::desc("Enable vectorization on interleaved memory accesses in a loop")); |
| |
| /// Maximum factor for an interleaved memory access. |
| static cl::opt<unsigned> MaxInterleaveGroupFactor( |
| "max-interleave-group-factor", cl::Hidden, |
| cl::desc("Maximum factor for an interleaved access group (default = 8)"), |
| cl::init(8)); |
| |
| /// We don't interleave loops with a known constant trip count below this |
| /// number. |
| static const unsigned TinyTripCountInterleaveThreshold = 128; |
| |
| static cl::opt<unsigned> ForceTargetNumScalarRegs( |
| "force-target-num-scalar-regs", cl::init(0), cl::Hidden, |
| cl::desc("A flag that overrides the target's number of scalar registers.")); |
| |
| static cl::opt<unsigned> ForceTargetNumVectorRegs( |
| "force-target-num-vector-regs", cl::init(0), cl::Hidden, |
| cl::desc("A flag that overrides the target's number of vector registers.")); |
| |
| /// Maximum vectorization interleave count. |
| static const unsigned MaxInterleaveFactor = 16; |
| |
| static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor( |
| "force-target-max-scalar-interleave", cl::init(0), cl::Hidden, |
| cl::desc("A flag that overrides the target's max interleave factor for " |
| "scalar loops.")); |
| |
| static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor( |
| "force-target-max-vector-interleave", cl::init(0), cl::Hidden, |
| cl::desc("A flag that overrides the target's max interleave factor for " |
| "vectorized loops.")); |
| |
| static cl::opt<unsigned> ForceTargetInstructionCost( |
| "force-target-instruction-cost", cl::init(0), cl::Hidden, |
| cl::desc("A flag that overrides the target's expected cost for " |
| "an instruction to a single constant value. Mostly " |
| "useful for getting consistent testing.")); |
| |
| static cl::opt<unsigned> SmallLoopCost( |
| "small-loop-cost", cl::init(20), cl::Hidden, |
| cl::desc( |
| "The cost of a loop that is considered 'small' by the interleaver.")); |
| |
| static cl::opt<bool> LoopVectorizeWithBlockFrequency( |
| "loop-vectorize-with-block-frequency", cl::init(false), cl::Hidden, |
| cl::desc("Enable the use of the block frequency analysis to access PGO " |
| "heuristics minimizing code growth in cold regions and being more " |
| "aggressive in hot regions.")); |
| |
| // Runtime interleave loops for load/store throughput. |
| static cl::opt<bool> EnableLoadStoreRuntimeInterleave( |
| "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden, |
| cl::desc( |
| "Enable runtime interleaving until load/store ports are saturated")); |
| |
| /// The number of stores in a loop that are allowed to need predication. |
| static cl::opt<unsigned> NumberOfStoresToPredicate( |
| "vectorize-num-stores-pred", cl::init(1), cl::Hidden, |
| cl::desc("Max number of stores to be predicated behind an if.")); |
| |
| static cl::opt<bool> EnableIndVarRegisterHeur( |
| "enable-ind-var-reg-heur", cl::init(true), cl::Hidden, |
| cl::desc("Count the induction variable only once when interleaving")); |
| |
| static cl::opt<bool> EnableCondStoresVectorization( |
| "enable-cond-stores-vec", cl::init(true), cl::Hidden, |
| cl::desc("Enable if predication of stores during vectorization.")); |
| |
| static cl::opt<unsigned> MaxNestedScalarReductionIC( |
| "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden, |
| cl::desc("The maximum interleave count to use when interleaving a scalar " |
| "reduction in a nested loop.")); |
| |
| static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold( |
| "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden, |
| cl::desc("The maximum allowed number of runtime memory checks with a " |
| "vectorize(enable) pragma.")); |
| |
| static cl::opt<unsigned> VectorizeSCEVCheckThreshold( |
| "vectorize-scev-check-threshold", cl::init(16), cl::Hidden, |
| cl::desc("The maximum number of SCEV checks allowed.")); |
| |
| static cl::opt<unsigned> PragmaVectorizeSCEVCheckThreshold( |
| "pragma-vectorize-scev-check-threshold", cl::init(128), cl::Hidden, |
| cl::desc("The maximum number of SCEV checks allowed with a " |
| "vectorize(enable) pragma")); |
| |
| /// Create an analysis remark that explains why vectorization failed |
| /// |
| /// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p |
| /// RemarkName is the identifier for the remark. If \p I is passed it is an |
| /// instruction that prevents vectorization. Otherwise \p TheLoop is used for |
| /// the location of the remark. \return the remark object that can be |
| /// streamed to. |
| static OptimizationRemarkAnalysis |
| createMissedAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop, |
| Instruction *I = nullptr) { |
| Value *CodeRegion = TheLoop->getHeader(); |
| DebugLoc DL = TheLoop->getStartLoc(); |
| |
| if (I) { |
| CodeRegion = I->getParent(); |
    // If there is no debug location attached to the instruction, fall back
    // to using the loop's.
| if (I->getDebugLoc()) |
| DL = I->getDebugLoc(); |
| } |
| |
| OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion); |
| R << "loop not vectorized: "; |
| return R; |
| } |
| |
| namespace { |
| |
| class LoopVectorizationLegality; |
| class LoopVectorizationCostModel; |
| class LoopVectorizationRequirements; |
| |
| } // end anonymous namespace |
| |
| /// Returns true if the given loop body has a cycle, excluding the loop |
| /// itself. |
| static bool hasCyclesInLoopBody(const Loop &L) { |
| if (!L.empty()) |
| return true; |
| |
| for (const auto &SCC : |
| make_range(scc_iterator<Loop, LoopBodyTraits>::begin(L), |
| scc_iterator<Loop, LoopBodyTraits>::end(L))) { |
| if (SCC.size() > 1) { |
| DEBUG(dbgs() << "LVL: Detected a cycle in the loop body:\n"); |
| DEBUG(L.dump()); |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| /// A helper function for converting Scalar types to vector types. |
| /// If the incoming type is void, we return void. If the VF is 1, we return |
| /// the scalar type. |
| static Type *ToVectorTy(Type *Scalar, unsigned VF) { |
| if (Scalar->isVoidTy() || VF == 1) |
| return Scalar; |
| return VectorType::get(Scalar, VF); |
| } |
| |
// FIXME: The following helper functions have multiple implementations
// in the project. They could be consolidated into a common load/store
// utilities unit.
| |
| /// A helper function that returns the pointer operand of a load or store |
| /// instruction. |
| static Value *getPointerOperand(Value *I) { |
| if (auto *LI = dyn_cast<LoadInst>(I)) |
| return LI->getPointerOperand(); |
| if (auto *SI = dyn_cast<StoreInst>(I)) |
| return SI->getPointerOperand(); |
| return nullptr; |
| } |
| |
/// A helper function that returns the type of a loaded or stored value.
| static Type *getMemInstValueType(Value *I) { |
| assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && |
| "Expected Load or Store instruction"); |
| if (auto *LI = dyn_cast<LoadInst>(I)) |
| return LI->getType(); |
| return cast<StoreInst>(I)->getValueOperand()->getType(); |
| } |
| |
/// A helper function that returns the alignment of a load or store
/// instruction.
| static unsigned getMemInstAlignment(Value *I) { |
| assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && |
| "Expected Load or Store instruction"); |
| if (auto *LI = dyn_cast<LoadInst>(I)) |
| return LI->getAlignment(); |
| return cast<StoreInst>(I)->getAlignment(); |
| } |
| |
| /// A helper function that returns the address space of the pointer operand of |
/// a load or store instruction.
| static unsigned getMemInstAddressSpace(Value *I) { |
| assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && |
| "Expected Load or Store instruction"); |
| if (auto *LI = dyn_cast<LoadInst>(I)) |
| return LI->getPointerAddressSpace(); |
| return cast<StoreInst>(I)->getPointerAddressSpace(); |
| } |
| |
| /// A helper function that returns true if the given type is irregular. The |
| /// type is irregular if its allocated size doesn't equal the store size of an |
| /// element of the corresponding vector type at the given vectorization factor. |
| static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) { |
| // Determine if an array of VF elements of type Ty is "bitcast compatible" |
| // with a <VF x Ty> vector. |
| if (VF > 1) { |
| auto *VectorTy = VectorType::get(Ty, VF); |
| return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy); |
| } |
| |
| // If the vectorization factor is one, we just check if an array of type Ty |
| // requires padding between elements. |
| return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty); |
| } |
| |
| /// A helper function that returns the reciprocal of the block probability of |
| /// predicated blocks. If we return X, we are assuming the predicated block |
/// will execute once for every X iterations of the loop header.
| /// |
| /// TODO: We should use actual block probability here, if available. Currently, |
| /// we always assume predicated blocks have a 50% chance of executing. |
| static unsigned getReciprocalPredBlockProb() { return 2; } |
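
// For example (illustrative): under this assumption, an instruction with
// cost C placed in a predicated block contributes roughly C / 2 to the
// per-iteration cost of the loop, since the block is assumed to execute on
// every other iteration.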
| |
| /// A helper function that adds a 'fast' flag to floating-point operations. |
| static Value *addFastMathFlag(Value *V) { |
| if (isa<FPMathOperator>(V)) { |
| FastMathFlags Flags; |
| Flags.setFast(); |
| cast<Instruction>(V)->setFastMathFlags(Flags); |
| } |
| return V; |
| } |
| |
| /// A helper function that returns an integer or floating-point constant with |
| /// value C. |
| static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) { |
| return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C) |
| : ConstantFP::get(Ty, C); |
| } |
| |
| namespace llvm { |
| |
| /// InnerLoopVectorizer vectorizes loops which contain only one basic |
| /// block to a specified vectorization factor (VF). |
| /// This class performs the widening of scalars into vectors, or multiple |
| /// scalars. This class also implements the following features: |
| /// * It inserts an epilogue loop for handling loops that don't have iteration |
| /// counts that are known to be a multiple of the vectorization factor. |
| /// * It handles the code generation for reduction variables. |
| /// * Scalarization (implementation using scalars) of un-vectorizable |
| /// instructions. |
| /// InnerLoopVectorizer does not perform any vectorization-legality |
| /// checks, and relies on the caller to check for the different legality |
| /// aspects. The InnerLoopVectorizer relies on the |
| /// LoopVectorizationLegality class to provide information about the induction |
/// and reduction variables that were found.
| class InnerLoopVectorizer { |
| public: |
| InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, |
| LoopInfo *LI, DominatorTree *DT, |
| const TargetLibraryInfo *TLI, |
| const TargetTransformInfo *TTI, AssumptionCache *AC, |
| OptimizationRemarkEmitter *ORE, unsigned VecWidth, |
| unsigned UnrollFactor, LoopVectorizationLegality *LVL, |
| LoopVectorizationCostModel *CM) |
| : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI), |
| AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor), |
| Builder(PSE.getSE()->getContext()), |
| VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {} |
| virtual ~InnerLoopVectorizer() = default; |
| |
| /// Create a new empty loop. Unlink the old loop and connect the new one. |
| /// Return the pre-header block of the new loop. |
| BasicBlock *createVectorizedLoopSkeleton(); |
| |
| /// Widen a single instruction within the innermost loop. |
| void widenInstruction(Instruction &I); |
| |
  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
| void fixVectorizedLoop(); |
| |
| // Return true if any runtime check is added. |
| bool areSafetyChecksAdded() { return AddedSafetyChecks; } |
| |
| /// A type for vectorized values in the new loop. Each value from the |
| /// original loop, when vectorized, is represented by UF vector values in the |
| /// new unrolled loop, where UF is the unroll factor. |
| using VectorParts = SmallVector<Value *, 2>; |
| |
| /// Vectorize a single PHINode in a block. This method handles the induction |
| /// variable canonicalization. It supports both VF = 1 for unrolled loops and |
| /// arbitrary length vectors. |
| void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF); |
| |
| /// A helper function to scalarize a single Instruction in the innermost loop. |
| /// Generates a sequence of scalar instances for each lane between \p MinLane |
| /// and \p MaxLane, times each part between \p MinPart and \p MaxPart, |
  /// inclusive.
| void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance, |
| bool IfPredicateInstr); |
| |
| /// Widen an integer or floating-point induction variable \p IV. If \p Trunc |
| /// is provided, the integer induction variable will first be truncated to |
| /// the corresponding type. |
| void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr); |
| |
| /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a |
| /// vector or scalar value on-demand if one is not yet available. When |
| /// vectorizing a loop, we visit the definition of an instruction before its |
| /// uses. When visiting the definition, we either vectorize or scalarize the |
| /// instruction, creating an entry for it in the corresponding map. (In some |
| /// cases, such as induction variables, we will create both vector and scalar |
| /// entries.) Then, as we encounter uses of the definition, we derive values |
| /// for each scalar or vector use unless such a value is already available. |
| /// For example, if we scalarize a definition and one of its uses is vector, |
| /// we build the required vector on-demand with an insertelement sequence |
| /// when visiting the use. Otherwise, if the use is scalar, we can use the |
| /// existing scalar definition. |
| /// |
| /// Return a value in the new loop corresponding to \p V from the original |
| /// loop at unroll index \p Part. If the value has already been vectorized, |
| /// the corresponding vector entry in VectorLoopValueMap is returned. If, |
| /// however, the value has a scalar entry in VectorLoopValueMap, we construct |
| /// a new vector value on-demand by inserting the scalar values into a vector |
| /// with an insertelement sequence. If the value has been neither vectorized |
| /// nor scalarized, it must be loop invariant, so we simply broadcast the |
| /// value into a vector. |
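  ///
  /// For example (illustrative): if a definition %a was scalarized at VF = 4
  /// and a vector use of %a is later encountered at unroll part 0, the four
  /// scalar instances of %a are packed into a vector with an insertelement
  /// sequence, and the result is cached for subsequent uses.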
| Value *getOrCreateVectorValue(Value *V, unsigned Part); |
| |
| /// Return a value in the new loop corresponding to \p V from the original |
| /// loop at unroll and vector indices \p Instance. If the value has been |
| /// vectorized but not scalarized, the necessary extractelement instruction |
| /// will be generated. |
| Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance); |
| |
| /// Construct the vector value of a scalarized value \p V one lane at a time. |
| void packScalarIntoVectorValue(Value *V, const VPIteration &Instance); |
| |
| /// Try to vectorize the interleaved access group that \p Instr belongs to. |
| void vectorizeInterleaveGroup(Instruction *Instr); |
| |
| /// Vectorize Load and Store instructions, optionally masking the vector |
| /// operations if \p BlockInMask is non-null. |
| void vectorizeMemoryInstruction(Instruction *Instr, |
| VectorParts *BlockInMask = nullptr); |
| |
| /// \brief Set the debug location in the builder using the debug location in |
| /// the instruction. |
| void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr); |
| |
| protected: |
| friend class LoopVectorizationPlanner; |
| |
| /// A small list of PHINodes. |
| using PhiVector = SmallVector<PHINode *, 4>; |
| |
| /// A type for scalarized values in the new loop. Each value from the |
| /// original loop, when scalarized, is represented by UF x VF scalar values |
| /// in the new unrolled loop, where UF is the unroll factor and VF is the |
| /// vectorization factor. |
| using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>; |
| |
| /// Set up the values of the IVs correctly when exiting the vector loop. |
| void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, |
| Value *CountRoundDown, Value *EndValue, |
| BasicBlock *MiddleBlock); |
| |
| /// Create a new induction variable inside L. |
| PHINode *createInductionVariable(Loop *L, Value *Start, Value *End, |
| Value *Step, Instruction *DL); |
| |
| /// Handle all cross-iteration phis in the header. |
| void fixCrossIterationPHIs(); |
| |
| /// Fix a first-order recurrence. This is the second phase of vectorizing |
| /// this phi node. |
| void fixFirstOrderRecurrence(PHINode *Phi); |
| |
| /// Fix a reduction cross-iteration phi. This is the second phase of |
| /// vectorizing this phi node. |
| void fixReduction(PHINode *Phi); |
| |
  /// \brief The loop exit block may have single-value PHI nodes with an
  /// incoming value defined inside the loop. While vectorizing we only
  /// handled real values that were defined inside the loop, so such PHIs
  /// must be fixed up to have one incoming value for each predecessor of
  /// their parent basic block. See PR14725.
| void fixLCSSAPHIs(); |
| |
| /// Iteratively sink the scalarized operands of a predicated instruction into |
| /// the block that was created for it. |
| void sinkScalarOperands(Instruction *PredInst); |
| |
| /// Shrinks vector element sizes to the smallest bitwidth they can be legally |
| /// represented as. |
| void truncateToMinimalBitwidths(); |
| |
| /// Insert the new loop to the loop hierarchy and pass manager |
| /// and update the analysis passes. |
| void updateAnalysis(); |
| |
| /// Create a broadcast instruction. This method generates a broadcast |
| /// instruction (shuffle) for loop invariant values and for the induction |
| /// value. If this is the induction variable then we extend it to N, N+1, ... |
  /// This is needed because each iteration in the loop corresponds to a SIMD
| /// element. |
| virtual Value *getBroadcastInstrs(Value *V); |
| |
| /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...) |
  /// to each vector element of Val. The sequence starts at StartIdx.
  /// \p Opcode is relevant for FP induction variables.
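  /// For example (illustrative): for a broadcast Val of <0, 0, 0, 0>,
  /// StartIdx == 0 and Step == 1, this produces the step vector <0, 1, 2, 3>.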
| virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step, |
| Instruction::BinaryOps Opcode = |
| Instruction::BinaryOpsEnd); |
| |
| /// Compute scalar induction steps. \p ScalarIV is the scalar induction |
| /// variable on which to base the steps, \p Step is the size of the step, and |
| /// \p EntryVal is the value from the original loop that maps to the steps. |
| /// Note that \p EntryVal doesn't have to be an induction variable (e.g., it |
| /// can be a truncate instruction). |
| void buildScalarSteps(Value *ScalarIV, Value *Step, Value *EntryVal, |
| const InductionDescriptor &ID); |
| |
| /// Create a vector induction phi node based on an existing scalar one. \p |
| /// EntryVal is the value from the original loop that maps to the vector phi |
| /// node, and \p Step is the loop-invariant step. If \p EntryVal is a |
| /// truncate instruction, instead of widening the original IV, we widen a |
| /// version of the IV truncated to \p EntryVal's type. |
| void createVectorIntOrFpInductionPHI(const InductionDescriptor &II, |
| Value *Step, Instruction *EntryVal); |
| |
| /// Returns true if an instruction \p I should be scalarized instead of |
| /// vectorized for the chosen vectorization factor. |
| bool shouldScalarizeInstruction(Instruction *I) const; |
| |
| /// Returns true if we should generate a scalar version of \p IV. |
| bool needsScalarInduction(Instruction *IV) const; |
| |
| /// If there is a cast involved in the induction variable \p ID, which should |
| /// be ignored in the vectorized loop body, this function records the |
| /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the |
  /// cast. We have already proven that the casted Phi is equal to the
  /// uncasted Phi in the vectorized loop (under a runtime guard), and
  /// therefore there is no need to vectorize the cast - the same value can
  /// be used in the vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
  void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
                                             Value *VectorLoopValue,
                                             unsigned Part,
                                             unsigned Lane = UINT_MAX);
| |
| /// Generate a shuffle sequence that will reverse the vector Vec. |
| virtual Value *reverseVector(Value *Vec); |
| |
| /// Returns (and creates if needed) the original loop trip count. |
| Value *getOrCreateTripCount(Loop *NewLoop); |
| |
| /// Returns (and creates if needed) the trip count of the widened loop. |
| Value *getOrCreateVectorTripCount(Loop *NewLoop); |
| |
| /// Returns a bitcasted value to the requested vector type. |
| /// Also handles bitcasts of vector<float> <-> vector<pointer> types. |
| Value *createBitOrPointerCast(Value *V, VectorType *DstVTy, |
| const DataLayout &DL); |
| |
| /// Emit a bypass check to see if the vector trip count is zero, including if |
| /// it overflows. |
| void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass); |
| |
| /// Emit a bypass check to see if all of the SCEV assumptions we've |
| /// had to make are correct. |
| void emitSCEVChecks(Loop *L, BasicBlock *Bypass); |
| |
| /// Emit bypass checks to check any memory assumptions we may have made. |
| void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass); |
| |
| /// Add additional metadata to \p To that was not present on \p Orig. |
| /// |
| /// Currently this is used to add the noalias annotations based on the |
| /// inserted memchecks. Use this for instructions that are *cloned* into the |
| /// vector loop. |
| void addNewMetadata(Instruction *To, const Instruction *Orig); |
| |
| /// Add metadata from one instruction to another. |
| /// |
| /// This includes both the original MDs from \p From and additional ones (\see |
| /// addNewMetadata). Use this for *newly created* instructions in the vector |
| /// loop. |
| void addMetadata(Instruction *To, Instruction *From); |
| |
| /// \brief Similar to the previous function but it adds the metadata to a |
| /// vector of instructions. |
| void addMetadata(ArrayRef<Value *> To, Instruction *From); |
| |
| /// The original loop. |
| Loop *OrigLoop; |
| |
| /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies |
| /// dynamic knowledge to simplify SCEV expressions and converts them to a |
| /// more usable form. |
| PredicatedScalarEvolution &PSE; |
| |
| /// Loop Info. |
| LoopInfo *LI; |
| |
| /// Dominator Tree. |
| DominatorTree *DT; |
| |
| /// Alias Analysis. |
| AliasAnalysis *AA; |
| |
| /// Target Library Info. |
| const TargetLibraryInfo *TLI; |
| |
| /// Target Transform Info. |
| const TargetTransformInfo *TTI; |
| |
| /// Assumption Cache. |
| AssumptionCache *AC; |
| |
| /// Interface to emit optimization remarks. |
| OptimizationRemarkEmitter *ORE; |
| |
| /// \brief LoopVersioning. It's only set up (non-null) if memchecks were |
| /// used. |
| /// |
| /// This is currently only used to add no-alias metadata based on the |
  /// memchecks. The actual versioning is performed manually.
| std::unique_ptr<LoopVersioning> LVer; |
| |
| /// The vectorization SIMD factor to use. Each vector will have this many |
| /// vector elements. |
| unsigned VF; |
| |
| /// The vectorization unroll factor to use. Each scalar is vectorized to this |
| /// many different vector instructions. |
| unsigned UF; |
| |
  /// The builder that we use.
| IRBuilder<> Builder; |
| |
| // --- Vectorization state --- |
| |
| /// The vector-loop preheader. |
| BasicBlock *LoopVectorPreHeader; |
| |
| /// The scalar-loop preheader. |
| BasicBlock *LoopScalarPreHeader; |
| |
| /// Middle Block between the vector and the scalar. |
| BasicBlock *LoopMiddleBlock; |
| |
| /// The ExitBlock of the scalar loop. |
| BasicBlock *LoopExitBlock; |
| |
| /// The vector loop body. |
| BasicBlock *LoopVectorBody; |
| |
| /// The scalar loop body. |
| BasicBlock *LoopScalarBody; |
| |
| /// A list of all bypass blocks. The first block is the entry of the loop. |
| SmallVector<BasicBlock *, 4> LoopBypassBlocks; |
| |
| /// The new Induction variable which was added to the new block. |
| PHINode *Induction = nullptr; |
| |
| /// The induction variable of the old basic block. |
| PHINode *OldInduction = nullptr; |
| |
| /// Maps values from the original loop to their corresponding values in the |
| /// vectorized loop. A key value can map to either vector values, scalar |
| /// values or both kinds of values, depending on whether the key was |
| /// vectorized and scalarized. |
| VectorizerValueMap VectorLoopValueMap; |
| |
| /// Store instructions that were predicated. |
| SmallVector<Instruction *, 4> PredicatedInstructions; |
| |
| /// Trip count of the original loop. |
| Value *TripCount = nullptr; |
| |
| /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)) |
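  /// (illustrative: e.g. TripCount = 1003 with VF = 4, UF = 2 gives
  /// 1003 - 1003 % 8 = 1000, i.e. 125 full vector iterations).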
| Value *VectorTripCount = nullptr; |
| |
| /// The legality analysis. |
| LoopVectorizationLegality *Legal; |
| |
  /// The profitability analysis.
| LoopVectorizationCostModel *Cost; |
| |
| // Record whether runtime checks are added. |
| bool AddedSafetyChecks = false; |
| |
| // Holds the end values for each induction variable. We save the end values |
| // so we can later fix-up the external users of the induction variables. |
| DenseMap<PHINode *, Value *> IVEndValues; |
| }; |
| |
| class InnerLoopUnroller : public InnerLoopVectorizer { |
| public: |
| InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE, |
| LoopInfo *LI, DominatorTree *DT, |
| const TargetLibraryInfo *TLI, |
| const TargetTransformInfo *TTI, AssumptionCache *AC, |
| OptimizationRemarkEmitter *ORE, unsigned UnrollFactor, |
| LoopVectorizationLegality *LVL, |
| LoopVectorizationCostModel *CM) |
| : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1, |
| UnrollFactor, LVL, CM) {} |
| |
| private: |
| Value *getBroadcastInstrs(Value *V) override; |
| Value *getStepVector(Value *Val, int StartIdx, Value *Step, |
| Instruction::BinaryOps Opcode = |
| Instruction::BinaryOpsEnd) override; |
| Value *reverseVector(Value *Vec) override; |
| }; |
| |
| } // end namespace llvm |
| |
/// \brief Look for a meaningful debug location on the instruction or its
| /// operands. |
| static Instruction *getDebugLocFromInstOrOperands(Instruction *I) { |
| if (!I) |
| return I; |
| |
| DebugLoc Empty; |
| if (I->getDebugLoc() != Empty) |
| return I; |
| |
| for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) { |
| if (Instruction *OpInst = dyn_cast<Instruction>(*OI)) |
| if (OpInst->getDebugLoc() != Empty) |
| return OpInst; |
| } |
| |
| return I; |
| } |
| |
| void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) { |
| if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) { |
| const DILocation *DIL = Inst->getDebugLoc(); |
| if (DIL && Inst->getFunction()->isDebugInfoForProfiling() && |
| !isa<DbgInfoIntrinsic>(Inst)) |
| B.SetCurrentDebugLocation(DIL->cloneWithDuplicationFactor(UF * VF)); |
| else |
| B.SetCurrentDebugLocation(DIL); |
| } else |
| B.SetCurrentDebugLocation(DebugLoc()); |
| } |
| |
| #ifndef NDEBUG |
| /// \return string containing a file name and a line # for the given loop. |
| static std::string getDebugLocString(const Loop *L) { |
| std::string Result; |
| if (L) { |
| raw_string_ostream OS(Result); |
| if (const DebugLoc LoopDbgLoc = L->getStartLoc()) |
| LoopDbgLoc.print(OS); |
| else |
| // Just print the module name. |
| OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier(); |
| OS.flush(); |
| } |
| return Result; |
| } |
| #endif |
| |
| void InnerLoopVectorizer::addNewMetadata(Instruction *To, |
| const Instruction *Orig) { |
| // If the loop was versioned with memchecks, add the corresponding no-alias |
| // metadata. |
| if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig))) |
| LVer->annotateInstWithNoAlias(To, Orig); |
| } |
| |
| void InnerLoopVectorizer::addMetadata(Instruction *To, |
| Instruction *From) { |
| propagateMetadata(To, From); |
| addNewMetadata(To, From); |
| } |
| |
| void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To, |
| Instruction *From) { |
| for (Value *V : To) { |
| if (Instruction *I = dyn_cast<Instruction>(V)) |
| addMetadata(I, From); |
| } |
| } |
| |
| namespace llvm { |
| |
| /// \brief The group of interleaved loads/stores sharing the same stride and |
| /// close to each other. |
| /// |
| /// Each member in this group has an index starting from 0, and the largest |
/// index should be less than the interleave factor, which equals the absolute
| /// value of the access's stride. |
| /// |
| /// E.g. An interleaved load group of factor 4: |
| /// for (unsigned i = 0; i < 1024; i+=4) { |
| /// a = A[i]; // Member of index 0 |
| /// b = A[i+1]; // Member of index 1 |
| /// d = A[i+3]; // Member of index 3 |
| /// ... |
| /// } |
| /// |
| /// An interleaved store group of factor 4: |
| /// for (unsigned i = 0; i < 1024; i+=4) { |
| /// ... |
| /// A[i] = a; // Member of index 0 |
| /// A[i+1] = b; // Member of index 1 |
| /// A[i+2] = c; // Member of index 2 |
| /// A[i+3] = d; // Member of index 3 |
| /// } |
| /// |
| /// Note: the interleaved load group could have gaps (missing members), but |
| /// the interleaved store group doesn't allow gaps. |
| class InterleaveGroup { |
| public: |
| InterleaveGroup(Instruction *Instr, int Stride, unsigned Align) |
| : Align(Align), InsertPos(Instr) { |
| assert(Align && "The alignment should be non-zero"); |
| |
| Factor = std::abs(Stride); |
| assert(Factor > 1 && "Invalid interleave factor"); |
| |
| Reverse = Stride < 0; |
| Members[0] = Instr; |
| } |
| |
| bool isReverse() const { return Reverse; } |
| unsigned getFactor() const { return Factor; } |
| unsigned getAlignment() const { return Align; } |
| unsigned getNumMembers() const { return Members.size(); } |
| |
| /// \brief Try to insert a new member \p Instr with index \p Index and |
  /// alignment \p NewAlign. The index is relative to the leader and it could
  /// be negative if \p Instr becomes the new leader.
| /// |
| /// \returns false if the instruction doesn't belong to the group. |
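  ///
  /// For example (illustrative): if the group was created for a load of
  /// A[i+1] (the leader, key 0), inserting a load of A[i+3] uses Index == 2
  /// (key 2), and inserting a load of A[i] uses Index == -1, making it the
  /// new leader (SmallestKey becomes -1). getIndex() then reports 0, 1, and
  /// 3 for A[i], A[i+1], and A[i+3], relative to the smallest key.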
| bool insertMember(Instruction *Instr, int Index, unsigned NewAlign) { |
| assert(NewAlign && "The new member's alignment should be non-zero"); |
| |
| int Key = Index + SmallestKey; |
| |
| // Skip if there is already a member with the same index. |
| if (Members.count(Key)) |
| return false; |
| |
| if (Key > LargestKey) { |
| // The largest index is always less than the interleave factor. |
| if (Index >= static_cast<int>(Factor)) |
| return false; |
| |
| LargestKey = Key; |
| } else if (Key < SmallestKey) { |
      // The distance between the largest and the new smallest key must stay
      // below the interleave factor.
| if (LargestKey - Key >= static_cast<int>(Factor)) |
| return false; |
| |
| SmallestKey = Key; |
| } |
| |
| // It's always safe to select the minimum alignment. |
| Align = std::min(Align, NewAlign); |
| Members[Key] = Instr; |
| return true; |
| } |
| |
| /// \brief Get the member with the given index \p Index |
| /// |
  /// \returns nullptr if the group contains no such member.
| Instruction *getMember(unsigned Index) const { |
| int Key = SmallestKey + Index; |
| if (!Members.count(Key)) |
| return nullptr; |
| |
| return Members.find(Key)->second; |
| } |
| |
| /// \brief Get the index for the given member. Unlike the key in the member |
| /// map, the index starts from 0. |
| unsigned getIndex(Instruction *Instr) const { |
| for (auto I : Members) |
| if (I.second == Instr) |
| return I.first - SmallestKey; |
| |
| llvm_unreachable("InterleaveGroup contains no such member"); |
| } |
| |
| Instruction *getInsertPos() const { return InsertPos; } |
| void setInsertPos(Instruction *Inst) { InsertPos = Inst; } |
| |
| /// Add metadata (e.g. alias info) from the instructions in this group to \p |
| /// NewInst. |
| /// |
  /// FIXME: this function currently does not add noalias metadata the way
  /// addNewMetadata does. To do that we need to compute the intersection of
  /// the noalias info from all members.
| void addMetadata(Instruction *NewInst) const { |
| SmallVector<Value *, 4> VL; |
| std::transform(Members.begin(), Members.end(), std::back_inserter(VL), |
| [](std::pair<int, Instruction *> p) { return p.second; }); |
| propagateMetadata(NewInst, VL); |
| } |
| |
| private: |
| unsigned Factor; // Interleave Factor. |
| bool Reverse; |
| unsigned Align; |
| DenseMap<int, Instruction *> Members; |
| int SmallestKey = 0; |
| int LargestKey = 0; |
| |
| // To avoid breaking dependences, vectorized instructions of an interleave |
| // group should be inserted at either the first load or the last store in |
| // program order. |
| // |
| // E.g. %even = load i32 // Insert Position |
| // %add = add i32 %even // Use of %even |
| // %odd = load i32 |
| // |
| // store i32 %even |
| // %odd = add i32 // Def of %odd |
| // store i32 %odd // Insert Position |
| Instruction *InsertPos; |
| }; |
| } // end namespace llvm |
| |
| namespace { |
| |
| /// \brief Drive the analysis of interleaved memory accesses in the loop. |
| /// |
| /// Use this class to analyze interleaved accesses only when we can vectorize |
/// a loop. Otherwise the analysis is meaningless, since vectorizing
/// interleaved accesses would be unsafe.
| /// |
| /// The analysis collects interleave groups and records the relationships |
| /// between the member and the group in a map. |
| class InterleavedAccessInfo { |
| public: |
| InterleavedAccessInfo(PredicatedScalarEvolution &PSE, Loop *L, |
| DominatorTree *DT, LoopInfo *LI) |
| : PSE(PSE), TheLoop(L), DT(DT), LI(LI) {} |
| |
| ~InterleavedAccessInfo() { |
| SmallSet<InterleaveGroup *, 4> DelSet; |
| // Avoid releasing a pointer twice. |
| for (auto &I : InterleaveGroupMap) |
| DelSet.insert(I.second); |
| for (auto *Ptr : DelSet) |
| delete Ptr; |
| } |
| |
| /// \brief Analyze the interleaved accesses and collect them in interleave |
| /// groups. Substitute symbolic strides using \p Strides. |
| void analyzeInterleaving(const ValueToValueMap &Strides); |
| |
| /// \brief Check if \p Instr belongs to any interleave group. |
| bool isInterleaved(Instruction *Instr) const { |
| return InterleaveGroupMap.count(Instr); |
| } |
| |
| /// \brief Get the interleave group that \p Instr belongs to. |
| /// |
  /// \returns nullptr if \p Instr does not belong to any group.
| InterleaveGroup *getInterleaveGroup(Instruction *Instr) const { |
| if (InterleaveGroupMap.count(Instr)) |
| return InterleaveGroupMap.find(Instr)->second; |
| return nullptr; |
| } |
| |
| /// \brief Returns true if an interleaved group that may access memory |
| /// out-of-bounds requires a scalar epilogue iteration for correctness. |
| bool requiresScalarEpilogue() const { return RequiresScalarEpilogue; } |
| |
| /// \brief Initialize the LoopAccessInfo used for dependence checking. |
| void setLAI(const LoopAccessInfo *Info) { LAI = Info; } |
| |
| private: |
| /// A wrapper around ScalarEvolution, used to add runtime SCEV checks. |
| /// Simplifies SCEV expressions in the context of existing SCEV assumptions. |
| /// The interleaved access analysis can also add new predicates (for example |
| /// by versioning strides of pointers). |
| PredicatedScalarEvolution &PSE; |
| |
| Loop *TheLoop; |
| DominatorTree *DT; |
| LoopInfo *LI; |
| const LoopAccessInfo *LAI = nullptr; |
| |
| /// True if the loop may contain non-reversed interleaved groups with |
| /// out-of-bounds accesses. We ensure we don't speculatively access memory |
| /// out-of-bounds by executing at least one scalar epilogue iteration. |
| bool RequiresScalarEpilogue = false; |
| |
| /// Holds the relationships between the members and the interleave group. |
| DenseMap<Instruction *, InterleaveGroup *> InterleaveGroupMap; |
| |
| /// Holds dependences among the memory accesses in the loop. It maps a source |
| /// access to a set of dependent sink accesses. |
| DenseMap<Instruction *, SmallPtrSet<Instruction *, 2>> Dependences; |
| |
| /// \brief The descriptor for a strided memory access. |
| struct StrideDescriptor { |
| StrideDescriptor() = default; |
| StrideDescriptor(int64_t Stride, const SCEV *Scev, uint64_t Size, |
| unsigned Align) |
| : Stride(Stride), Scev(Scev), Size(Size), Align(Align) {} |
| |
| // The access's stride. It is negative for a reverse access. |
| int64_t Stride = 0; |
| |
| // The scalar expression of this access. |
| const SCEV *Scev = nullptr; |
| |
| // The size of the memory object. |
| uint64_t Size = 0; |
| |
| // The alignment of this access. |
| unsigned Align = 0; |
| }; |
| |
| /// \brief A type for holding instructions and their stride descriptors. |
| using StrideEntry = std::pair<Instruction *, StrideDescriptor>; |
| |
| /// \brief Create a new interleave group with the given instruction \p Instr, |
| /// stride \p Stride and alignment \p Align. |
| /// |
| /// \returns the newly created interleave group. |
| InterleaveGroup *createInterleaveGroup(Instruction *Instr, int Stride, |
| unsigned Align) { |
| assert(!InterleaveGroupMap.count(Instr) && |
| "Already in an interleaved access group"); |
| InterleaveGroupMap[Instr] = new InterleaveGroup(Instr, Stride, Align); |
| return InterleaveGroupMap[Instr]; |
| } |
| |
| /// \brief Release the group and remove all the relationships. |
| void releaseGroup(InterleaveGroup *Group) { |
| for (unsigned i = 0; i < Group->getFactor(); i++) |
| if (Instruction *Member = Group->getMember(i)) |
| InterleaveGroupMap.erase(Member); |
| |
| delete Group; |
| } |
| |
| /// \brief Collect all the accesses with a constant stride in program order. |
| void collectConstStrideAccesses( |
| MapVector<Instruction *, StrideDescriptor> &AccessStrideInfo, |
| const ValueToValueMap &Strides); |
| |
| /// \brief Returns true if \p Stride is allowed in an interleaved group. |
| static bool isStrided(int Stride) { |
| unsigned Factor = std::abs(Stride); |
| return Factor >= 2 && Factor <= MaxInterleaveGroupFactor; |
| } |
| |
| /// \brief Returns true if \p BB is a predicated block. |
| bool isPredicated(BasicBlock *BB) const { |
| return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT); |
| } |
| |
| /// \brief Returns true if LoopAccessInfo can be used for dependence queries. |
| bool areDependencesValid() const { |
| return LAI && LAI->getDepChecker().getDependences(); |
| } |
| |
| /// \brief Returns true if memory accesses \p A and \p B can be reordered, if |
| /// necessary, when constructing interleaved groups. |
| /// |
| /// \p A must precede \p B in program order. We return false if reordering is |
| /// not necessary or is prevented because \p A and \p B may be dependent. |
| bool canReorderMemAccessesForInterleavedGroups(StrideEntry *A, |
| StrideEntry *B) const { |
| // Code motion for interleaved accesses can potentially hoist strided loads |
| // and sink strided stores. The code below checks the legality of the |
| // following two conditions: |
| // |
| // 1. Potentially moving a strided load (B) before any store (A) that |
| // precedes B, or |
| // |
| // 2. Potentially moving a strided store (A) after any load or store (B) |
| // that A precedes. |
| // |
| // It's legal to reorder A and B if we know there isn't a dependence from A |
| // to B. Note that this determination is conservative since some |
| // dependences could potentially be reordered safely. |
| |
| // A is potentially the source of a dependence. |
| auto *Src = A->first; |
| auto SrcDes = A->second; |
| |
| // B is potentially the sink of a dependence. |
| auto *Sink = B->first; |
| auto SinkDes = B->second; |
| |
| // Code motion for interleaved accesses can't violate WAR dependences. |
| // Thus, reordering is legal if the source isn't a write. |
| if (!Src->mayWriteToMemory()) |
| return true; |
| |
| // At least one of the accesses must be strided. |
| if (!isStrided(SrcDes.Stride) && !isStrided(SinkDes.Stride)) |
| return true; |
| |
| // If dependence information is not available from LoopAccessInfo, |
| // conservatively assume the instructions can't be reordered. |
| if (!areDependencesValid()) |
| return false; |
| |
| // If we know there is a dependence from source to sink, assume the |
| // instructions can't be reordered. Otherwise, reordering is legal. |
| return !Dependences.count(Src) || !Dependences.lookup(Src).count(Sink); |
| } |
| |
| /// \brief Collect the dependences from LoopAccessInfo. |
| /// |
| /// We process the dependences once during the interleaved access analysis to |
| /// enable constant-time dependence queries. |
| void collectDependences() { |
| if (!areDependencesValid()) |
| return; |
| auto *Deps = LAI->getDepChecker().getDependences(); |
| for (auto Dep : *Deps) |
| Dependences[Dep.getSource(*LAI)].insert(Dep.getDestination(*LAI)); |
| } |
| }; |
| |
| /// Utility class for getting and setting loop vectorizer hints in the form |
| /// of loop metadata. |
| /// This class keeps a number of loop annotations locally (as member variables) |
| /// and can, upon request, write them back as metadata on the loop. It will |
| /// initially scan the loop for existing metadata, and will update the local |
| /// values based on information in the loop. |
| /// We cannot write all values to metadata, as the mere presence of some info, |
| /// for example 'force', means a decision has been made. So, we need to be |
/// careful NOT to add them if the user hasn't specifically asked for them.
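///
/// For example (illustrative), a loop annotated with
/// "#pragma clang loop vectorize_width(4)" carries loop metadata of the form:
///   !0 = distinct !{!0, !1}
///   !1 = !{!"llvm.loop.vectorize.width", i32 4}
/// which this class reads back as Width == 4.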
| class LoopVectorizeHints { |
| enum HintKind { HK_WIDTH, HK_UNROLL, HK_FORCE, HK_ISVECTORIZED }; |
| |
| /// Hint - associates name and validation with the hint value. |
| struct Hint { |
| const char *Name; |
| unsigned Value; // This may have to change for non-numeric values. |
| HintKind Kind; |
| |
| Hint(const char *Name, unsigned Value, HintKind Kind) |
| : Name(Name), Value(Value), Kind(Kind) {} |
| |
| bool validate(unsigned Val) { |
| switch (Kind) { |
| case HK_WIDTH: |
| return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth; |
| case HK_UNROLL: |
| return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor; |
| case HK_FORCE: |
| return (Val <= 1); |
| case HK_ISVECTORIZED: |
| return (Val==0 || Val==1); |
| } |
| return false; |
| } |
| }; |
| |
| /// Vectorization width. |
| Hint Width; |
| |
| /// Vectorization interleave factor. |
| Hint Interleave; |
| |
| /// Vectorization forced |
| Hint Force; |
| |
| /// Already Vectorized |
| Hint IsVectorized; |
| |
| /// Return the loop metadata prefix. |
| static StringRef Prefix() { return "llvm.loop."; } |
| |
| /// True if there is any unsafe math in the loop. |
| bool PotentiallyUnsafe = false; |
| |
| public: |
| enum ForceKind { |
| FK_Undefined = -1, ///< Not selected. |
| FK_Disabled = 0, ///< Forcing disabled. |
| FK_Enabled = 1, ///< Forcing enabled. |
| }; |
| |
| LoopVectorizeHints(const Loop *L, bool DisableInterleaving, |
| OptimizationRemarkEmitter &ORE) |
| : Width("vectorize.width", VectorizerParams::VectorizationFactor, |
| HK_WIDTH), |
| Interleave("interleave.count", DisableInterleaving, HK_UNROLL), |
| Force("vectorize.enable", FK_Undefined, HK_FORCE), |
| IsVectorized("isvectorized", 0, HK_ISVECTORIZED), TheLoop(L), ORE(ORE) { |
| // Populate values with existing loop metadata. |
| getHintsFromMetadata(); |
| |
| // force-vector-interleave overrides DisableInterleaving. |
| if (VectorizerParams::isInterleaveForced()) |
| Interleave.Value = VectorizerParams::VectorizationInterleave; |
| |
| if (IsVectorized.Value != 1) |
| // If the vectorization width and interleaving count are both 1 then |
| // consider the loop to have been already vectorized because there's |
| // nothing more that we can do. |
| IsVectorized.Value = Width.Value == 1 && Interleave.Value == 1; |
| DEBUG(if (DisableInterleaving && Interleave.Value == 1) dbgs() |
| << "LV: Interleaving disabled by the pass manager\n"); |
| } |
| |
  /// Mark the loop L as already vectorized by writing the isvectorized hint
  /// into its loop metadata.
| void setAlreadyVectorized() { |
| IsVectorized.Value = 1; |
| Hint Hints[] = {IsVectorized}; |
| writeHintsToMetadata(Hints); |
| } |
| |
| bool allowVectorization(Function *F, Loop *L, bool AlwaysVectorize) const { |
| if (getForce() == LoopVectorizeHints::FK_Disabled) { |
| DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n"); |
| emitRemarkWithHints(); |
| return false; |
| } |
| |
| if (!AlwaysVectorize && getForce() != LoopVectorizeHints::FK_Enabled) { |
| DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n"); |
| emitRemarkWithHints(); |
| return false; |
| } |
| |
| if (getIsVectorized() == 1) { |
| DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n"); |
| // FIXME: Add interleave.disable metadata. This will allow |
| // vectorize.disable to be used without disabling the pass and errors |
| // to differentiate between disabled vectorization and a width of 1. |
| ORE.emit([&]() { |
| return OptimizationRemarkAnalysis(vectorizeAnalysisPassName(), |
| "AllDisabled", L->getStartLoc(), |
| L->getHeader()) |
| << "loop not vectorized: vectorization and interleaving are " |
| "explicitly disabled, or the loop has already been " |
| "vectorized"; |
| }); |
| return false; |
| } |
| |
| return true; |
| } |
| |
| /// Dumps all the hint information. |
| void emitRemarkWithHints() const { |
| using namespace ore; |
| |
| ORE.emit([&]() { |
| if (Force.Value == LoopVectorizeHints::FK_Disabled) |
| return OptimizationRemarkMissed(LV_NAME, "MissedExplicitlyDisabled", |
| TheLoop->getStartLoc(), |
| TheLoop->getHeader()) |
| << "loop not vectorized: vectorization is explicitly disabled"; |
| else { |
| OptimizationRemarkMissed R(LV_NAME, "MissedDetails", |
| TheLoop->getStartLoc(), |
| TheLoop->getHeader()); |
| R << "loop not vectorized"; |
| if (Force.Value == LoopVectorizeHints::FK_Enabled) { |
| R << " (Force=" << NV("Force", true); |
| if (Width.Value != 0) |
| R << ", Vector Width=" << NV("VectorWidth", Width.Value); |
| if (Interleave.Value != 0) |
| R << ", Interleave Count=" |
| << NV("InterleaveCount", Interleave.Value); |
| R << ")"; |
| } |
| return R; |
| } |
| }); |
| } |
| |
| unsigned getWidth() const { return Width.Value; } |
| unsigned getInterleave() const { return Interleave.Value; } |
| unsigned getIsVectorized() const { return IsVectorized.Value; } |
| enum ForceKind getForce() const { return (ForceKind)Force.Value; } |
| |
| /// \brief If hints are provided that force vectorization, use the AlwaysPrint |
| /// pass name to force the frontend to print the diagnostic. |
| const char *vectorizeAnalysisPassName() const { |
| if (getWidth() == 1) |
| return LV_NAME; |
| if (getForce() == LoopVectorizeHints::FK_Disabled) |
| return LV_NAME; |
| if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth() == 0) |
| return LV_NAME; |
| return OptimizationRemarkAnalysis::AlwaysPrint; |
| } |
| |
| bool allowReordering() const { |
    // When loop hints that enable vectorization are provided, we allow the
    // vectorizer to change the order of operations given by the scalar loop.
    // This is not enabled by default because it can be unsafe or inefficient.
    // For example,
| // reordering floating-point operations will change the way round-off |
| // error accumulates in the loop. |
| return getForce() == LoopVectorizeHints::FK_Enabled || getWidth() > 1; |
| } |
| |
| bool isPotentiallyUnsafe() const { |
| // Avoid FP vectorization if the target is unsure about proper support. |
| // This may be related to the SIMD unit in the target not handling |
| // IEEE 754 FP ops properly, or bad single-to-double promotions. |
| // Otherwise, a sequence of vectorized loops, even without reduction, |
| // could lead to different end results on the destination vectors. |
| return getForce() != LoopVectorizeHints::FK_Enabled && PotentiallyUnsafe; |
| } |
| |
| void setPotentiallyUnsafe() { PotentiallyUnsafe = true; } |
| |
| private: |
| /// Find hints specified in the loop metadata and update local values. |
| void getHintsFromMetadata() { |
| MDNode *LoopID = TheLoop->getLoopID(); |
| if (!LoopID) |
| return; |
| |
| // First operand should refer to the loop id itself. |
| assert(LoopID->getNumOperands() > 0 && "requires at least one operand"); |
| assert(LoopID->getOperand(0) == LoopID && "invalid loop id"); |
| |
| for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { |
| const MDString *S = nullptr; |
| SmallVector<Metadata *, 4> Args; |
| |
| // The expected hint is either a MDString or a MDNode with the first |
| // operand a MDString. |
| if (const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i))) { |
| if (!MD || MD->getNumOperands() == 0) |
| continue; |
| S = dyn_cast<MDString>(MD->getOperand(0)); |
| for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i) |
| Args.push_back(MD->getOperand(i)); |
| } else { |
| S = dyn_cast<MDString>(LoopID->getOperand(i)); |
| assert(Args.size() == 0 && "too many arguments for MDString"); |
| } |
| |
| if (!S) |
| continue; |
| |
| // Check if the hint starts with the loop metadata prefix. |
| StringRef Name = S->getString(); |
| if (Args.size() == 1) |
| setHint(Name, Args[0]); |
| } |
| } |
| |
  /// Checks a string hint with one operand and sets the value if valid.
| void setHint(StringRef Name, Metadata *Arg) { |
| if (!Name.startswith(Prefix())) |
| return; |
| Name = Name.substr(Prefix().size(), StringRef::npos); |
| |
| const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg); |
| if (!C) |
| return; |
| unsigned Val = C->getZExtValue(); |
| |
| Hint *Hints[] = {&Width, &Interleave, &Force, &IsVectorized}; |
| for (auto H : Hints) { |
| if (Name == H->Name) { |
| if (H->validate(Val)) |
| H->Value = Val; |
| else |
| DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n"); |
| break; |
| } |
| } |
| } |
| |
| /// Create a new hint from name / value pair. |
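|   /// For example, createHintMetadata("llvm.loop.vectorize.width", 4) |
|   /// produces the node !{!"llvm.loop.vectorize.width", i32 4}. |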
| MDNode *createHintMetadata(StringRef Name, unsigned V) const { |
| LLVMContext &Context = TheLoop->getHeader()->getContext(); |
| Metadata *MDs[] = {MDString::get(Context, Name), |
| ConstantAsMetadata::get( |
| ConstantInt::get(Type::getInt32Ty(Context), V))}; |
| return MDNode::get(Context, MDs); |
| } |
| |
| /// Matches metadata with hint name. |
| bool matchesHintMetadataName(MDNode *Node, ArrayRef<Hint> HintTypes) { |
| MDString *Name = dyn_cast<MDString>(Node->getOperand(0)); |
| if (!Name) |
| return false; |
| |
| for (auto H : HintTypes) |
| if (Name->getString().endswith(H.Name)) |
| return true; |
| return false; |
| } |
| |
| /// Sets current hints into loop metadata, keeping other values intact. |
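|   /// The first operand of the resulting LoopID refers to the node itself |
|   /// so that it is distinct, e.g. (a sketch): |
|   ///   !0 = distinct !{!0, !{!"llvm.loop.vectorize.width", i32 8}} |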
| void writeHintsToMetadata(ArrayRef<Hint> HintTypes) { |
| if (HintTypes.empty()) |
| return; |
| |
|     // Reserve the first element for the LoopID self-reference (see below). |
| SmallVector<Metadata *, 4> MDs(1); |
|     // If the loop already has metadata, keep the existing operands that are |
|     // not being updated by the hints below. |
| MDNode *LoopID = TheLoop->getLoopID(); |
| if (LoopID) { |
| for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { |
| MDNode *Node = cast<MDNode>(LoopID->getOperand(i)); |
|         // If the node is in the update list, ignore its old value. |
| if (!matchesHintMetadataName(Node, HintTypes)) |
| MDs.push_back(Node); |
| } |
| } |
| |
| // Now, add the missing hints. |
| for (auto H : HintTypes) |
| MDs.push_back(createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value)); |
| |
| // Replace current metadata node with new one. |
| LLVMContext &Context = TheLoop->getHeader()->getContext(); |
| MDNode *NewLoopID = MDNode::get(Context, MDs); |
| // Set operand 0 to refer to the loop id itself. |
| NewLoopID->replaceOperandWith(0, NewLoopID); |
| |
| TheLoop->setLoopID(NewLoopID); |
| } |
| |
| /// The loop these hints belong to. |
| const Loop *TheLoop; |
| |
| /// Interface to emit optimization remarks. |
| OptimizationRemarkEmitter &ORE; |
| }; |
| |
| } // end anonymous namespace |
| |
| static void emitMissedWarning(Function *F, Loop *L, |
| const LoopVectorizeHints &LH, |
| OptimizationRemarkEmitter *ORE) { |
| LH.emitRemarkWithHints(); |
| |
| if (LH.getForce() == LoopVectorizeHints::FK_Enabled) { |
| if (LH.getWidth() != 1) |
| ORE->emit(DiagnosticInfoOptimizationFailure( |
| DEBUG_TYPE, "FailedRequestedVectorization", |
| L->getStartLoc(), L->getHeader()) |
| << "loop not vectorized: " |
| << "failed explicitly specified loop vectorization"); |
| else if (LH.getInterleave() != 1) |
| ORE->emit(DiagnosticInfoOptimizationFailure( |
| DEBUG_TYPE, "FailedRequestedInterleaving", L->getStartLoc(), |
| L->getHeader()) |
| << "loop not interleaved: " |
| << "failed explicitly specified loop interleaving"); |
| } |
| } |
| |
| namespace { |
| |
| /// LoopVectorizationLegality checks if it is legal to vectorize a loop, and |
| /// to what vectorization factor. |
| /// This class does not look at the profitability of vectorization, only the |
| /// legality. This class has two main kinds of checks: |
| /// * Memory checks - The code in canVectorizeMemory checks if vectorization |
| /// will change the order of memory accesses in a way that will change the |
| /// correctness of the program. |
| /// * Scalars checks - The code in canVectorizeInstrs and canVectorizeMemory |
| /// checks for a number of different conditions, such as the availability of |
| /// a single induction variable and that all types are supported and |
| /// vectorizable, etc. This code reflects the capabilities of |
| /// InnerLoopVectorizer. |
| /// This class is also used by InnerLoopVectorizer for identifying the |
| /// induction variables and the different reduction variables. |
| class LoopVectorizationLegality { |
| public: |
| LoopVectorizationLegality( |
| Loop *L, PredicatedScalarEvolution &PSE, DominatorTree *DT, |
| TargetLibraryInfo *TLI, AliasAnalysis *AA, Function *F, |
| const TargetTransformInfo *TTI, |
| std::function<const LoopAccessInfo &(Loop &)> *GetLAA, LoopInfo *LI, |
| OptimizationRemarkEmitter *ORE, LoopVectorizationRequirements *R, |
| LoopVectorizeHints *H, DemandedBits *DB, AssumptionCache *AC) |
| : TheLoop(L), PSE(PSE), TLI(TLI), TTI(TTI), DT(DT), GetLAA(GetLAA), |
| ORE(ORE), InterleaveInfo(PSE, L, DT, LI), Requirements(R), Hints(H), |
| DB(DB), AC(AC) {} |
| |
| /// ReductionList contains the reduction descriptors for all |
| /// of the reductions that were found in the loop. |
| using ReductionList = DenseMap<PHINode *, RecurrenceDescriptor>; |
| |
| /// InductionList saves induction variables and maps them to the |
| /// induction descriptor. |
| using InductionList = MapVector<PHINode *, InductionDescriptor>; |
| |
| /// RecurrenceSet contains the phi nodes that are recurrences other than |
| /// inductions and reductions. |
| using RecurrenceSet = SmallPtrSet<const PHINode *, 8>; |
| |
| /// Returns true if it is legal to vectorize this loop. |
| /// This does not mean that it is profitable to vectorize this |
| /// loop, only that it is legal to do so. |
| bool canVectorize(); |
| |
| /// Returns the primary induction variable. |
| PHINode *getPrimaryInduction() { return PrimaryInduction; } |
| |
| /// Returns the reduction variables found in the loop. |
| ReductionList *getReductionVars() { return &Reductions; } |
| |
| /// Returns the induction variables found in the loop. |
| InductionList *getInductionVars() { return &Inductions; } |
| |
| /// Return the first-order recurrences found in the loop. |
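|   /// A first-order recurrence is a loop-carried value from the previous |
|   /// iteration that is neither an induction nor a reduction, e.g. |
|   /// (a sketch): |
|   ///   for (i = 1; i < n; ++i) B[i] = A[i] + A[i - 1]; |
|   /// where the value of A[i - 1] is carried across iterations via a phi. |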
| RecurrenceSet *getFirstOrderRecurrences() { return &FirstOrderRecurrences; } |
| |
| /// Return the set of instructions to sink to handle first-order recurrences. |
| DenseMap<Instruction *, Instruction *> &getSinkAfter() { return SinkAfter; } |
| |
| /// Returns the widest induction type. |
| Type *getWidestInductionType() { return WidestIndTy; } |
| |
| /// Returns True if V is a Phi node of an induction variable in this loop. |
| bool isInductionPhi(const Value *V); |
| |
| /// Returns True if V is a cast that is part of an induction def-use chain, |
| /// and had been proven to be redundant under a runtime guard (in other |
| /// words, the cast has the same SCEV expression as the induction phi). |
| bool isCastedInductionVariable(const Value *V); |
| |
|   /// Returns True if V can be considered as an induction variable in this |
|   /// loop. V can be the induction phi, or some redundant cast in the def-use |
|   /// chain of the induction phi. |
| bool isInductionVariable(const Value *V); |
| |
| /// Returns True if PN is a reduction variable in this loop. |
| bool isReductionVariable(PHINode *PN) { return Reductions.count(PN); } |
| |
| /// Returns True if Phi is a first-order recurrence in this loop. |
| bool isFirstOrderRecurrence(const PHINode *Phi); |
| |
| /// Return true if the block BB needs to be predicated in order for the loop |
| /// to be vectorized. |
| bool blockNeedsPredication(BasicBlock *BB); |
| |
|   /// Check if this pointer is consecutive when vectorizing. This happens |
|   /// when the last index of the GEP is the induction variable, or when the |
|   /// pointer itself is an induction variable. |
|   /// This check allows us to vectorize A[idx] into a wide load/store. |
| /// Returns: |
| /// 0 - Stride is unknown or non-consecutive. |
| /// 1 - Address is consecutive. |
| /// -1 - Address is consecutive, and decreasing. |
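|   /// For example (a sketch), in: |
|   ///   for (i = 0; i < n; ++i) A[i] = B[n - i]; |
|   /// the address &A[i] is consecutive (returns 1) while &B[n - i] is |
|   /// consecutive but decreasing (returns -1). |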
| /// NOTE: This method must only be used before modifying the original scalar |
| /// loop. Do not use after invoking 'createVectorizedLoopSkeleton' (PR34965). |
| int isConsecutivePtr(Value *Ptr); |
| |
| /// Returns true if the value V is uniform within the loop. |
| bool isUniform(Value *V); |
| |
|   /// Returns the information that we collected about runtime memory checks. |
| const RuntimePointerChecking *getRuntimePointerChecking() const { |
| return LAI->getRuntimePointerChecking(); |
| } |
| |
| const LoopAccessInfo *getLAI() const { return LAI; } |
| |
| /// \brief Check if \p Instr belongs to any interleaved access group. |
| bool isAccessInterleaved(Instruction *Instr) { |
| return InterleaveInfo.isInterleaved(Instr); |
| } |
| |
| /// \brief Get the interleaved access group that \p Instr belongs to. |
| const InterleaveGroup *getInterleavedAccessGroup(Instruction *Instr) { |
| return InterleaveInfo.getInterleaveGroup(Instr); |
| } |
| |
| /// \brief Returns true if an interleaved group requires a scalar iteration |
| /// to handle accesses with gaps. |
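|   /// For example (a sketch), loads of A[3 * i] and A[3 * i + 1] form a |
|   /// group of factor 3 with a gap at A[3 * i + 2]; the widened load covers |
|   /// the full factor and may read past the last accessed element, so the |
|   /// final iterations are left to a scalar epilogue. |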
| bool requiresScalarEpilogue() const { |
| return InterleaveInfo.requiresScalarEpilogue(); |
| } |
| |
| unsigned getMaxSafeDepDistBytes() { return LAI->getMaxSafeDepDistBytes(); } |
| |
| uint64_t getMaxSafeRegisterWidth() const { |
| return LAI->getDepChecker().getMaxSafeRegisterWidth(); |
| } |
| |
| bool hasStride(Value *V) { return LAI->hasStride(V); } |
| |
| /// Returns true if the target machine supports masked store operation |
| /// for the given \p DataType and kind of access to \p Ptr. |
| bool isLegalMaskedStore(Type *DataType, Value *Ptr) { |
| return isConsecutivePtr(Ptr) && TTI->isLegalMaskedStore(DataType); |
| } |
| |
| /// Returns true if the target machine supports masked load operation |
| /// for the given \p DataType and kind of access to \p Ptr. |
| bool isLegalMaskedLoad(Type *DataType, Value *Ptr) { |
| return isConsecutivePtr(Ptr) && TTI->isLegalMaskedLoad(DataType); |
| } |
| |
| /// Returns true if the target machine supports masked scatter operation |
| /// for the given \p DataType. |
| bool isLegalMaskedScatter(Type *DataType) { |
| return TTI->isLegalMaskedScatter(DataType); |
| } |
| |
| /// Returns true if the target machine supports masked gather operation |
| /// for the given \p DataType. |
| bool isLegalMaskedGather(Type *DataType) { |
| return TTI->isLegalMaskedGather(DataType); |
| } |
| |
| /// Returns true if the target machine can represent \p V as a masked gather |
| /// or scatter operation. |
| bool isLegalGatherOrScatter(Value *V) { |
| auto *LI = dyn_cast<LoadInst>(V); |
| auto *SI = dyn_cast<StoreInst>(V); |
| if (!LI && !SI) |
| return false; |
| auto *Ptr = getPointerOperand(V); |
| auto *Ty = cast<PointerType>(Ptr->getType())->getElementType(); |
| return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty)); |
| } |
| |
| /// Returns true if vector representation of the instruction \p I |
| /// requires mask. |
| bool isMaskRequired(const Instruction *I) { return (MaskedOp.count(I) != 0); } |
| |
| unsigned getNumStores() const { return LAI->getNumStores(); } |
| unsigned getNumLoads() const { return LAI->getNumLoads(); } |
| unsigned getNumPredStores() const { return NumPredStores; } |
| |
| /// Returns true if \p I is an instruction that will be scalarized with |
| /// predication. Such instructions include conditional stores and |
| /// instructions that may divide by zero. |
| bool isScalarWithPredication(Instruction *I); |
| |
| /// Returns true if \p I is a memory instruction with consecutive memory |
| /// access that can be widened. |
| bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1); |
| |
|   /// Returns true if the NoNaN attribute is set on the function. |
| bool hasFunNoNaNAttr() const { return HasFunNoNaNAttr; } |
| |
| private: |
| /// Check if a single basic block loop is vectorizable. |
| /// At this point we know that this is a loop with a constant trip count |
| /// and we only need to check individual instructions. |
| bool canVectorizeInstrs(); |
| |
|   /// When we vectorize loops we may change the order in which |
|   /// we read and write from memory. This method checks if it is |
|   /// legal to vectorize the code, considering only memory constraints. |
|   /// Returns true if the loop is vectorizable. |
| bool canVectorizeMemory(); |
| |
| /// Return true if we can vectorize this loop using the IF-conversion |
| /// transformation. |
| bool canVectorizeWithIfConvert(); |
| |
| /// Return true if all of the instructions in the block can be speculatively |
| /// executed. \p SafePtrs is a list of addresses that are known to be legal |
| /// and we know that we can read from them without segfault. |
| bool blockCanBePredicated(BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs); |
| |
| /// Updates the vectorization state by adding \p Phi to the inductions list. |
| /// This can set \p Phi as the main induction of the loop if \p Phi is a |
| /// better choice for the main induction than the existing one. |
| void addInductionPhi(PHINode *Phi, const InductionDescriptor &ID, |
| SmallPtrSetImpl<Value *> &AllowedExit); |
| |
| /// Create an analysis remark that explains why vectorization failed |
| /// |
| /// \p RemarkName is the identifier for the remark. If \p I is passed it is |
| /// an instruction that prevents vectorization. Otherwise the loop is used |
| /// for the location of the remark. \return the remark object that can be |
| /// streamed to. |
| OptimizationRemarkAnalysis |
| createMissedAnalysis(StringRef RemarkName, Instruction *I = nullptr) const { |
| return ::createMissedAnalysis(Hints->vectorizeAnalysisPassName(), |
| RemarkName, TheLoop, I); |
| } |
| |
|   /// \brief If an access has a symbolic stride, this maps the pointer value |
|   /// to the stride symbol. |
| const ValueToValueMap *getSymbolicStrides() { |
| // FIXME: Currently, the set of symbolic strides is sometimes queried before |
| // it's collected. This happens from canVectorizeWithIfConvert, when the |
| // pointer is checked to reference consecutive elements suitable for a |
| // masked access. |
| return LAI ? &LAI->getSymbolicStrides() : nullptr; |
| } |
| |
| unsigned NumPredStores = 0; |
| |
| /// The loop that we evaluate. |
| Loop *TheLoop; |
| |
| /// A wrapper around ScalarEvolution used to add runtime SCEV checks. |
| /// Applies dynamic knowledge to simplify SCEV expressions in the context |
| /// of existing SCEV assumptions. The analysis will also add a minimal set |
| /// of new predicates if this is required to enable vectorization and |
| /// unrolling. |
| PredicatedScalarEvolution &PSE; |
| |
| /// Target Library Info. |
| TargetLibraryInfo *TLI; |
| |
| /// Target Transform Info |
| const TargetTransformInfo *TTI; |
| |
| /// Dominator Tree. |
| DominatorTree *DT; |
| |
| // LoopAccess analysis. |
| std::function<const LoopAccessInfo &(Loop &)> *GetLAA; |
| |
| // And the loop-accesses info corresponding to this loop. This pointer is |
| // null until canVectorizeMemory sets it up. |
| const LoopAccessInfo *LAI = nullptr; |
| |
| /// Interface to emit optimization remarks. |
| OptimizationRemarkEmitter *ORE; |
| |
| /// The interleave access information contains groups of interleaved accesses |
| /// with the same stride and close to each other. |
| InterleavedAccessInfo InterleaveInfo; |
| |
| // --- vectorization state --- // |
| |
| /// Holds the primary induction variable. This is the counter of the |
| /// loop. |
| PHINode *PrimaryInduction = nullptr; |
| |
| /// Holds the reduction variables. |
| ReductionList Reductions; |
| |
| /// Holds all of the induction variables that we found in the loop. |
| /// Notice that inductions don't need to start at zero and that induction |
| /// variables can be pointers. |
| InductionList Inductions; |
| |
| /// Holds all the casts that participate in the update chain of the induction |
| /// variables, and that have been proven to be redundant (possibly under a |
| /// runtime guard). These casts can be ignored when creating the vectorized |
| /// loop body. |
| SmallPtrSet<Instruction *, 4> InductionCastsToIgnore; |
| |
| /// Holds the phi nodes that are first-order recurrences. |
| RecurrenceSet FirstOrderRecurrences; |
| |
| /// Holds instructions that need to sink past other instructions to handle |
| /// first-order recurrences. |
| DenseMap<Instruction *, Instruction *> SinkAfter; |
| |
| /// Holds the widest induction type encountered. |
| Type *WidestIndTy = nullptr; |
| |
| /// Allowed outside users. This holds the induction and reduction |
| /// vars which can be accessed from outside the loop. |
| SmallPtrSet<Value *, 4> AllowedExit; |
| |
| /// Can we assume the absence of NaNs. |
| bool HasFunNoNaNAttr = false; |
| |
| /// Vectorization requirements that will go through late-evaluation. |
| LoopVectorizationRequirements *Requirements; |
| |
| /// Used to emit an analysis of any legality issues. |
| LoopVectorizeHints *Hints; |
| |
|   /// The demanded bits analysis is used to compute the minimum type size in |
| /// which a reduction can be computed. |
| DemandedBits *DB; |
| |
| /// The assumption cache analysis is used to compute the minimum type size in |
| /// which a reduction can be computed. |
| AssumptionCache *AC; |
| |
| /// While vectorizing these instructions we have to generate a |
| /// call to the appropriate masked intrinsic |
| SmallPtrSet<const Instruction *, 8> MaskedOp; |
| }; |
| |
| /// LoopVectorizationCostModel - estimates the expected speedups due to |
| /// vectorization. |
| /// In many cases vectorization is not profitable. This can happen because of |
| /// a number of reasons. In this class we mainly attempt to predict the |
| /// expected speedup/slowdowns due to the supported instruction set. We use the |
| /// TargetTransformInfo to query the different backends for the cost of |
| /// different operations. |
| class LoopVectorizationCostModel { |
| public: |
| LoopVectorizationCostModel(Loop *L, PredicatedScalarEvolution &PSE, |
| LoopInfo *LI, LoopVectorizationLegality *Legal, |
| const TargetTransformInfo &TTI, |
| const TargetLibraryInfo *TLI, DemandedBits *DB, |
| AssumptionCache *AC, |
| OptimizationRemarkEmitter *ORE, const Function *F, |
| const LoopVectorizeHints *Hints) |
| : TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB), |
| AC(AC), ORE(ORE), TheFunction(F), Hints(Hints) {} |
| |
| /// \return An upper bound for the vectorization factor, or None if |
| /// vectorization should be avoided up front. |
| Optional<unsigned> computeMaxVF(bool OptForSize); |
| |
| /// Information about vectorization costs |
| struct VectorizationFactor { |
| // Vector width with best cost |
| unsigned Width; |
| |
| // Cost of the loop with that width |
| unsigned Cost; |
| }; |
| |
|   /// \return The most profitable vectorization factor and the cost of that |
|   /// VF. This method checks every power of two up to MaxVF. If UserVF is |
|   /// not zero then this vectorization factor will be selected if |
|   /// vectorization is possible. |
| VectorizationFactor selectVectorizationFactor(unsigned MaxVF); |
| |
| /// Setup cost-based decisions for user vectorization factor. |
| void selectUserVectorizationFactor(unsigned UserVF) { |
| collectUniformsAndScalars(UserVF); |
| collectInstsToScalarize(UserVF); |
| } |
| |
| /// \return The size (in bits) of the smallest and widest types in the code |
| /// that needs to be vectorized. We ignore values that remain scalar such as |
| /// 64 bit loop indices. |
| std::pair<unsigned, unsigned> getSmallestAndWidestTypes(); |
| |
| /// \return The desired interleave count. |
| /// If interleave count has been specified by metadata it will be returned. |
| /// Otherwise, the interleave count is computed and returned. VF and LoopCost |
| /// are the selected vectorization factor and the cost of the selected VF. |
| unsigned selectInterleaveCount(bool OptForSize, unsigned VF, |
| unsigned LoopCost); |
| |
|   /// A memory access instruction may be vectorized in more than one way. |
|   /// The form of the instruction after vectorization depends on cost. This |
|   /// function takes cost-based decisions for Load/Store instructions and |
|   /// collects them in a map. This decision map is used for building the |
|   /// lists of loop-uniform and loop-scalar instructions. The calculated |
|   /// cost is saved with the widening decision in order to avoid redundant |
|   /// calculations. |
| void setCostBasedWideningDecision(unsigned VF); |
| |
| /// \brief A struct that represents some properties of the register usage |
| /// of a loop. |
| struct RegisterUsage { |
| /// Holds the number of loop invariant values that are used in the loop. |
| unsigned LoopInvariantRegs; |
| |
| /// Holds the maximum number of concurrent live intervals in the loop. |
| unsigned MaxLocalUsers; |
| |
| /// Holds the number of instructions in the loop. |
| unsigned NumInstructions; |
| }; |
| |
| /// \return Returns information about the register usages of the loop for the |
| /// given vectorization factors. |
| SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs); |
| |
| /// Collect values we want to ignore in the cost model. |
| void collectValuesToIgnore(); |
| |
| /// \returns The smallest bitwidth each instruction can be represented with. |
| /// The vector equivalents of these instructions should be truncated to this |
| /// type. |
| const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const { |
| return MinBWs; |
| } |
| |
| /// \returns True if it is more profitable to scalarize instruction \p I for |
| /// vectorization factor \p VF. |
| bool isProfitableToScalarize(Instruction *I, unsigned VF) const { |
| assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1."); |
| auto Scalars = InstsToScalarize.find(VF); |
| assert(Scalars != InstsToScalarize.end() && |
| "VF not yet analyzed for scalarization profitability"); |
| return Scalars->second.count(I); |
| } |
| |
| /// Returns true if \p I is known to be uniform after vectorization. |
| bool isUniformAfterVectorization(Instruction *I, unsigned VF) const { |
| if (VF == 1) |
| return true; |
| assert(Uniforms.count(VF) && "VF not yet analyzed for uniformity"); |
| auto UniformsPerVF = Uniforms.find(VF); |
| return UniformsPerVF->second.count(I); |
| } |
| |
| /// Returns true if \p I is known to be scalar after vectorization. |
| bool isScalarAfterVectorization(Instruction *I, unsigned VF) const { |
| if (VF == 1) |
| return true; |
| assert(Scalars.count(VF) && "Scalar values are not calculated for VF"); |
| auto ScalarsPerVF = Scalars.find(VF); |
| return ScalarsPerVF->second.count(I); |
| } |
| |
| /// \returns True if instruction \p I can be truncated to a smaller bitwidth |
| /// for vectorization factor \p VF. |
| bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const { |
| return VF > 1 && MinBWs.count(I) && !isProfitableToScalarize(I, VF) && |
| !isScalarAfterVectorization(I, VF); |
| } |
| |
| /// Decision that was taken during cost calculation for memory instruction. |
| enum InstWidening { |
| CM_Unknown, |
| CM_Widen, // For consecutive accesses with stride +1. |
| CM_Widen_Reverse, // For consecutive accesses with stride -1. |
|     CM_Interleave,    // For accesses handled as an interleave group. |
|     CM_GatherScatter, // For accesses widened as gathers/scatters. |
|     CM_Scalarize      // For accesses replicated as scalar operations. |
| }; |
| |
| /// Save vectorization decision \p W and \p Cost taken by the cost model for |
| /// instruction \p I and vector width \p VF. |
| void setWideningDecision(Instruction *I, unsigned VF, InstWidening W, |
| unsigned Cost) { |
| assert(VF >= 2 && "Expected VF >=2"); |
| WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); |
| } |
| |
| /// Save vectorization decision \p W and \p Cost taken by the cost model for |
| /// interleaving group \p Grp and vector width \p VF. |
| void setWideningDecision(const InterleaveGroup *Grp, unsigned VF, |
| InstWidening W, unsigned Cost) { |
| assert(VF >= 2 && "Expected VF >=2"); |
|     // Broadcast this decision to all instructions inside the group. |
|     // But the cost will be assigned to one instruction only. |
| for (unsigned i = 0; i < Grp->getFactor(); ++i) { |
| if (auto *I = Grp->getMember(i)) { |
| if (Grp->getInsertPos() == I) |
| WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); |
| else |
| WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0); |
| } |
| } |
| } |
| |
| /// Return the cost model decision for the given instruction \p I and vector |
| /// width \p VF. Return CM_Unknown if this instruction did not pass |
| /// through the cost modeling. |
| InstWidening getWideningDecision(Instruction *I, unsigned VF) { |
| assert(VF >= 2 && "Expected VF >=2"); |
| std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF); |
| auto Itr = WideningDecisions.find(InstOnVF); |
| if (Itr == WideningDecisions.end()) |
| return CM_Unknown; |
| return Itr->second.first; |
| } |
| |
| /// Return the vectorization cost for the given instruction \p I and vector |
| /// width \p VF. |
| unsigned getWideningCost(Instruction *I, unsigned VF) { |
| assert(VF >= 2 && "Expected VF >=2"); |
| std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF); |
| assert(WideningDecisions.count(InstOnVF) && "The cost is not calculated"); |
| return WideningDecisions[InstOnVF].second; |
| } |
| |
| /// Return True if instruction \p I is an optimizable truncate whose operand |
| /// is an induction variable. Such a truncate will be removed by adding a new |
| /// induction variable with the destination type. |
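|   /// For example (a sketch), a truncate of the induction variable such as: |
|   ///   %trunc = trunc i64 %iv to i32 |
|   /// can be removed by creating a new i32 induction variable with the same |
|   /// start and step values. |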
| bool isOptimizableIVTruncate(Instruction *I, unsigned VF) { |
| // If the instruction is not a truncate, return false. |
| auto *Trunc = dyn_cast<TruncInst>(I); |
| if (!Trunc) |
| return false; |
| |
| // Get the source and destination types of the truncate. |
| Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF); |
| Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF); |
| |
| // If the truncate is free for the given types, return false. Replacing a |
| // free truncate with an induction variable would add an induction variable |
| // update instruction to each iteration of the loop. We exclude from this |
| // check the primary induction variable since it will need an update |
| // instruction regardless. |
| Value *Op = Trunc->getOperand(0); |
| if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy)) |
| return false; |
| |
| // If the truncated value is not an induction variable, return false. |
| return Legal->isInductionPhi(Op); |
| } |
| |
| /// Collects the instructions to scalarize for each predicated instruction in |
| /// the loop. |
| void collectInstsToScalarize(unsigned VF); |
| |
| /// Collect Uniform and Scalar values for the given \p VF. |
| /// The sets depend on CM decision for Load/Store instructions |
| /// that may be vectorized as interleave, gather-scatter or scalarized. |
| void collectUniformsAndScalars(unsigned VF) { |
| // Do the analysis once. |
| if (VF == 1 || Uniforms.count(VF)) |
| return; |
| setCostBasedWideningDecision(VF); |
| collectLoopUniforms(VF); |
| collectLoopScalars(VF); |
| } |
| |
| private: |
| /// \return An upper bound for the vectorization factor, larger than zero. |
| /// One is returned if vectorization should best be avoided due to cost. |
| unsigned computeFeasibleMaxVF(bool OptForSize, unsigned ConstTripCount); |
| |
|   /// The vectorization cost is a combination of the cost itself and a |
|   /// boolean indicating whether any of the contributing operations will |
|   /// actually operate on vector values after type legalization in the |
|   /// backend. If this latter value is false, then all operations will be |
|   /// scalarized (i.e. no vectorization has actually taken place). |
| using VectorizationCostTy = std::pair<unsigned, bool>; |
| |
| /// Returns the expected execution cost. The unit of the cost does |
| /// not matter because we use the 'cost' units to compare different |
| /// vector widths. The cost that is returned is *not* normalized by |
| /// the factor width. |
| VectorizationCostTy expectedCost(unsigned VF); |
| |
| /// Returns the execution time cost of an instruction for a given vector |
| /// width. Vector width of one means scalar. |
| VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF); |
| |
| /// The cost-computation logic from getInstructionCost which provides |
| /// the vector type as an output parameter. |
| unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy); |
| |
| /// Calculate vectorization cost of memory instruction \p I. |
| unsigned getMemoryInstructionCost(Instruction *I, unsigned VF); |
| |
| /// The cost computation for scalarized memory instruction. |
| unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF); |
| |
| /// The cost computation for interleaving group of memory instructions. |
| unsigned getInterleaveGroupCost(Instruction *I, unsigned VF); |
| |
| /// The cost computation for Gather/Scatter instruction. |
| unsigned getGatherScatterCost(Instruction *I, unsigned VF); |
| |
| /// The cost computation for widening instruction \p I with consecutive |
| /// memory access. |
| unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF); |
| |
| /// The cost calculation for Load instruction \p I with uniform pointer - |
| /// scalar load + broadcast. |
| unsigned getUniformMemOpCost(Instruction *I, unsigned VF); |
| |
|   /// Returns whether the instruction is a load or store and will be emitted |
|   /// as a vector operation. |
| bool isConsecutiveLoadOrStore(Instruction *I); |
| |
| /// Create an analysis remark that explains why vectorization failed |
| /// |
| /// \p RemarkName is the identifier for the remark. \return the remark object |
| /// that can be streamed to. |
| OptimizationRemarkAnalysis createMissedAnalysis(StringRef RemarkName) { |
| return ::createMissedAnalysis(Hints->vectorizeAnalysisPassName(), |
| RemarkName, TheLoop); |
| } |
| |
| /// Map of scalar integer values to the smallest bitwidth they can be legally |
| /// represented as. The vector equivalents of these values should be truncated |
| /// to this type. |
| MapVector<Instruction *, uint64_t> MinBWs; |
| |
| /// A type representing the costs for instructions if they were to be |
| /// scalarized rather than vectorized. The entries are Instruction-Cost |
| /// pairs. |
| using ScalarCostsTy = DenseMap<Instruction *, unsigned>; |
| |
|   /// A set containing all BasicBlocks that are known to be present after |
|   /// vectorization as predicated blocks. |
| SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization; |
| |
| /// A map holding scalar costs for different vectorization factors. The |
| /// presence of a cost for an instruction in the mapping indicates that the |
| /// instruction will be scalarized when vectorizing with the associated |
| /// vectorization factor. The entries are VF-ScalarCostTy pairs. |
| DenseMap<unsigned, ScalarCostsTy> InstsToScalarize; |
| |
| /// Holds the instructions known to be uniform after vectorization. |
| /// The data is collected per VF. |
| DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms; |
| |
| /// Holds the instructions known to be scalar after vectorization. |
| /// The data is collected per VF. |
| DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars; |
| |
| /// Holds the instructions (address computations) that are forced to be |
| /// scalarized. |
| DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars; |
| |
| /// Returns the expected difference in cost from scalarizing the expression |
| /// feeding a predicated instruction \p PredInst. The instructions to |
| /// scalarize and their scalar costs are collected in \p ScalarCosts. A |
| /// non-negative return value implies the expression will be scalarized. |
| /// Currently, only single-use chains are considered for scalarization. |
| int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, |
| unsigned VF); |
| |
| /// Collect the instructions that are uniform after vectorization. An |
| /// instruction is uniform if we represent it with a single scalar value in |
| /// the vectorized loop corresponding to each vector iteration. Examples of |
| /// uniform instructions include pointer operands of consecutive or |
| /// interleaved memory accesses. Note that although uniformity implies an |
| /// instruction will be scalar, the reverse is not true. In general, a |
| /// scalarized instruction will be represented by VF scalar values in the |
| /// vectorized loop, each corresponding to an iteration of the original |
| /// scalar loop. |
| void collectLoopUniforms(unsigned VF); |
| |
| /// Collect the instructions that are scalar after vectorization. An |
| /// instruction is scalar if it is known to be uniform or will be scalarized |
| /// during vectorization. Non-uniform scalarized instructions will be |
| /// represented by VF values in the vectorized loop, each corresponding to an |
| /// iteration of the original scalar loop. |
| void collectLoopScalars(unsigned VF); |
| |
| /// Keeps cost model vectorization decision and cost for instructions. |
| /// Right now it is used for memory instructions only. |
| using DecisionList = DenseMap<std::pair<Instruction *, unsigned>, |
| std::pair<InstWidening, unsigned>>; |
| |
| DecisionList WideningDecisions; |
| |
| public: |
| /// The loop that we evaluate. |
| Loop *TheLoop; |
| |
| /// Predicated scalar evolution analysis. |
| PredicatedScalarEvolution &PSE; |
| |
| /// Loop Info analysis. |
| LoopInfo *LI; |
| |
| /// Vectorization legality. |
| LoopVectorizationLegality *Legal; |
| |
| /// Vector target information. |
| const TargetTransformInfo &TTI; |
| |
| /// Target Library Info. |
| const TargetLibraryInfo *TLI; |
| |
| /// Demanded bits analysis. |
| DemandedBits *DB; |
| |
| /// Assumption cache. |
| AssumptionCache *AC; |
| |
| /// Interface to emit optimization remarks. |
| OptimizationRemarkEmitter *ORE; |
| |
| const Function *TheFunction; |
| |
| /// Loop Vectorize Hint. |
| const LoopVectorizeHints *Hints; |
| |
| /// Values to ignore in the cost model. |
| SmallPtrSet<const Value *, 16> ValuesToIgnore; |
| |
| /// Values to ignore in the cost model when VF > 1. |
| SmallPtrSet<const Value *, 16> VecValuesToIgnore; |
| }; |
| |
| } // end anonymous namespace |
| |
| namespace llvm { |
| |
| /// LoopVectorizationPlanner - drives the vectorization process after having |
| /// passed Legality checks. |
| /// The planner builds and optimizes the Vectorization Plans, which record |
| /// the decisions on how to vectorize the given loop. In particular, they |
| /// represent the control-flow of the vectorized version, the replication of |
| /// instructions that are to be scalarized, and the interleaved access |
| /// groups. |
| class LoopVectorizationPlanner { |
| /// The loop that we evaluate. |
| Loop *OrigLoop; |
| |
| /// Loop Info analysis. |
| LoopInfo *LI; |
| |
| /// Target Library Info. |
| const TargetLibraryInfo *TLI; |
| |
| /// Target Transform Info. |
| const TargetTransformInfo *TTI; |
| |
| /// The legality analysis. |
| LoopVectorizationLegality *Legal; |
| |
|   /// The profitability analysis. |
| LoopVectorizationCostModel &CM; |
| |
| using VPlanPtr = std::unique_ptr<VPlan>; |
| |
| SmallVector<VPlanPtr, 4> VPlans; |
| |
| /// This class is used to enable the VPlan to invoke a method of ILV. This is |
| /// needed until the method is refactored out of ILV and becomes reusable. |
| struct VPCallbackILV : public VPCallback { |
| InnerLoopVectorizer &ILV; |
| |
| VPCallbackILV(InnerLoopVectorizer &ILV) : ILV(ILV) {} |
| |
| Value *getOrCreateVectorValues(Value *V, unsigned Part) override { |
| return ILV.getOrCreateVectorValue(V, Part); |
| } |
| }; |
| |
| /// A builder used to construct the current plan. |
| VPBuilder Builder; |
| |
| /// When we if-convert we need to create edge masks. We have to cache values |
| /// so that we don't end up with exponential recursion/IR. Note that |
| /// if-conversion currently takes place during VPlan-construction, so these |
| /// caches are only used at that stage. |
| using EdgeMaskCacheTy = |
| DenseMap<std::pair<BasicBlock *, BasicBlock *>, VPValue *>; |
| using BlockMaskCacheTy = DenseMap<BasicBlock *, VPValue *>; |
| EdgeMaskCacheTy EdgeMaskCache; |
| BlockMaskCacheTy BlockMaskCache; |
| |
| unsigned BestVF = 0; |
| unsigned BestUF = 0; |
| |
| public: |
| LoopVectorizationPlanner(Loop *L, LoopInfo *LI, const TargetLibraryInfo *TLI, |
| const TargetTransformInfo *TTI, |
| LoopVectorizationLegality *Legal, |
| LoopVectorizationCostModel &CM) |
| : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM) {} |
| |
| /// Plan how to best vectorize, return the best VF and its cost. |
| LoopVectorizationCostModel::VectorizationFactor plan(bool OptForSize, |
| unsigned UserVF); |
| |
| /// Finalize the best decision and dispose of all other VPlans. |
| void setBestPlan(unsigned VF, unsigned UF); |
| |
| /// Generate the IR code for the body of the vectorized loop according to the |
| /// best selected VPlan. |
| void executePlan(InnerLoopVectorizer &LB, DominatorTree *DT); |
| |
| void printPlans(raw_ostream &O) { |
| for (const auto &Plan : VPlans) |
| O << *Plan; |
| } |
| |
| protected: |
| /// Collect the instructions from the original loop that would be trivially |
| /// dead in the vectorized loop if generated. |
| void collectTriviallyDeadInstructions( |
| SmallPtrSetImpl<Instruction *> &DeadInstructions); |
| |
| /// A range of powers-of-2 vectorization factors with fixed start and |
|   /// adjustable end. The range includes start and excludes end, e.g.: |
| /// [1, 9) = {1, 2, 4, 8} |
| struct VFRange { |
| // A power of 2. |
| const unsigned Start; |
| |
| // Need not be a power of 2. If End <= Start range is empty. |
| unsigned End; |
| }; |
| |
| /// Test a \p Predicate on a \p Range of VF's. Return the value of applying |
| /// \p Predicate on Range.Start, possibly decreasing Range.End such that the |
| /// returned value holds for the entire \p Range. |
| bool getDecisionAndClampRange(const std::function<bool(unsigned)> &Predicate, |
| VFRange &Range); |
| |
| /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive, |
| /// according to the information gathered by Legal when it checked if it is |
| /// legal to vectorize the loop. |
| void buildVPlans(unsigned MinVF, unsigned MaxVF); |
| |
| private: |
| /// A helper function that computes the predicate of the block BB, assuming |
| /// that the header block of the loop is set to True. It returns the *entry* |
| /// mask for the block BB. |
| VPValue *createBlockInMask(BasicBlock *BB, VPlanPtr &Plan); |
| |
| /// A helper function that computes the predicate of the edge between SRC |
| /// and DST. |
| VPValue *createEdgeMask(BasicBlock *Src, BasicBlock *Dst, VPlanPtr &Plan); |
| |
|   /// Check if \p I belongs to an Interleave Group within the given VF |
|   /// \p Range, \return true in the first returned value if so and false |
|   /// otherwise. Build a new VPInterleaveGroup Recipe if \p I is the primary |
|   /// member of an IG for \p Range.Start, and provide it as the second |
|   /// returned value. Note that if \p I is an adjunct member of an IG for |
|   /// \p Range.Start, the \return value is <true, nullptr>, as it is handled |
|   /// by another recipe. \p Range.End may be decreased to ensure the same |
|   /// decision from \p Range.Start to \p Range.End. |
| VPInterleaveRecipe *tryToInterleaveMemory(Instruction *I, VFRange &Range); |
| |
|   /// Check if \p I is a memory instruction to be widened for \p Range.Start |
|   /// and potentially masked. Such instructions are handled by a recipe that |
|   /// takes an additional VPInstruction for the mask. |
| VPWidenMemoryInstructionRecipe *tryToWidenMemory(Instruction *I, |
| VFRange &Range, |
| VPlanPtr &Plan); |
| |
|   /// Check if an induction recipe should be constructed for \p I within the |
|   /// given VF \p Range. If so build and return it. If not, return null. |
|   /// \p Range.End may be decreased to ensure the same decision from |
|   /// \p Range.Start to \p Range.End. |
| VPWidenIntOrFpInductionRecipe *tryToOptimizeInduction(Instruction *I, |
| VFRange &Range); |
| |
| /// Handle non-loop phi nodes. Currently all such phi nodes are turned into |
| /// a sequence of select instructions as the vectorizer currently performs |
| /// full if-conversion. |
| VPBlendRecipe *tryToBlend(Instruction *I, VPlanPtr &Plan); |
| |
| /// Check if \p I can be widened within the given VF \p Range. If \p I can be |
| /// widened for \p Range.Start, check if the last recipe of \p VPBB can be |
| /// extended to include \p I or else build a new VPWidenRecipe for it and |
| /// append it to \p VPBB. Return true if \p I can be widened for Range.Start, |
| /// false otherwise. Range.End may be decreased to ensure same decision from |
| /// \p Range.Start to \p Range.End. |
| bool tryToWiden(Instruction *I, VPBasicBlock *VPBB, VFRange &Range); |
| |
| /// Build a VPReplicationRecipe for \p I and enclose it within a Region if it |
| /// is predicated. \return \p VPBB augmented with this new recipe if \p I is |
| /// not predicated, otherwise \return a new VPBasicBlock that succeeds the new |
| /// Region. Update the packing decision of predicated instructions if they |
| /// feed \p I. Range.End may be decreased to ensure same recipe behavior from |
| /// \p Range.Start to \p Range.End. |
| VPBasicBlock *handleReplication( |
| Instruction *I, VFRange &Range, VPBasicBlock *VPBB, |
| DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe, |
| VPlanPtr &Plan); |
| |
| /// Create a replicating region for instruction \p I that requires |
| /// predication. \p PredRecipe is a VPReplicateRecipe holding \p I. |
| VPRegionBlock *createReplicateRegion(Instruction *I, VPRecipeBase *PredRecipe, |
| VPlanPtr &Plan); |
| |
| /// Build a VPlan according to the information gathered by Legal. \return a |
| /// VPlan for vectorization factors \p Range.Start and up to \p Range.End |
| /// exclusive, possibly decreasing \p Range.End. |
| VPlanPtr buildVPlan(VFRange &Range, |
| const SmallPtrSetImpl<Value *> &NeedDef); |
| }; |
| |
| } // end namespace llvm |
| |
| namespace { |
| |
| /// \brief This holds vectorization requirements that must be verified late in |
| /// the process. The requirements are set by the legality and cost-model |
| /// analyses. Once vectorization has been determined to be possible and |
| /// profitable, the requirements can be verified by looking for metadata or |
| /// compiler options. |
| /// For example, some loops require FP commutativity which is only allowed if |
| /// vectorization is explicitly specified or if the fast-math compiler option |
| /// has been provided. |
| /// Late evaluation of these requirements allows helpful diagnostics to be |
| /// composed that tell the user what needs to be done to vectorize the loop. |
| /// For example, by specifying #pragma clang loop vectorize or -ffast-math. |
| /// Late evaluation should be used only when diagnostics can be generated |
| /// that can be followed by a non-expert user. |
| class LoopVectorizationRequirements { |
| public: |
| LoopVectorizationRequirements(OptimizationRemarkEmitter &ORE) : ORE(ORE) {} |
| |
| void addUnsafeAlgebraInst(Instruction *I) { |
| // First unsafe algebra instruction. |
| if (!UnsafeAlgebraInst) |
| UnsafeAlgebraInst = I; |
| } |
| |
| void addRuntimePointerChecks(unsigned Num) { NumRuntimePointerChecks = Num; } |
| |
| bool doesNotMeet(Function *F, Loop *L, const LoopVectorizeHints &Hints) { |
| const char *PassName = Hints.vectorizeAnalysisPassName(); |
| bool Failed = false; |
| if (UnsafeAlgebraInst && !Hints.allowReordering()) { |
| ORE.emit([&]() { |
| return OptimizationRemarkAnalysisFPCommute( |
| PassName, "CantReorderFPOps", |
| UnsafeAlgebraInst->getDebugLoc(), |
| UnsafeAlgebraInst->getParent()) |
| << "loop not vectorized: cannot prove it is safe to reorder " |
| "floating-point operations"; |
| }); |
| Failed = true; |
| } |
| |
| // Test if runtime memcheck thresholds are exceeded. |
| bool PragmaThresholdReached = |
| NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold; |
| bool ThresholdReached = |
| NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold; |
| if ((ThresholdReached && !Hints.allowReordering()) || |
| PragmaThresholdReached) { |
| ORE.emit([&]() { |
| return OptimizationRemarkAnalysisAliasing(PassName, "CantReorderMemOps", |
| L->getStartLoc(), |
| L->getHeader()) |
| << "loop not vectorized: cannot prove it is safe to reorder " |
| "memory operations"; |
| }); |
| DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); |
| Failed = true; |
| } |
| |
| return Failed; |
| } |
| |
| private: |
| unsigned NumRuntimePointerChecks = 0; |
| Instruction *UnsafeAlgebraInst = nullptr; |
| |
| /// Interface to emit optimization remarks. |
| OptimizationRemarkEmitter &ORE; |
| }; |
| |
| } // end anonymous namespace |
| |
| static void addAcyclicInnerLoop(Loop &L, SmallVectorImpl<Loop *> &V) { |
| if (L.empty()) { |
| if (!hasCyclesInLoopBody(L)) |
| V.push_back(&L); |
| return; |
| } |
| for (Loop *InnerL : L) |
| addAcyclicInnerLoop(*InnerL, V); |
| } |
| |
| namespace { |
| |
| /// The LoopVectorize Pass. |
| struct LoopVectorize : public FunctionPass { |
| /// Pass identification, replacement for typeid |
| static char ID; |
| |
| LoopVectorizePass Impl; |
| |
| explicit LoopVectorize(bool NoUnrolling = false, bool AlwaysVectorize = true) |
| : FunctionPass(ID) { |
| Impl.DisableUnrolling = NoUnrolling; |
| Impl.AlwaysVectorize = AlwaysVectorize; |
| initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); |
| } |
| |
| bool runOnFunction(Function &F) override { |
| if (skipFunction(F)) |
| return false; |
| |
| auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); |
| auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); |
| auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); |
| auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); |
| auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); |
| auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); |
| auto *TLI = TLIP ? &TLIP->getTLI() : nullptr; |
| auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); |
| auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); |
| auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); |
| auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); |
| auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); |
| |
| std::function<const LoopAccessInfo &(Loop &)> GetLAA = |
| [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; |
| |
| return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, |
| GetLAA, *ORE); |
| } |
| |
| void getAnalysisUsage(AnalysisUsage &AU) const override { |
| AU.addRequired<AssumptionCacheTracker>(); |
| AU.addRequired<BlockFrequencyInfoWrapperPass>(); |
| AU.addRequired<DominatorTreeWrapperPass>(); |
| AU.addRequired<LoopInfoWrapperPass>(); |
| AU.addRequired<ScalarEvolutionWrapperPass>(); |
| AU.addRequired<TargetTransformInfoWrapperPass>(); |
| AU.addRequired<AAResultsWrapperPass>(); |
| AU.addRequired<LoopAccessLegacyAnalysis>(); |
| AU.addRequired<DemandedBitsWrapperPass>(); |
| AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); |
| AU.addPreserved<LoopInfoWrapperPass>(); |
| AU.addPreserved<DominatorTreeWrapperPass>(); |
| AU.addPreserved<BasicAAWrapperPass>(); |
| AU.addPreserved<GlobalsAAWrapperPass>(); |
| } |
| }; |
| |
| } // end anonymous namespace |
| |
| //===----------------------------------------------------------------------===// |
| // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and |
| // LoopVectorizationCostModel and LoopVectorizationPlanner. |
| //===----------------------------------------------------------------------===// |
| |
| Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { |
| // We need to place the broadcast of invariant variables outside the loop. |
| Instruction *Instr = dyn_cast<Instruction>(V); |
| bool NewInstr = (Instr && Instr->getParent() == LoopVectorBody); |
| bool Invariant = OrigLoop->isLoopInvariant(V) && !NewInstr; |
| |
| // Place the code for broadcasting invariant variables in the new preheader. |
| IRBuilder<>::InsertPointGuard Guard(Builder); |
| if (Invariant) |
| Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); |
| |
| // Broadcast the scalar into all locations in the vector. |
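|   // For VF = 4 this emits, e.g. (a sketch for an i32 value %v): |
|   //   %b.splatinsert = insertelement <4 x i32> undef, i32 %v, i32 0 |
|   //   %broadcast = shufflevector <4 x i32> %b.splatinsert, <4 x i32> undef, |
|   //                              <4 x i32> zeroinitializer |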
| Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); |
| |
| return Shuf; |
| } |
| |
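| // For an integer IV with start 0 and step 1 at VF = 4, the code below |
| // builds, e.g. (a sketch): |
| //   vector.body: |
| //     %vec.ind = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ], |
| //                              [ %vec.ind.next, %vector.body ] |
| //     ... |
| //     %vec.ind.next = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4> |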
| void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( |
| const InductionDescriptor &II, Value *Step, Instruction *EntryVal) { |
| Value *Start = II.getStartValue(); |
| |
|   // Construct the initial value of the vector IV in the vector loop |
|   // preheader. |
| auto CurrIP = Builder.saveIP(); |
| Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); |
| if (isa<TruncInst>(EntryVal)) { |
| assert(Start->getType()->isIntegerTy() && |
| "Truncation requires an integer type"); |
| auto *TruncType = cast<IntegerType>(EntryVal->getType()); |
| Step = Builder.CreateTrunc(Step, TruncType); |
| Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); |
| } |
| Value *SplatStart = Builder.CreateVectorSplat(VF, Start); |
| Value *SteppedStart = |
| getStepVector(SplatStart, 0, Step, II.getInductionOpcode()); |
| |
| // We create vector phi nodes for both integer and floating-point induction |
| // variables. Here, we determine the kind of arithmetic we will perform. |
| Instruction::BinaryOps AddOp; |
| Instruction::BinaryOps MulOp; |
| if (Step->getType()->isIntegerTy()) { |
| AddOp = Instruction::Add; |
| MulOp = Instruction::Mul; |
| } else { |
| AddOp = II.getInductionOpcode(); |
| MulOp = Instruction::FMul; |
| } |
| |
| // Multiply the vectorization factor by the step using integer or |
| // floating-point arithmetic as appropriate. |
| Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF); |
| Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF)); |
| |
| // Create a vector splat to use in the induction update. |
| // |
| // FIXME: If the step is non-constant, we create the vector splat with |
| // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't |
| // handle a constant vector splat. |
| Value *SplatVF = isa<Constant>(Mul) |
| ? ConstantVector::getSplat(VF, cast<Constant>(Mul)) |
| : Builder.CreateVectorSplat(VF, Mul); |
| Builder.restoreIP(CurrIP); |
| |
| // We may need to add the step a number of times, depending on the unroll |
| // factor. The last of those goes into the PHI. |
| PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", |
| &*LoopVectorBody->getFirstInsertionPt()); |
| Instruction *LastInduction = VecInd; |
| for (unsigned Part = 0; Part < UF; ++Part) { |
| VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction); |
| |
| if (isa<TruncInst>(EntryVal)) |
| addMetadata(LastInduction, EntryVal); |
| else |
| recordVectorLoopValueForInductionCast(II, LastInduction, Part); |
| |
| LastInduction = cast<Instruction>(addFastMathFlag( |
| Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"))); |
| } |
| |
| // Move the last step to the end of the latch block. This ensures consistent |
| // placement of all induction updates. |
| auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); |
| auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); |
| auto *ICmp = cast<Instruction>(Br->getCondition()); |
| LastInduction->moveBefore(ICmp); |
| LastInduction->setName("vec.ind.next"); |
| |
| VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); |
| VecInd->addIncoming(LastInduction, LoopVectorLatch); |
| } |
| |
| bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { |
| return Cost->isScalarAfterVectorization(I, VF) || |
| Cost->isProfitableToScalarize(I, VF); |
| } |
| |
| bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { |
| if (shouldScalarizeInstruction(IV)) |
| return true; |
| auto isScalarInst = [&](User *U) -> bool { |
| auto *I = cast<Instruction>(U); |
| return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); |
| }; |
| return llvm::any_of(IV->users(), isScalarInst); |
| } |
| |
| void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( |
| const InductionDescriptor &ID, Value *VectorLoopVal, unsigned Part, |
| unsigned Lane) { |
| const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); |
| if (Casts.empty()) |
| return; |
|   // Only the first Cast instruction in the Casts vector is of interest. |
|   // The rest of the Casts (if any) have no uses outside the |
|   // induction update chain itself. |
| Instruction *CastInst = *Casts.begin(); |
| if (Lane < UINT_MAX) |
| VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal); |
| else |
| VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal); |
| } |
| |
| void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { |
| assert((IV->getType()->isIntegerTy() || IV != OldInduction) && |
| "Primary induction variable must have an integer type"); |
| |
| auto II = Legal->getInductionVars()->find(IV); |
| assert(II != Legal->getInductionVars()->end() && "IV is not an induction"); |
| |
| auto ID = II->second; |
| assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); |
| |
| // The scalar value to broadcast. This will be derived from the canonical |
| // induction variable. |
| Value *ScalarIV = nullptr; |
| |
| // The value from the original loop to which we are mapping the new induction |
| // variable. |
| Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; |
| |
| // True if we have vectorized the induction variable. |
| auto VectorizedIV = false; |
| |
| // Determine if we want a scalar version of the induction variable. This is |
| // true if the induction variable itself is not widened, or if it has at |
| // least one user in the loop that is not widened. |
| auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal); |
| |
|   // Generate code for the induction step. Note that induction steps are |
|   // required to be loop-invariant. |
| assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) && |
| "Induction step should be loop invariant"); |
| auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); |
| Value *Step = nullptr; |
| if (PSE.getSE()->isSCEVable(IV->getType())) { |
| SCEVExpander Exp(*PSE.getSE(), DL, "induction"); |
| Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(), |
| LoopVectorPreHeader->getTerminator()); |
| } else { |
| Step = cast<SCEVUnknown>(ID.getStep())->getValue(); |
| } |
| |
| // Try to create a new independent vector induction variable. If we can't |
| // create the phi node, we will splat the scalar induction variable in each |
| // loop iteration. |
| if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) { |
| createVectorIntOrFpInductionPHI(ID, Step, EntryVal); |
| VectorizedIV = true; |
| } |
| |
| // If we haven't yet vectorized the induction variable, or if we will create |
| // a scalar one, we need to define the scalar induction variable and step |
| // values. If we were given a truncation type, truncate the canonical |
| // induction variable and step. Otherwise, derive these values from the |
| // induction descriptor. |
| if (!VectorizedIV || NeedsScalarIV) { |
| ScalarIV = Induction; |
| if (IV != OldInduction) { |
| ScalarIV = IV->getType()->isIntegerTy() |
| ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) |
| : Builder.CreateCast(Instruction::SIToFP, Induction, |
| IV->getType()); |
| ScalarIV = ID.transform(Builder, ScalarIV, PSE.getSE(), DL); |
| ScalarIV->setName("offset.idx"); |
| } |
| if (Trunc) { |
| auto *TruncType = cast<IntegerType>(Trunc->getType()); |
| assert(Step->getType()->isIntegerTy() && |
| "Truncation requires an integer step"); |
| ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); |
| Step = Builder.CreateTrunc(Step, TruncType); |
| } |
| } |
| |
| // If we haven't yet vectorized the induction variable, splat the scalar |
| // induction variable, and build the necessary step vectors. |
| // TODO: Don't do it unless the vectorized IV is really required. |
| if (!VectorizedIV) { |
| Value *Broadcasted = getBroadcastInstrs(ScalarIV); |
| for (unsigned Part = 0; Part < UF; ++Part) { |
| Value *EntryPart = |
| getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode()); |
| VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart); |
| if (Trunc) |
| addMetadata(EntryPart, Trunc); |
| else |
| recordVectorLoopValueForInductionCast(ID, EntryPart, Part); |
| } |
| } |
| |
| // If an induction variable is only used for counting loop iterations or |
| // calculating addresses, it doesn't need to be widened. Create scalar steps |
| // that can be used by instructions we will later scalarize. Note that the |
| // addition of the scalar steps will not increase the number of instructions |
| // in the loop in the common case prior to InstCombine. We will be trading |
| // one vector extract for each scalar step. |
| if (NeedsScalarIV) |
| buildScalarSteps(ScalarIV, Step, EntryVal, ID); |
| } |
| |
| Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, |
| Instruction::BinaryOps BinOp) { |
| // Create and check the types. |
| assert(Val->getType()->isVectorTy() && "Must be a vector"); |
| int VLen = Val->getType()->getVectorNumElements(); |
| |
| Type *STy = Val->getType()->getScalarType(); |
| assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && |
| "Induction Step must be an integer or FP"); |
| assert(Step->getType() == STy && "Step has wrong type"); |
| |
| SmallVector<Constant *, 8> Indices; |
| |
| if (STy->isIntegerTy()) { |
| // Create a vector of consecutive numbers, from StartIdx up to |
| // StartIdx + VLen - 1. |
| for (int i = 0; i < VLen; ++i) |
| Indices.push_back(ConstantInt::get(STy, StartIdx + i)); |
| |
| // Build the constant index vector and compute Val + Cv * Step. |
| Constant *Cv = ConstantVector::get(Indices); |
| assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); |
| Step = Builder.CreateVectorSplat(VLen, Step); |
| assert(Step->getType() == Val->getType() && "Invalid step vec"); |
| // FIXME: The newly created binary instructions should contain nsw/nuw flags, |
| // which can be found from the original scalar operations. |
| Step = Builder.CreateMul(Cv, Step); |
| return Builder.CreateAdd(Val, Step, "induction"); |
| } |
| |
| // Floating point induction. |
| assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && |
| "Binary Opcode should be specified for FP induction"); |
| // Create a vector of consecutive numbers, from StartIdx up to |
| // StartIdx + VLen - 1. |
| for (int i = 0; i < VLen; ++i) |
| Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i))); |
| |
| // Build the constant vector holding the consecutive indices. |
| Constant *Cv = ConstantVector::get(Indices); |
| |
| Step = Builder.CreateVectorSplat(VLen, Step); |
| |
| // Floating point operations had to be 'fast' to enable the induction. |
| FastMathFlags Flags; |
| Flags.setFast(); |
| |
| Value *MulOp = Builder.CreateFMul(Cv, Step); |
| // MulOp may be folded to a constant, in which case there is no instruction |
| // on which to set the fast-math flags. |
| if (isa<Instruction>(MulOp)) |
| cast<Instruction>(MulOp)->setFastMathFlags(Flags); |
| |
| Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); |
| if (isa<Instruction>(BOp)) |
| cast<Instruction>(BOp)->setFastMathFlags(Flags); |
| return BOp; |
| } |
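| |
| // Worked example (hypothetical values): with Val = splat(%x), StartIdx = 4, |
| // Step = %s and VLen = 4, the integer path above computes |
| // |
| // Cv = <i32 4, i32 5, i32 6, i32 7> |
| // result = splat(%x) + Cv * splat(%s) |
| // |
| // so lane L receives %x + (4 + L) * %s, i.e. the lane values of unroll |
| // part StartIdx / VLen = 1. The FP path is analogous, using the FAdd/FSub |
| // opcode recorded in the induction descriptor. |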
| |
| void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, |
| Value *EntryVal, |
| const InductionDescriptor &ID) { |
| // We shouldn't have to build scalar steps if we aren't vectorizing. |
| assert(VF > 1 && "VF should be greater than one"); |
| |
| // Get the value type and ensure it and the step have the same type. |
| Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); |
| assert(ScalarIVTy == Step->getType() && |
| "ScalarIV and Step should have the same type"); |
| |
| // We build scalar steps for both integer and floating-point induction |
| // variables. Here, we determine the kind of arithmetic we will perform. |
| Instruction::BinaryOps AddOp; |
| Instruction::BinaryOps MulOp; |
| if (ScalarIVTy->isIntegerTy()) { |
| AddOp = Instruction::Add; |
| MulOp = Instruction::Mul; |
| } else { |
| AddOp = ID.getInductionOpcode(); |
| MulOp = Instruction::FMul; |
| } |
| |
| // Determine the number of scalars we need to generate for each unroll |
| // iteration. If EntryVal is uniform, we only need to generate the first |
| // lane. Otherwise, we generate all VF values. |
| unsigned Lanes = |
| Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1 : VF; |
| // Compute the scalar steps and save the results in VectorLoopValueMap. |
| for (unsigned Part = 0; Part < UF; ++Part) { |
| for (unsigned Lane = 0; Lane < Lanes; ++Lane) { |
| auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane); |
| auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step)); |
| auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul)); |
| VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add); |
| recordVectorLoopValueForInductionCast(ID, Add, Part, Lane); |
| } |
| } |
| } |
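| |
| // Example (hypothetical values): with VF = 4, UF = 2 and a non-uniform |
| // EntryVal, the loop above emits the eight scalars |
| // |
| // ScalarIV + (4 * Part + Lane) * Step, Part in [0, 2), Lane in [0, 4) |
| // |
| // caching each under {Part, Lane}; a uniform EntryVal needs only lane 0 of |
| // each part. |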
| |
| int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { |
| const ValueToValueMap &Strides = |
| getSymbolicStrides() ? *getSymbolicStrides() : ValueToValueMap(); |
| |
| int Stride = getPtrStride(PSE, Ptr, TheLoop, Strides, /*Assume=*/true, |
| /*ShouldCheckWrap=*/false); |
| if (Stride == 1 || Stride == -1) |
| return Stride; |
| return 0; |
| } |
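| |
| // For instance, in a loop over i, a pointer like &A[i] yields 1, &A[n - i] |
| // yields -1 (a reverse access), and &A[2 * i] has stride 2 and is therefore |
| // reported as not consecutive (0). |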
| |
| bool LoopVectorizationLegality::isUniform(Value *V) { |
| return LAI->isUniform(V); |
| } |
| |
| Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { |
|