From cca077d9bb9d2dc25deb590e062fa5d6c69a6803 Mon Sep 17 00:00:00 2001 From: Lorenzo Albano Date: Wed, 15 Feb 2023 10:25:01 +0000 Subject: [PATCH 1/4] Enable the use of Vector Predication intrinsics in the loop vectorizer. Add new VP Recipes for the Explicit Vector Length (EVL) and add support for VP memory intrinsics (vp.load, vp.store, vp.gather, vp.scatter). --- .../llvm/Analysis/TargetTransformInfo.h | 11 ++ .../llvm/Analysis/TargetTransformInfoImpl.h | 16 ++ llvm/include/llvm/CodeGen/BasicTTIImpl.h | 16 ++ llvm/lib/Analysis/TargetTransformInfo.cpp | 6 + .../Target/RISCV/RISCVTargetTransformInfo.cpp | 34 ++++ .../Target/RISCV/RISCVTargetTransformInfo.h | 3 + .../Transforms/Vectorize/LoopVectorize.cpp | 165 ++++++++++++++---- llvm/lib/Transforms/Vectorize/VPlan.cpp | 17 ++ llvm/lib/Transforms/Vectorize/VPlan.h | 101 ++++++++--- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 110 +++++++++--- .../Transforms/Vectorize/VPlanTransforms.cpp | 23 ++- .../Transforms/Vectorize/VPlanTransforms.h | 3 +- llvm/lib/Transforms/Vectorize/VPlanValue.h | 1 + .../LoopVectorize/RISCV/vp_intrinsics.ll | 138 +++++++++++++++ .../Transforms/Vectorize/VPlanTest.cpp | 10 +- 15 files changed, 570 insertions(+), 84 deletions(-) create mode 100644 llvm/test/Transforms/LoopVectorize/RISCV/vp_intrinsics.ll diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index e2a127ff35be..fb7bf098f860 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -23,6 +23,7 @@ #include "llvm/ADT/SmallBitVector.h" #include "llvm/IR/FMF.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/PassManager.h" #include "llvm/Pass.h" @@ -1714,6 +1715,9 @@ class TargetTransformInfo { /// \return The maximum number of function arguments the target supports. 
unsigned getMaxNumArgs() const; + Value *computeVectorLength(IRBuilderBase &Builder, Value *AVL, + ElementCount VF) const; + /// @} private: @@ -2088,6 +2092,8 @@ class TargetTransformInfo::Concept { getVPLegalizationStrategy(const VPIntrinsic &PI) const = 0; virtual bool hasArmWideBranch(bool Thumb) const = 0; virtual unsigned getMaxNumArgs() const = 0; + virtual Value *computeVectorLength(IRBuilderBase &Builder, Value *AVL, + ElementCount VF) const = 0; }; template @@ -2815,6 +2821,11 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { unsigned getMaxNumArgs() const override { return Impl.getMaxNumArgs(); } + + Value *computeVectorLength(IRBuilderBase &Builder, Value *AVL, + ElementCount VF) const override { + return Impl.computeVectorLength(Builder, AVL, VF); + } }; template diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 1d8f523e9792..4195dcaa6394 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -19,6 +19,7 @@ #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/GetElementPtrTypeIterator.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Operator.h" #include "llvm/IR/PatternMatch.h" @@ -908,6 +909,21 @@ class TargetTransformInfoImplBase { unsigned getMaxNumArgs() const { return UINT_MAX; } + Value *computeVectorLength(IRBuilderBase &Builder, Value *AVL, + ElementCount VF) const { + if (!VF.isScalable()) { + return ConstantInt::get(Builder.getInt32Ty(), VF.getFixedValue()); + } + + Constant *EC = + ConstantInt::get(Builder.getInt32Ty(), VF.getKnownMinValue()); + Value *VLMax = Builder.CreateVScale(EC, "vlmax"); + Value *VL = Builder.CreateZExtOrTrunc(AVL, Builder.getInt32Ty(), "vl"); + + return Builder.CreateIntrinsic(Builder.getInt32Ty(), Intrinsic::umin, + {VLMax, VL}, nullptr, "evl"); + } + protected: // Obtain the minimum required size to hold the value (without the sign) // In case of a vector it returns the min required size for one element. 
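The default implementation of the new hook above is target-independent: for a fixed VF it folds to a constant, and for a scalable VF it clamps the remaining trip count against vscale * MinVF. A minimal caller-side sketch of how a transform would consume the hook; the helper name and the Remaining parameter are illustrative only and not part of the patch:

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/IRBuilder.h"

using namespace llvm;

// Materialize the explicit vector length for the current iteration from the
// number of elements still to be processed. With the default implementation
// this emits umin(vscale * MinVF, trunc(Remaining)) for scalable VFs and a
// plain constant for fixed VFs; targets such as RISC-V override the hook (see
// the RISCVTTIImpl change below) to emit llvm.riscv.vsetvli instead.
static Value *materializeEVL(IRBuilderBase &Builder,
                             const TargetTransformInfo &TTI, Value *Remaining,
                             ElementCount VF) {
  return TTI.computeVectorLength(Builder, Remaining, VF);
}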
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index 5e7bdcdf72a4..0a9b2cfd266a 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -35,6 +35,7 @@ #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" @@ -2558,6 +2559,21 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { InstructionCost getVectorSplitCost() { return 1; } + Value *computeVectorLength(IRBuilderBase &Builder, Value *AVL, + ElementCount VF) const { + if (!VF.isScalable()) { + return ConstantInt::get(Builder.getInt32Ty(), VF.getFixedValue()); + } + + Constant *EC = + ConstantInt::get(Builder.getInt32Ty(), VF.getKnownMinValue()); + Value *VLMax = Builder.CreateVScale(EC, "vlmax"); + Value *VL = Builder.CreateZExtOrTrunc(AVL, Builder.getInt32Ty(), "vl"); + + return Builder.CreateIntrinsic(Builder.getInt32Ty(), Intrinsic::umin, + {VLMax, VL}, nullptr, "evl"); + } + /// @} }; diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index 3f76dfdaac31..b79047627d1f 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1264,6 +1264,12 @@ bool TargetTransformInfo::hasActiveVectorLength(unsigned Opcode, Type *DataType, return TTIImpl->hasActiveVectorLength(Opcode, DataType, Alignment); } +Value *TargetTransformInfo::computeVectorLength(IRBuilderBase &Builder, + Value *AVL, + ElementCount VF) const { + return TTIImpl->computeVectorLength(Builder, AVL, VF); +} + TargetTransformInfo::Concept::~Concept() = default; TargetIRAnalysis::TargetIRAnalysis() : TTICallback(&getDefaultTTI) {} diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index 1a9abaea8111..8f25709d95fd 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -14,6 +14,7 @@ #include "llvm/CodeGen/CostTable.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicsRISCV.h" #include #include using namespace llvm; @@ -1848,3 +1849,36 @@ bool RISCVTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1, C2.NumIVMuls, C2.NumBaseAdds, C2.ScaleCost, C2.ImmCost, C2.SetupCost); } + +Value *RISCVTTIImpl::computeVectorLength(IRBuilderBase &Builder, Value *AVL, + ElementCount VF) const { + // Maps a VF to a (SEW, LMUL) pair. + // NOTE: we assume ELEN = 64. + const std::map> + VFToSEWLMUL = {{1, {3, 0}}, {2, {3, 1}}, {4, {3, 2}}, {8, {3, 3}}, + {16, {2, 3}}, {32, {1, 3}}, {64, {0, 3}}}; + + assert(AVL->getType()->isIntegerTy() && + "Requested vector length should be an integer."); + assert(VFToSEWLMUL.find(VF.getKnownMinValue()) != VFToSEWLMUL.end() && + "Invalid value for LMUL argument."); + auto VFToSEWLMULVal = VFToSEWLMUL.at(VF.getKnownMinValue()); + + Value *AVLArg = Builder.CreateZExtOrTrunc(AVL, Builder.getInt64Ty()); + Constant *SEWArg = + ConstantInt::get(Builder.getInt64Ty(), VFToSEWLMULVal.first); + Constant *LMULArg = + ConstantInt::get(Builder.getInt64Ty(), VFToSEWLMULVal.second); + Value *EVLRes = + Builder.CreateIntrinsic(Intrinsic::riscv_vsetvli, {AVLArg->getType()}, + {AVLArg, SEWArg, LMULArg}, nullptr, "vl"); + + // NOTE: evl type is required to be i32. 
+ Value *EVL = Builder.CreateZExtOrTrunc(EVLRes, Builder.getInt32Ty()); + if (!VF.isScalable()) { + EVL = Builder.CreateBinaryIntrinsic( + Intrinsic::umin, + ConstantInt::get(Builder.getInt32Ty(), VF.getFixedValue()), EVL); + } + return EVL; +} diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h index d2592be75000..cd30f16fc6c0 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -377,6 +377,9 @@ class RISCVTTIImpl : public BasicTTIImplBase { bool shouldFoldTerminatingConditionAfterLSR() const { return true; } + + Value *computeVectorLength(IRBuilderBase &Builder, Value *AVL, + ElementCount VF) const; }; } // end namespace llvm diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index b590fb4685a3..02cf0aaef5fa 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -411,6 +411,10 @@ static constexpr uint32_t MemCheckBypassWeights[] = {1, 127}; // after prolog. See `emitIterationCountCheck`. static constexpr uint32_t MinItersBypassWeights[] = {1, 127}; +cl::opt UseVectorPredicationIntrinsics( + "use-vp-intrinsics", cl::init(false), cl::Hidden, + cl::desc("Use Vector Predication intrinsics during vectorization.")); + /// A helper function that returns true if the given type is irregular. The /// type is irregular if its allocated size doesn't equal the store size of an /// element of the corresponding vector type. @@ -2792,6 +2796,11 @@ InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) { if (VectorTripCount) return VectorTripCount; + // With VP intrinsics, we require tail-folding by masking; this way, we + // operate on a number of elements equal to the original loop trip count. + if (UseVectorPredicationIntrinsics) + return VectorTripCount = getTripCount(); + Value *TC = getTripCount(); IRBuilder<> Builder(InsertBlock->getTerminator()); @@ -2828,6 +2837,7 @@ InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) { // the step does not evenly divide the trip count, no adjustment is necessary // since there will already be scalar iterations. Note that the minimum // iterations check ensures that N >= Step. + // TODO: we should probably honor the cost model also with VP intrinsics. 
if (Cost->requiresScalarEpilogue(VF.isVector())) { auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); R = Builder.CreateSelect(IsZero, Step, R); @@ -6316,9 +6326,12 @@ LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, } bool Reverse = ConsecutiveStride < 0; - if (Reverse) + if (Reverse) { + if (UseVectorPredicationIntrinsics) + return InstructionCost::getInvalid(); Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, std::nullopt, CostKind, 0); + } return Cost; } @@ -8234,12 +8247,13 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, Reverse || Decision == LoopVectorizationCostModel::CM_Widen; if (LoadInst *Load = dyn_cast(I)) - return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask, - Consecutive, Reverse); + return new VPWidenMemoryInstructionRecipe( + *Load, Operands[0], Mask, Plan->getEVLPhi(), Consecutive, Reverse); StoreInst *Store = cast(I); return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0], - Mask, Consecutive, Reverse); + Mask, Plan->getEVLPhi(), + Consecutive, Reverse); } /// Creates a VPWidenIntOrFpInductionRecpipe for \p Phi. If needed, it will also @@ -8257,10 +8271,12 @@ createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc, VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE); if (auto *TruncI = dyn_cast(PhiOrTrunc)) { - return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI); + return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI, + Plan.getEVLPhi()); } assert(isa(PhiOrTrunc) && "must be a phi node here"); - return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc); + return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, + Plan.getEVLPhi()); } VPRecipeBase *VPRecipeBuilder::tryToOptimizeInductionPHI( @@ -8692,32 +8708,64 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, // Add the necessary canonical IV and branch recipes required to control the // loop. -static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW, - DebugLoc DL) { - Value *StartIdx = ConstantInt::get(IdxTy, 0); - auto *StartV = Plan.getVPValueOrAddLiveIn(StartIdx); +static VPInstruction *addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, + bool HasNUW, DebugLoc DL, + const TargetTransformInfo *TTI) { + VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); + VPBasicBlock *Header = TopRegion->getEntryBasicBlock(); + + // Add the EVL recipe, used to calculate the correct IV increment. + VPEVLPHIRecipe *EVLRecipe = nullptr; + // TODO: TTI should be able to indicate if a target prefers vector predication + // intrinsics. + if (UseVectorPredicationIntrinsics) { + EVLRecipe = new VPEVLPHIRecipe(Plan.getTripCount(), TTI); + Header->insert(EVLRecipe, Header->begin()); + } // Add a VPCanonicalIVPHIRecipe starting at 0 to the header. + Value *StartIdx = ConstantInt::get(IdxTy, 0); + auto *StartV = Plan.getVPValueOrAddLiveIn(StartIdx); auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL); - VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); - VPBasicBlock *Header = TopRegion->getEntryBasicBlock(); Header->insert(CanonicalIVPHI, Header->begin()); // Add a CanonicalIVIncrement{NUW} VPInstruction to increment the scalar - // IV by VF * UF. - auto *CanonicalIVIncrement = + // IV either by VF * UF or by the EVL values. 
+ VPInstruction *CanonicalIVIncrement = nullptr; + if (EVLRecipe) + CanonicalIVIncrement = + new VPInstruction(Instruction::Add, {CanonicalIVPHI, EVLRecipe}, + {HasNUW, false}, DL, "index.next"); + else + CanonicalIVIncrement = new VPInstruction(Instruction::Add, {CanonicalIVPHI, &Plan.getVFxUF()}, {HasNUW, false}, DL, "index.next"); + CanonicalIVPHI->addOperand(CanonicalIVIncrement); + // If we are working with vector predication instrinsics, add a NextEVL + // VPInstruction to calculate the remaining elements number. + VPInstruction *NextEVL = nullptr; + if (EVLRecipe) { + NextEVL = + new VPInstruction(VPInstruction::NextEVL, + {EVLRecipe, CanonicalIVIncrement}, DL, "evl.next"); + EVLRecipe->addOperand(NextEVL); + } + VPBasicBlock *EB = TopRegion->getExitingBasicBlock(); EB->appendRecipe(CanonicalIVIncrement); + if (NextEVL) { + EB->appendRecipe(NextEVL); + } // Add the BranchOnCount VPInstruction to the latch. VPInstruction *BranchBack = new VPInstruction(VPInstruction::BranchOnCount, {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); EB->appendRecipe(BranchBack); + + return NextEVL; } // Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the @@ -8807,7 +8855,8 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { // When not folding the tail, we know that the induction increment will not // overflow. bool HasNUW = Style == TailFoldingStyle::None; - addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL); + auto *NextEVL = addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), + HasNUW, DL, &TTI); // Proactively create header mask. Masks for other blocks are created on // demand. @@ -8982,7 +9031,7 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { bool WithoutRuntimeCheck = Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck; VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow, - WithoutRuntimeCheck); + WithoutRuntimeCheck, NextEVL); } return Plan; } @@ -9022,7 +9071,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { // is guaranteed to not wrap. 
bool HasNUW = true; addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, - DebugLoc()); + DebugLoc(), &TTI); return Plan; } @@ -9529,7 +9578,7 @@ lowerStoreUsingVectorIntrinsics(IRBuilderBase &Builder, Value *Addr, } else { VectorBuilder VBuilder(Builder); VBuilder.setEVL(EVLPart).setMask(Mask); - Call = cast(VBuilder.createVectorInstruction( + Call = cast(VBuilder.createVectorInstructionFromOpcode( Instruction::Store, Type::getVoidTy(EVLPart->getContext()), {StoredVal, Addr})); } @@ -9553,7 +9602,7 @@ static Instruction *lowerLoadUsingVectorIntrinsics(IRBuilderBase &Builder, } else { VectorBuilder VBuilder(Builder); VBuilder.setEVL(EVLPart).setMask(Mask); - Call = cast(VBuilder.createVectorInstruction( + Call = cast(VBuilder.createVectorInstructionFromOpcode( Instruction::Load, DataTy, Addr, "vp.op.load")); } Call->addParamAttr( @@ -9580,8 +9629,15 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { auto &Builder = State.Builder; InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF); - bool isMaskRequired = getMask(); - if (isMaskRequired) { + VPValue *VPMask = getMask(); + VPValue *VPEVL = getEVL(); + if (VPEVL && (!VPMask || (isa(VPMask) && + dyn_cast(VPMask)->getOpcode() == + VPInstruction::ActiveLaneMask))) { + auto *MaskTy = VectorType::get(Builder.getInt1Ty(), State.VF); + for (unsigned Part = 0; Part < State.UF; ++Part) + BlockInMaskParts[Part] = ConstantInt::getTrue(MaskTy); + } else if (VPMask) { // Mask reversal is only neede for non-all-one (null) masks, as reverse of a // null all-one mask is a null mask. for (unsigned Part = 0; Part < State.UF; ++Part) { @@ -9623,7 +9679,14 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { PartPtr = Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane, "", InBounds); } else { - Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Part); + Value *Increment = nullptr; + if (VPEVL) { + Increment = Builder.getInt32(0); // EVL is always an i32. + for (unsigned int P = 0; P < Part; P++) + Increment = Builder.CreateAdd(Increment, State.get(VPEVL, P)); + } else { + Increment = createStepForVF(Builder, IndexTy, State.VF, Part); + } PartPtr = Builder.CreateGEP(ScalarDataTy, Ptr, Increment, "", InBounds); } @@ -9631,7 +9694,7 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { }; auto MaskValue = [&](unsigned Part) -> Value * { - if (isMaskRequired) + if (VPMask) return BlockInMaskParts[Part]; return nullptr; }; @@ -9659,10 +9722,19 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { StoredVal, CreateGatherScatter, MaskValue(Part), EVLPart, Alignment); } else if (CreateGatherScatter) { - Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; + Value *MaskPart = + (VPMask || VPEVL) ? 
BlockInMaskParts[Part] : nullptr; Value *VectorGep = State.get(getAddr(), Part); - NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, - MaskPart); + if (VPEVL) { + auto *PtrsTy = cast(VectorGep->getType()); + Value *Operands[] = {StoredVal, VectorGep, MaskPart, + State.get(VPEVL, Part)}; + NewSI = Builder.CreateIntrinsic(Intrinsic::vp_scatter, + {DataTy, PtrsTy}, Operands); + } else { + NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, + MaskPart); + } } else { if (isReverse()) { // If we store to reverse consecutive memory locations, then we need @@ -9673,11 +9745,17 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { } auto *VecPtr = CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); - if (isMaskRequired) + if (VPEVL) { + Value *Operands[] = {StoredVal, VecPtr, BlockInMaskParts[Part], + State.get(VPEVL, Part)}; + NewSI = Builder.CreateIntrinsic( + Intrinsic::vp_store, {DataTy, VecPtr->getType()}, Operands); + } else if (VPMask) { NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, BlockInMaskParts[Part]); - else + } else { NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); + } } State.addMetadata(NewSI, SI); } @@ -9704,21 +9782,37 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { : CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))), CreateGatherScatter, MaskValue(Part), EVLPart, Alignment); } else if (CreateGatherScatter) { - Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; + Value *MaskPart = + (VPMask || VPEVL) ? BlockInMaskParts[Part] : nullptr; Value *VectorGep = State.get(getAddr(), Part); - NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart, - nullptr, "wide.masked.gather"); + if (VPEVL) { + auto *PtrsTy = cast(VectorGep->getType()); + Value *Operands[] = {VectorGep, MaskPart, State.get(VPEVL, Part)}; + NewLI = Builder.CreateIntrinsic(Intrinsic::vp_gather, {DataTy, PtrsTy}, + Operands, nullptr, "vp.gather"); + } else { + NewLI = + Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart, + nullptr, "wide.masked.gather"); + } State.addMetadata(NewLI, LI); } else { auto *VecPtr = CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); - if (isMaskRequired) + if (VPEVL) { + Value *Operands[] = {VecPtr, BlockInMaskParts[Part], + State.get(VPEVL, Part)}; + NewLI = Builder.CreateIntrinsic(Intrinsic::vp_load, + {DataTy, VecPtr->getType()}, Operands, + nullptr, "vp.load"); + } else if (VPMask) { NewLI = Builder.CreateMaskedLoad( DataTy, VecPtr, Alignment, BlockInMaskParts[Part], PoisonValue::get(DataTy), "wide.masked.load"); - else + } else { NewLI = Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); + } // Add metadata to the load, but setVectorValue to the reverse shuffle. State.addMetadata(NewLI, LI); @@ -10516,6 +10610,11 @@ LoopVectorizeResult LoopVectorizePass::runImpl( PreservedAnalyses LoopVectorizePass::run(Function &F, FunctionAnalysisManager &AM) { + assert((!UseVectorPredicationIntrinsics || + PreferPredicateOverEpilogue == + PreferPredicateTy::PredicateOrDontVectorize) && + "Tail folding required when using VP intrinsics."); + auto &LI = AM.getResult(F); // There are no loops in the function. Return before computing other expensive // analyses. 
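The emission pattern used above for the VP memory intrinsics is the same in all four cases: the data and pointer types are passed as overload types, and the (mask, evl) pair is appended to the operand list. A condensed sketch of the consecutive load/store case, with helper names and parameters that are illustrative only:

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"

using namespace llvm;

// Emit llvm.vp.load / llvm.vp.store for a consecutive access. Mask is expected
// to be an all-true vector when only the EVL carries the predication, which
// mirrors the BlockInMaskParts handling in
// VPWidenMemoryInstructionRecipe::execute above.
static Value *emitVPLoad(IRBuilderBase &Builder, Type *DataTy, Value *VecPtr,
                         Value *Mask, Value *EVL) {
  Value *Ops[] = {VecPtr, Mask, EVL};
  return Builder.CreateIntrinsic(Intrinsic::vp_load,
                                 {DataTy, VecPtr->getType()}, Ops,
                                 /*FMFSource=*/nullptr, "vp.load");
}

static void emitVPStore(IRBuilderBase &Builder, Value *StoredVal,
                        Value *VecPtr, Value *Mask, Value *EVL) {
  Value *Ops[] = {StoredVal, VecPtr, Mask, EVL};
  Builder.CreateIntrinsic(Intrinsic::vp_store,
                          {StoredVal->getType(), VecPtr->getType()}, Ops);
}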
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 1d7df9c9575a..0ac8d43acb11 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -708,6 +708,16 @@ VPlan::~VPlan() { delete BackedgeTakenCount; } +VPEVLPHIRecipe *VPlan::getEVLPhi() { + VPBasicBlock *Header = getVectorLoopRegion()->getEntryBasicBlock(); + for (VPRecipeBase &R : Header->phis()) { + if (isa(&R)) + return cast(&R); + } + + return nullptr; +} + VPlanPtr VPlan::createInitialVPlan(const SCEV *TripCount, ScalarEvolution &SE) { VPBasicBlock *Preheader = new VPBasicBlock("ph"); VPBasicBlock *VecPreheader = new VPBasicBlock("vector.ph"); @@ -821,6 +831,13 @@ void VPlan::execute(VPTransformState *State) { } auto *PhiR = cast(&R); + if (auto *EVLPhi = dyn_cast(PhiR)) { + PHINode *Phi = EVLPhi->getPhi(); + Phi->addIncoming(State->get(EVLPhi->getBackedgeValue(), State->UF - 1), + VectorLatchBB); + continue; + } + // For canonical IV, first-order recurrences and in-order reduction phis, // only a single part is generated, which provides the last part from the // previous iteration. For non-ordered reductions all UF parts are diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 0ca668abbe60..ab1d4b73aa62 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -771,10 +771,10 @@ class VPRecipeBase : public ilist_node_with_parent, /// Returns the underlying instruction, if the recipe is a VPValue or nullptr /// otherwise. Instruction *getUnderlyingInstr() { - return cast(getVPSingleValue()->getUnderlyingValue()); + return cast_or_null(getVPSingleValue()->getUnderlyingValue()); } const Instruction *getUnderlyingInstr() const { - return cast(getVPSingleValue()->getUnderlyingValue()); + return cast_or_null(getVPSingleValue()->getUnderlyingValue()); } /// Method to support type inquiry through isa, cast, and dyn_cast. @@ -1069,7 +1069,8 @@ class VPInstruction : public VPRecipeWithIRFlags, public VPValue { // Increment the canonical IV separately for each unrolled part. CanonicalIVIncrementForPart, BranchOnCount, - BranchOnCond + BranchOnCond, + NextEVL }; private: @@ -1452,20 +1453,28 @@ class VPWidenIntOrFpInductionRecipe : public VPHeaderPHIRecipe { TruncInst *Trunc; const InductionDescriptor &IndDesc; + void addEVL(VPValue *EVLRecipe) { + if (EVLRecipe) + addOperand(EVLRecipe); + } + public: VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, VPValue *Step, - const InductionDescriptor &IndDesc) + const InductionDescriptor &IndDesc, + VPValue *EVLRecipe) : VPHeaderPHIRecipe(VPDef::VPWidenIntOrFpInductionSC, IV, Start), IV(IV), Trunc(nullptr), IndDesc(IndDesc) { addOperand(Step); + addEVL(EVLRecipe); } VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, VPValue *Step, const InductionDescriptor &IndDesc, - TruncInst *Trunc) + TruncInst *Trunc, VPValue *EVLRecipe) : VPHeaderPHIRecipe(VPDef::VPWidenIntOrFpInductionSC, Trunc, Start), IV(IV), Trunc(Trunc), IndDesc(IndDesc) { addOperand(Step); + addEVL(EVLRecipe); } ~VPWidenIntOrFpInductionRecipe() override = default; @@ -1500,6 +1509,12 @@ class VPWidenIntOrFpInductionRecipe : public VPHeaderPHIRecipe { VPValue *getStepValue() { return getOperand(1); } const VPValue *getStepValue() const { return getOperand(1); } + /// Return the EVL value of the current loop iteration. + VPValue *getEVL() { return getNumOperands() == 3 ? 
getOperand(2) : nullptr; } + const VPValue *getEVL() const { + return getNumOperands() == 3 ? getOperand(2) : nullptr; + } + /// Returns the first defined value as TruncInst, if it is one or nullptr /// otherwise. TruncInst *getTruncInst() { return Trunc; } @@ -1988,8 +2003,8 @@ class VPPredInstPHIRecipe : public VPRecipeBase, public VPValue { /// A Recipe for widening load/store operations. /// The recipe uses the following VPValues: -/// - For load: Address, optional mask -/// - For store: Address, stored value, optional mask +/// - For load: Address, optional mask, optional evl +/// - For store: Address, stored value, optional mask, optional evl /// TODO: We currently execute only per-part unless a specific instance is /// provided. class VPWidenMemoryInstructionRecipe : public VPRecipeBase { @@ -2001,33 +2016,41 @@ class VPWidenMemoryInstructionRecipe : public VPRecipeBase { // Whether the consecutive loaded/stored addresses are in reverse order. bool Reverse; - void setMask(VPValue *Mask) { - if (!Mask) - return; - addOperand(Mask); - } + // Whether the instruction has a not all-ones mask. + bool Masked = false; + + // Whether a vector length is available to the instruction. + bool HasVL = false; - bool isMasked() const { - return isStore() ? getNumOperands() == 3 : getNumOperands() == 2; + void setMaskAndEVL(VPValue *Mask, VPValue *VPEVL) { + if (Mask) { + this->Masked = true; + addOperand(Mask); + } + + if (VPEVL) { + this->HasVL = true; + addOperand(VPEVL); + } } public: VPWidenMemoryInstructionRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask, - bool Consecutive, bool Reverse) + VPValue *EVL, bool Consecutive, bool Reverse) : VPRecipeBase(VPDef::VPWidenMemoryInstructionSC, {Addr}), Ingredient(Load), Consecutive(Consecutive), Reverse(Reverse) { assert((Consecutive || !Reverse) && "Reverse implies consecutive"); new VPValue(this, &Load); - setMask(Mask); + setMaskAndEVL(Mask, EVL); } VPWidenMemoryInstructionRecipe(StoreInst &Store, VPValue *Addr, VPValue *StoredValue, VPValue *Mask, - bool Consecutive, bool Reverse) + VPValue *EVL, bool Consecutive, bool Reverse) : VPRecipeBase(VPDef::VPWidenMemoryInstructionSC, {Addr, StoredValue}), Ingredient(Store), Consecutive(Consecutive), Reverse(Reverse) { assert((Consecutive || !Reverse) && "Reverse implies consecutive"); - setMask(Mask); + setMaskAndEVL(Mask, EVL); } VP_CLASSOF_IMPL(VPDef::VPWidenMemoryInstructionSC) @@ -2040,8 +2063,15 @@ class VPWidenMemoryInstructionRecipe : public VPRecipeBase { /// Return the mask used by this recipe. Note that a full mask is represented /// by a nullptr. VPValue *getMask() const { - // Mask is optional and therefore the last operand. - return isMasked() ? getOperand(getNumOperands() - 1) : nullptr; + return Masked ? (HasVL ? getOperand(getNumOperands() - 2) + : getOperand(getNumOperands() - 1)) + : nullptr; + } + + /// Return the evl used by this recipe. If we are working with full-length + /// vectors, return nullptr. + VPValue *getEVL() const { + return HasVL ? getOperand(getNumOperands() - 1) : nullptr; } /// Returns true if this recipe is a store. 
@@ -2190,6 +2220,33 @@ class VPActiveLaneMaskPHIRecipe : public VPHeaderPHIRecipe { #endif }; +class VPEVLPHIRecipe : public VPHeaderPHIRecipe { + const TargetTransformInfo *TTI; + PHINode *Phi = nullptr; + +public: + VPEVLPHIRecipe(VPValue *StartEVL, const TargetTransformInfo *TTI) + : VPHeaderPHIRecipe(VPDef::VPWidenEVLSC, nullptr, StartEVL), TTI(TTI) {} + + ~VPEVLPHIRecipe() override = default; + + VP_CLASSOF_IMPL(VPDef::VPWidenEVLSC) + + PHINode *getPhi() const { return Phi; } + + static inline bool classof(const VPHeaderPHIRecipe *D) { + return D->getVPDefID() == VPDef::VPWidenEVLSC; + } + + void execute(VPTransformState &State) override; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif +}; + /// A recipe for generating the phi node for the current index of elements, /// adjusted in accordance with EVL value. It starts at StartIV value and gets /// incremented by EVL in each iteration of the vector loop. @@ -2795,6 +2852,10 @@ class VPlan { return cast(&*EntryVPBB->begin()); } + /// Find and return the VPEVLPHIRecipe from the header - there should be only + /// one at most. If there isn't one, then return nullptr. + VPEVLPHIRecipe *getEVLPhi(); + void addLiveOut(PHINode *PN, VPValue *V); void removeLiveOut(PHINode *PN) { diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 5e0344a14df5..0f4cbe097064 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -38,6 +38,7 @@ using VectorParts = SmallVector; namespace llvm { extern cl::opt EnableVPlanNativePath; } +extern cl::opt UseVectorPredicationIntrinsics; #define LV_NAME "loop-vectorize" #define DEBUG_TYPE LV_NAME @@ -439,6 +440,21 @@ Value *VPInstruction::generateInstruction(VPTransformState &State, Builder.GetInsertBlock()->getTerminator()->eraseFromParent(); return CondBr; } + case VPInstruction::NextEVL: { + Value *Next = nullptr; + if (Part == 0) { + auto *EVLRecipe = cast(getOperand(0)); + Value *StartEVL = State.get(EVLRecipe->getOperand(0), 0); + Value *IVIncrement = State.get(getOperand(1), 0); + + Next = Builder.CreateSub(StartEVL, IVIncrement, "evl.next"); + } else { + Next = State.get(this, 0); + } + + State.set(this, Next, Part); + break; + } default: llvm_unreachable("Unsupported opcode for instruction"); } @@ -521,6 +537,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, case VPInstruction::BranchOnCount: O << "branch-on-count"; break; + case VPInstruction::NextEVL: + O << "next-evl"; + break; default: O << Instruction::getOpcodeName(getOpcode()); } @@ -968,24 +987,27 @@ void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { MulOp = Instruction::FMul; } - // Multiply the vectorization factor by the step using integer or - // floating-point arithmetic as appropriate. - Type *StepType = Step->getType(); - Value *RuntimeVF; - if (Step->getType()->isFloatingPointTy()) - RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF); - else - RuntimeVF = getRuntimeVF(Builder, StepType, State.VF); - Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); - - // Create a vector splat to use in the induction update. - // - // FIXME: If the step is non-constant, we create the vector splat with - // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't - // handle a constant vector splat. - Value *SplatVF = isa(Mul) - ? 
ConstantVector::getSplat(State.VF, cast(Mul)) - : Builder.CreateVectorSplat(State.VF, Mul); + Value *SplatVF = nullptr; + if (!getEVL()) { + // Multiply the vectorization factor by the step using integer or + // floating-point arithmetic as appropriate. + Type *StepType = Step->getType(); + Value *RuntimeVF; + if (Step->getType()->isFloatingPointTy()) + RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF); + else + RuntimeVF = getRuntimeVF(Builder, StepType, State.VF); + Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); + + // Create a vector splat to use in the induction update. + // + // FIXME: If the step is non-constant, we create the vector splat with + // IRBuilder. IRBuilder can constant-fold the multiply, but it + // doesn't handle a constant vector splat. + SplatVF = isa(Mul) + ? ConstantVector::getSplat(State.VF, cast(Mul)) + : Builder.CreateVectorSplat(State.VF, Mul); + } Builder.restoreIP(CurrIP); // We may need to add the step a number of times, depending on the unroll @@ -1000,8 +1022,26 @@ void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { if (isa(EntryVal)) State.addMetadata(LastInduction, EntryVal); - LastInduction = cast( - Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); + if (auto *EVLRecipe = getEVL()) { + // Ensure the types match. + Type *DestTy = LastInduction->getType()->getScalarType(); + Value *EVL = State.get(EVLRecipe, Part); + if (DestTy->isIntegerTy()) { + EVL = Builder.CreateZExtOrTrunc(EVL, DestTy); + } else { + assert(DestTy->isFloatingPointTy()); + EVL = Builder.CreateUIToFP(EVL, DestTy); + } + // Multiply the EVL by the step using integer or floating-point + // arithmetic as appropriate. + Value *Mul = Builder.CreateBinOp(MulOp, Step, EVL); + Value *SplatEVL = Builder.CreateVectorSplat(State.VF, Mul); + LastInduction = cast( + Builder.CreateBinOp(AddOp, LastInduction, SplatEVL, "step.add.vl")); + } else { + LastInduction = cast( + Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); + } LastInduction->setDebugLoc(EntryVal->getDebugLoc()); } @@ -1033,6 +1073,9 @@ void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent, #endif bool VPWidenIntOrFpInductionRecipe::isCanonical() const { + if (getEVL()) + return false; + // The step may be defined by a recipe in the preheader (e.g. if it requires // SCEV expansion), but for the canonical induction the step is required to be // 1, which is represented as live-in. 
@@ -1770,3 +1813,30 @@ void VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent, printOperands(O, SlotTracker); } #endif + +void VPEVLPHIRecipe::execute(VPTransformState &State) { + Value *StartEVL = State.get(getOperand(0), 0); + BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); + this->Phi = State.Builder.CreatePHI(StartEVL->getType(), 2, "evl.phi"); + this->Phi->addIncoming(StartEVL, VectorPH); + + Value *PrevEVL = State.Builder.CreateZExtOrTrunc( + cast(this->Phi), State.Builder.getInt32Ty(), "evl.phi.cast"); + Value *EVL = nullptr; + for (unsigned Part = 0; Part < State.UF; Part++) { + if (EVL) + PrevEVL = State.Builder.CreateSub(PrevEVL, EVL); + EVL = TTI->computeVectorLength(State.Builder, PrevEVL, State.VF); + State.set(this, EVL, Part); + } +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPEVLPHIRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "EVL-PHI "; + printAsOperand(O, SlotTracker); + O << " = phi "; + printOperands(O, SlotTracker); +} +#endif diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 466259cb196c..22dc894babc4 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -53,7 +53,8 @@ void VPlanTransforms::VPInstructionsToVPRecipes( VPValue *Start = Plan->getVPValueOrAddLiveIn(II->getStartValue()); VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(*Plan, II->getStep(), SE); - NewRecipe = new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, *II); + NewRecipe = new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, *II, + Plan->getEVLPhi()); } else { Plan->addVPValue(Phi, VPPhi); continue; @@ -66,11 +67,12 @@ void VPlanTransforms::VPInstructionsToVPRecipes( if (LoadInst *Load = dyn_cast(Inst)) { NewRecipe = new VPWidenMemoryInstructionRecipe( *Load, Ingredient.getOperand(0), nullptr /*Mask*/, - false /*Consecutive*/, false /*Reverse*/); + nullptr /*EVL*/, false /*Consecutive*/, false /*Reverse*/); } else if (StoreInst *Store = dyn_cast(Inst)) { NewRecipe = new VPWidenMemoryInstructionRecipe( *Store, Ingredient.getOperand(1), Ingredient.getOperand(0), - nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/); + nullptr /*Mask*/, nullptr /*EVL*/, false /*Consecutive*/, + false /*Reverse*/); } else if (GetElementPtrInst *GEP = dyn_cast(Inst)) { NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands()); } else if (CallInst *CI = dyn_cast(Inst)) { @@ -1040,7 +1042,8 @@ void VPlanTransforms::optimize(VPlan &Plan, ScalarEvolution &SE) { // branch-on-cond %Negated // static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch( - VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck) { + VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck, + VPInstruction *NextEVL) { VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); VPBasicBlock *EB = TopRegion->getExitingBasicBlock(); auto *CanonicalIVPHI = Plan.getCanonicalIV(); @@ -1066,6 +1069,9 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch( // When the loop is guarded by a runtime overflow check for the loop // induction variable increment by VF, we can increment the value before // the get.active.lane mask and use the unmodified tripcount. 
+ if (NextEVL) { + EB->insert(NextEVL, EB->end()--); + } IncrementValue = CanonicalIVIncrement; TripCount = TC; } else { @@ -1102,6 +1108,10 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch( "active.lane.mask.next"); LaneMaskPhi->addOperand(ALM); + if (DataAndControlFlowWithoutRuntimeCheck && NextEVL) { + EB->insert(NextEVL, EB->end()--); + } + // Replace the original terminator with BranchOnCond. We have to invert the // mask here because a true condition means jumping to the exit block. auto *NotMask = Builder.createNot(ALM, DL); @@ -1151,7 +1161,8 @@ static void replaceHeaderPredicateWithIdiom( void VPlanTransforms::addActiveLaneMask( VPlan &Plan, bool UseActiveLaneMaskForControlFlow, - bool DataAndControlFlowWithoutRuntimeCheck) { + bool DataAndControlFlowWithoutRuntimeCheck, + VPInstruction *NextEVL) { assert((!DataAndControlFlowWithoutRuntimeCheck || UseActiveLaneMaskForControlFlow) && "DataAndControlFlowWithoutRuntimeCheck implies " @@ -1167,7 +1178,7 @@ void VPlanTransforms::addActiveLaneMask( VPRecipeBase *LaneMask; if (UseActiveLaneMaskForControlFlow) { LaneMask = addVPLaneMaskPhiAndUpdateExitBranch( - Plan, DataAndControlFlowWithoutRuntimeCheck); + Plan, DataAndControlFlowWithoutRuntimeCheck, NextEVL); } else { LaneMask = new VPInstruction(VPInstruction::ActiveLaneMask, {WideCanonicalIV, Plan.getTripCount()}, diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index a4bc7a23072c..7e1f65bc16ec 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -75,7 +75,8 @@ struct VPlanTransforms { /// UseActiveLaneMaskForControlFlow. static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow, - bool DataAndControlFlowWithoutRuntimeCheck); + bool DataAndControlFlowWithoutRuntimeCheck, + VPInstruction *NextEVL); /// Insert truncates and extends for any truncated recipe. Redundant casts /// will be folded later. 
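Taken together, the recipes above change the loop control from a fixed VF * UF step to an EVL-driven one. A scalar model of the control flow the generated vector loop follows, assuming for illustration that the target returns min(remaining, VLMax) from computeVectorLength (which the default implementation does; RISC-V may return less via vsetvli):

#include <algorithm>
#include <cstdint>

// Scalar model of the EVL-based vector loop: evl.phi starts at the trip count,
// the canonical IV is advanced by the returned EVL (index.next = index + evl),
// and evl.next = N - index.next feeds the next iteration, matching the
// VPEVLPHIRecipe / NextEVL definitions above.
void addVecModel(int64_t N, double *C, const double *A, const double *B,
                 int64_t VLMax) {
  int64_t Remaining = N; // evl.phi
  int64_t I = 0;         // canonical IV
  while (I < N) {
    int64_t EVL = std::min(Remaining, VLMax); // TTI::computeVectorLength
    for (int64_t L = 0; L < EVL; ++L)         // vp.load / fadd / vp.store lanes
      C[I + L] = A[I + L] + B[I + L];
    I += EVL;          // index.next
    Remaining = N - I; // evl.next
  }
}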
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index 22dbf7571dd9..a4db8b5c5d02 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -367,6 +367,7 @@ class VPDef { VPActiveLaneMaskPHISC, VPEVLBasedIVPHISC, VPFirstOrderRecurrencePHISC, + VPWidenEVLSC, VPWidenPHISC, VPWidenIntOrFpInductionSC, VPWidenPointerInductionSC, diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vp_intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vp_intrinsics.ll new file mode 100644 index 000000000000..01d20d24675d --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vp_intrinsics.ll @@ -0,0 +1,138 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S --passes=loop-vectorize -use-vp-intrinsics -prefer-predicate-over-epilogue=predicate-dont-vectorize -o - < %s | FileCheck %s + +; ModuleID = 'custom/simple.c' +source_filename = "custom/simple.c" +target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128" +target triple = "riscv64-unknown-unknown" + +; Input C code: +; void addVec(long N, double *C, double *A, double *B) { +; long I; +; for (I = 0; I < N; I++) +; C[I] = A[I] + B[I]; +; } + +; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite) vscale_range(2,1024) +define dso_local void @addVec(i64 noundef %N, ptr nocapture noundef writeonly %C, ptr nocapture noundef readonly %A, ptr nocapture noundef readonly %B) local_unnamed_addr #0 { +; CHECK-LABEL: @addVec( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[B3:%.*]] = ptrtoint ptr [[B:%.*]] to i64 +; CHECK-NEXT: [[A2:%.*]] = ptrtoint ptr [[A:%.*]] to i64 +; CHECK-NEXT: [[C1:%.*]] = ptrtoint ptr [[C:%.*]] to i64 +; CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i64 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N]] +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.umax.i64(i64 12, i64 [[TMP2]]) +; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP0]], [[TMP3]] +; CHECK-NEXT: br i1 [[TMP4]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2 +; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 8 +; CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[C1]], [[A2]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP8]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP6]], 8 +; CHECK-NEXT: [[TMP10:%.*]] = sub i64 [[C1]], [[B3]] +; CHECK-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP10]], [[TMP9]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[EVL_PHI:%.*]] = phi i64 [ [[N]], [[VECTOR_PH]] ], [ [[EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[EVL_PHI_CAST:%.*]] = trunc i64 [[EVL_PHI]] to i32 +; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[EVL_PHI_CAST]] to i64 +; CHECK-NEXT: [[VL:%.*]] = call i64 @llvm.riscv.vsetvli.i64(i64 [[TMP11]], i64 3, i64 1) +; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[VL]] to i32 +; CHECK-NEXT: [[TMP13:%.*]] = add i64 
[[INDEX]], 0 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[INDEX]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = call @llvm.experimental.stepvector.nxv2i64() +; CHECK-NEXT: [[TMP15:%.*]] = add zeroinitializer, [[TMP14]] +; CHECK-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP15]] +; CHECK-NEXT: [[TMP16:%.*]] = extractelement [[VEC_IV]], i32 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP16]], i64 [[N]]) +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds double, ptr [[TMP17]], i32 0 +; CHECK-NEXT: [[VP_LOAD:%.*]] = call @llvm.vp.load.nxv2f64.p0(ptr [[TMP18]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]), !tbaa [[TBAA4:![0-9]+]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds double, ptr [[TMP19]], i32 0 +; CHECK-NEXT: [[VP_LOAD5:%.*]] = call @llvm.vp.load.nxv2f64.p0(ptr [[TMP20]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]), !tbaa [[TBAA4]] +; CHECK-NEXT: [[TMP21:%.*]] = fadd [[VP_LOAD]], [[VP_LOAD5]] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds double, ptr [[C]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds double, ptr [[TMP22]], i32 0 +; CHECK-NEXT: call void @llvm.vp.store.nxv2f64.p0( [[TMP21]], ptr [[TMP23]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]), !tbaa [[TBAA4]] +; CHECK-NEXT: [[TMP24:%.*]] = zext i32 [[TMP12]] to i64 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP24]] +; CHECK-NEXT: [[EVL_NEXT]] = sub i64 [[N]], [[INDEX_NEXT]] +; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[I_08]] +; CHECK-NEXT: [[TMP26:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[I_08]] +; CHECK-NEXT: [[TMP27:%.*]] = load double, ptr [[ARRAYIDX1]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: [[ADD:%.*]] = fadd double [[TMP26]], [[TMP27]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[C]], i64 [[I_08]] +; CHECK-NEXT: store double [[ADD]], ptr [[ARRAYIDX2]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK: for.end.loopexit: +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + %cmp7 = icmp sgt i64 %N, 0 + br i1 %cmp7, label %for.body.preheader, label %for.end 
+ +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %I.08 = phi i64 [ %inc, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds double, ptr %A, i64 %I.08 + %0 = load double, ptr %arrayidx, align 8, !tbaa !4 + %arrayidx1 = getelementptr inbounds double, ptr %B, i64 %I.08 + %1 = load double, ptr %arrayidx1, align 8, !tbaa !4 + %add = fadd double %0, %1 + %arrayidx2 = getelementptr inbounds double, ptr %C, i64 %I.08 + store double %add, ptr %arrayidx2, align 8, !tbaa !4 + %inc = add nuw nsw i64 %I.08, 1 + %exitcond.not = icmp eq i64 %inc, %N + br i1 %exitcond.not, label %for.end.loopexit, label %for.body, !llvm.loop !8 + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +} + +attributes #0 = { nofree norecurse nosync nounwind memory(argmem: readwrite) vscale_range(2,1024) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic-rv64" "target-features"="+64bit,+a,+c,+d,+f,+m,+relax,+v,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvl128b,+zvl32b,+zvl64b,-e,-experimental-zawrs,-experimental-zca,-experimental-zcb,-experimental-zcd,-experimental-zcf,-experimental-zihintntl,-experimental-ztso,-experimental-zvfh,-h,-save-restore,-svinval,-svnapot,-svpbmt,-xtheadba,-xtheadvdot,-xventanacondops,-zba,-zbb,-zbc,-zbkb,-zbkc,-zbkx,-zbs,-zdinx,-zfh,-zfhmin,-zfinx,-zhinx,-zhinxmin,-zicbom,-zicbop,-zicboz,-zihintpause,-zk,-zkn,-zknd,-zkne,-zknh,-zkr,-zks,-zksed,-zksh,-zkt,-zmmul,-zvl1024b,-zvl16384b,-zvl2048b,-zvl256b,-zvl32768b,-zvl4096b,-zvl512b,-zvl65536b,-zvl8192b" } + +!llvm.module.flags = !{!0, !1, !2} +!llvm.ident = !{!3} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, !"target-abi", !"lp64d"} +!2 = !{i32 8, !"SmallDataLimit", i32 8} +!3 = !{!"clang version 17.0.0"} +!4 = !{!5, !5, i64 0} +!5 = !{!"double", !6, i64 0} +!6 = !{!"omnipotent char", !7, i64 0} +!7 = !{!"Simple C/C++ TBAA"} +!8 = distinct !{!8, !9} +!9 = !{!"llvm.loop.mustprogress"} diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp index 65d241feeab2..539701822bfb 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp @@ -1036,7 +1036,8 @@ TEST(VPRecipeTest, CastVPWidenMemoryInstructionRecipeToVPUserAndVPDef) { new LoadInst(Int32, UndefValue::get(Int32Ptr), "", false, Align(1)); VPValue Addr; VPValue Mask; - VPWidenMemoryInstructionRecipe Recipe(*Load, &Addr, &Mask, true, false); + VPWidenMemoryInstructionRecipe Recipe(*Load, &Addr, &Mask, nullptr, true, + false); EXPECT_TRUE(isa(&Recipe)); VPRecipeBase *BaseR = &Recipe; EXPECT_TRUE(isa(BaseR)); @@ -1131,7 +1132,8 @@ TEST(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) { new LoadInst(Int32, UndefValue::get(Int32Ptr), "", false, Align(1)); VPValue Addr; VPValue Mask; - VPWidenMemoryInstructionRecipe Recipe(*Load, &Addr, &Mask, true, false); + VPWidenMemoryInstructionRecipe Recipe(*Load, &Addr, &Mask, nullptr, true, + false); EXPECT_FALSE(Recipe.mayHaveSideEffects()); EXPECT_TRUE(Recipe.mayReadFromMemory()); EXPECT_FALSE(Recipe.mayWriteToMemory()); @@ -1145,8 +1147,8 @@ TEST(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) { VPValue Addr; VPValue Mask; VPValue StoredV; - VPWidenMemoryInstructionRecipe Recipe(*Store, &Addr, &StoredV, &Mask, false, - false); + VPWidenMemoryInstructionRecipe Recipe(*Store, &Addr, &StoredV, &Mask, + nullptr, false, false); 
EXPECT_TRUE(Recipe.mayHaveSideEffects()); EXPECT_FALSE(Recipe.mayReadFromMemory()); EXPECT_TRUE(Recipe.mayWriteToMemory()); From a2139b9a2a3428d0cee94890fc3bce6bb4e4d0f8 Mon Sep 17 00:00:00 2001 From: Lorenzo Albano Date: Thu, 23 Feb 2023 14:29:51 +0000 Subject: [PATCH 2/4] Add VectorPredication pass. This pass transforms full-length vector instructions or intrinsics calls to VP ones by recovering the (mask,evl) information from one of the memory writing VP operations and backpropagating it. --- llvm/include/llvm/IR/IntrinsicInst.h | 3 + llvm/include/llvm/IR/VectorBuilder.h | 20 +- .../Transforms/Vectorize/VectorPredication.h | 55 +++ llvm/lib/IR/IntrinsicInst.cpp | 13 + llvm/lib/IR/VectorBuilder.cpp | 23 +- llvm/lib/Passes/PassBuilder.cpp | 1 + llvm/lib/Passes/PassBuilderPipelines.cpp | 10 + llvm/lib/Passes/PassRegistry.def | 1 + llvm/lib/Transforms/Vectorize/CMakeLists.txt | 1 + .../Vectorize/VectorPredication.cpp | 358 ++++++++++++++++++ .../VectorPredication/if-elif-else.ll | 270 +++++++++++++ .../if-elif-else_not-uniform.ll | 316 ++++++++++++++++ .../VectorPredication/if-else_scalar-cond.ll | 209 ++++++++++ .../VectorPredication/if-else_vec-cond.ll | 220 +++++++++++ .../VectorPredication/simple_vector_sum.ll | 193 ++++++++++ llvm/unittests/IR/VectorBuilderTest.cpp | 34 +- 16 files changed, 1706 insertions(+), 21 deletions(-) create mode 100644 llvm/include/llvm/Transforms/Vectorize/VectorPredication.h create mode 100644 llvm/lib/Transforms/Vectorize/VectorPredication.cpp create mode 100644 llvm/test/Transforms/VectorPredication/if-elif-else.ll create mode 100644 llvm/test/Transforms/VectorPredication/if-elif-else_not-uniform.ll create mode 100644 llvm/test/Transforms/VectorPredication/if-else_scalar-cond.ll create mode 100644 llvm/test/Transforms/VectorPredication/if-else_vec-cond.ll create mode 100644 llvm/test/Transforms/VectorPredication/simple_vector_sum.ll diff --git a/llvm/include/llvm/IR/IntrinsicInst.h b/llvm/include/llvm/IR/IntrinsicInst.h index 8940bebd2c9a..560897a04052 100644 --- a/llvm/include/llvm/IR/IntrinsicInst.h +++ b/llvm/include/llvm/IR/IntrinsicInst.h @@ -564,6 +564,9 @@ class VPIntrinsic : public IntrinsicInst { /// The llvm.vp.* intrinsics for this instruction Opcode static Intrinsic::ID getForOpcode(unsigned OC); + /// The llvm.vp.* intrinsics for this intrinsic ID + static Intrinsic::ID getForIntrinsicID(Intrinsic::ID IID); + // Whether \p ID is a VP intrinsic ID. static bool isVPIntrinsic(Intrinsic::ID); diff --git a/llvm/include/llvm/IR/VectorBuilder.h b/llvm/include/llvm/IR/VectorBuilder.h index 301edaed70fe..654486f210ef 100644 --- a/llvm/include/llvm/IR/VectorBuilder.h +++ b/llvm/include/llvm/IR/VectorBuilder.h @@ -57,6 +57,10 @@ class VectorBuilder { return RetType(); } + Value *createVectorInstruction(Intrinsic::ID VPID, Type *ReturnTy, + ArrayRef VecOpArray, + const Twine &Name = Twine()); + public: VectorBuilder(IRBuilderBase &Builder, Behavior ErrorHandling = Behavior::ReportAndAbort) @@ -89,9 +93,19 @@ class VectorBuilder { // \p Opcode The functional instruction opcode of the emitted intrinsic. // \p ReturnTy The return type of the operation. // \p VecOpArray The operand list. - Value *createVectorInstruction(unsigned Opcode, Type *ReturnTy, - ArrayRef VecOpArray, - const Twine &Name = Twine()); + Value *createVectorInstructionFromOpcode(unsigned Opcode, Type *ReturnTy, + ArrayRef VecOpArray, + const Twine &Name = Twine()); + + // Emit a VP intrinsic call that mimics a regular intrinsic. 
+ // This operation behaves according to the VectorBuilderBehavior. + // \p IID The functional intrinsic ID of the emitted VP intrinsic. + // \p ReturnTy The return type of the operation. + // \p VecOpArray The operand list. + Value *createVectorInstructionFromIntrinsicID(Intrinsic::ID IID, + Type *ReturnTy, + ArrayRef VecOpArray, + const Twine &Name = Twine()); }; } // namespace llvm diff --git a/llvm/include/llvm/Transforms/Vectorize/VectorPredication.h b/llvm/include/llvm/Transforms/Vectorize/VectorPredication.h new file mode 100644 index 000000000000..ce59854dbb95 --- /dev/null +++ b/llvm/include/llvm/Transforms/Vectorize/VectorPredication.h @@ -0,0 +1,55 @@ +#ifndef LLVM_TRANSFORMS_VECTORPREDICATION_H +#define LLVM_TRANSFORMS_VECTORPREDICATION_H + +#include "llvm/ADT/MapVector.h" +#include "llvm/IR/PassManager.h" + +namespace llvm { + +using InstToMaskEVLMap = DenseMap>; + +struct BlockData { + // Vector that stores all vector predicated memory writing operations found in + // the basic block. If after phase 1 is empty, then the basic block can be + // skipped by following phases. + SmallVector MemoryWritingVPInstructions; + + // Store all instructions of the basic block (in the same order as they are + // found), assigning to each the list of users. Skip PHIs and terminators. + MapVector> TopologicalGraph; + + // Map each full-length vector operation eligible to be transformed to a + // vector predication one with the (mask,evl) pair of its first vector + // predicated memory writing operation user. + InstToMaskEVLMap VecOpsToTransform; + + // Ordered list representing the reverse order of how the basic block has to + // be transformed due to the new vector predicated instructions. + SmallVector NewBBReverseOrder; + + BlockData() = default; +}; + +class VectorPredicationPass : public PassInfoMixin { +private: + // List of instructions to be replaced by the new VP operations and that later + // should be removed, if possible. + DenseMap OldInstructionsToRemove; + + void analyseBasicBlock(BasicBlock &BB, BlockData &BBInfo); + void findCandidateVectorOperations(BasicBlock &BB, BlockData &BBInfo); + void addNewUsersToMasksAndEVLs(BasicBlock &BB, BlockData &BBInfo); + void buildNewBasicBlockSchedule(BasicBlock &BB, BlockData &BBInfo); + void emitNewBasicBlockSchedule(BasicBlock &BB, BlockData &BBInfo); + void transformCandidateVectorOperations(BasicBlock &BB, BlockData &BBInfo); + + void removeOldInstructions(); + +public: + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); + static StringRef name() { return "VectorPredicationPass"; } +}; + +} // namespace llvm + +#endif // LLVM_TRANSFORMS_VECTORPREDICATION_H diff --git a/llvm/lib/IR/IntrinsicInst.cpp b/llvm/lib/IR/IntrinsicInst.cpp index 7a3b708e7400..3aa33dfc2afd 100644 --- a/llvm/lib/IR/IntrinsicInst.cpp +++ b/llvm/lib/IR/IntrinsicInst.cpp @@ -604,6 +604,19 @@ Intrinsic::ID VPIntrinsic::getForOpcode(unsigned IROPC) { return Intrinsic::not_intrinsic; } +Intrinsic::ID VPIntrinsic::getForIntrinsicID(Intrinsic::ID IID) { + switch (IID) { + default: + break; + +#define BEGIN_REGISTER_VP_INTRINSIC(VPID, ...) 
break; +#define VP_PROPERTY_FUNCTIONAL_INTRINSIC(INTR) case Intrinsic::INTR: +#define END_REGISTER_VP_INTRINSIC(VPID) return Intrinsic::VPID; +#include "llvm/IR/VPIntrinsics.def" + } + return Intrinsic::not_intrinsic; +} + bool VPIntrinsic::canIgnoreVectorLengthParam() const { using namespace PatternMatch; diff --git a/llvm/lib/IR/VectorBuilder.cpp b/llvm/lib/IR/VectorBuilder.cpp index c07bc0561fba..c94bc5b180f5 100644 --- a/llvm/lib/IR/VectorBuilder.cpp +++ b/llvm/lib/IR/VectorBuilder.cpp @@ -51,13 +51,30 @@ Value &VectorBuilder::requestEVL() { return *ConstantInt::get(IntTy, StaticVectorLength.getFixedValue()); } -Value *VectorBuilder::createVectorInstruction(unsigned Opcode, Type *ReturnTy, - ArrayRef InstOpArray, - const Twine &Name) { +Value *VectorBuilder::createVectorInstructionFromOpcode( + unsigned Opcode, Type *ReturnTy, ArrayRef InstOpArray, + const Twine &Name) { auto VPID = VPIntrinsic::getForOpcode(Opcode); if (VPID == Intrinsic::not_intrinsic) return returnWithError("No VPIntrinsic for this opcode"); + return createVectorInstruction(VPID, ReturnTy, InstOpArray, Name); +} + +Value *VectorBuilder::createVectorInstructionFromIntrinsicID( + Intrinsic::ID IID, Type *ReturnTy, ArrayRef InstOpArray, + const Twine &Name) { + auto VPID = VPIntrinsic::getForIntrinsicID(IID); + if (VPID == Intrinsic::not_intrinsic) + return returnWithError("No VPIntrinsic for this Intrinsic"); + + return createVectorInstruction(VPID, ReturnTy, InstOpArray, Name); +} + +Value *VectorBuilder::createVectorInstruction(Intrinsic::ID VPID, + Type *ReturnTy, + ArrayRef InstOpArray, + const Twine &Name) { auto MaskPosOpt = VPIntrinsic::getMaskParamPos(VPID); auto VLenPosOpt = VPIntrinsic::getVectorLengthParamPos(VPID); size_t NumInstParams = InstOpArray.size(); diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index f94bd422c6b5..973d6cd7d17a 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -281,6 +281,7 @@ #include "llvm/Transforms/Vectorize/LoopVectorize.h" #include "llvm/Transforms/Vectorize/SLPVectorizer.h" #include "llvm/Transforms/Vectorize/VectorCombine.h" +#include "llvm/Transforms/Vectorize/VectorPredication.h" #include using namespace llvm; diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index 5c6c391049a7..82ba63b5d0ae 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -135,6 +135,7 @@ #include "llvm/Transforms/Vectorize/LoopVectorize.h" #include "llvm/Transforms/Vectorize/SLPVectorizer.h" #include "llvm/Transforms/Vectorize/VectorCombine.h" +#include "llvm/Transforms/Vectorize/VectorPredication.h" using namespace llvm; @@ -285,6 +286,11 @@ cl::opt EnableMemProfContextDisambiguation( extern cl::opt EnableInferAlignmentPass; } // namespace llvm +static cl::opt + EnableVectorPredication("enable-vector-predication", cl::init(false), + cl::Hidden, + cl::desc("Enable VectorPredicationPass.")); + PipelineTuningOptions::PipelineTuningOptions() { LoopInterleaving = true; LoopVectorization = true; @@ -1297,6 +1303,10 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level, /*AllowSpeculation=*/true), /*UseMemorySSA=*/true, /*UseBlockFrequencyInfo=*/false)); + // Try to vector predicate vectorized functions. + if (EnableVectorPredication) + FPM.addPass(VectorPredicationPass()); + // Now that we've vectorized and unrolled loops, we may have more refined // alignment information, try to re-derive it here. 
FPM.addPass(AlignmentFromAssumptionsPass()); diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 82ce040c6496..6ad9cb1c44de 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -420,6 +420,7 @@ FUNCTION_PASS("tsan", ThreadSanitizerPass()) FUNCTION_PASS("typepromotion", TypePromotionPass(TM)) FUNCTION_PASS("unify-loop-exits", UnifyLoopExitsPass()) FUNCTION_PASS("vector-combine", VectorCombinePass()) +FUNCTION_PASS("vector-predication", VectorPredicationPass()) FUNCTION_PASS("verify", VerifierPass()) FUNCTION_PASS("verify", DominatorTreeVerifierPass()) FUNCTION_PASS("verify", LoopVerifierPass()) diff --git a/llvm/lib/Transforms/Vectorize/CMakeLists.txt b/llvm/lib/Transforms/Vectorize/CMakeLists.txt index 9674094024b9..5574b33d9bc2 100644 --- a/llvm/lib/Transforms/Vectorize/CMakeLists.txt +++ b/llvm/lib/Transforms/Vectorize/CMakeLists.txt @@ -5,6 +5,7 @@ add_llvm_component_library(LLVMVectorize SLPVectorizer.cpp Vectorize.cpp VectorCombine.cpp + VectorPredication.cpp VPlan.cpp VPlanAnalysis.cpp VPlanHCFGBuilder.cpp diff --git a/llvm/lib/Transforms/Vectorize/VectorPredication.cpp b/llvm/lib/Transforms/Vectorize/VectorPredication.cpp new file mode 100644 index 000000000000..bbebcba38e91 --- /dev/null +++ b/llvm/lib/Transforms/Vectorize/VectorPredication.cpp @@ -0,0 +1,358 @@ +#include "llvm/Transforms/Vectorize/VectorPredication.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/VectorBuilder.h" +#include "llvm/Transforms/Utils/Local.h" + +#define DEBUG_TYPE "vector-predication" +STATISTIC(Transforms, "Number of full-length -> evl vector transformation."); + +using namespace llvm; + +// Map each instruction to its uses and save all memory writing vector +// predicated instructions found in the basic block. +void VectorPredicationPass::analyseBasicBlock(BasicBlock &BB, + BlockData &BBInfo) { + // Store all memory accessing instructions: all these instructions have to be + // chained, so that their relative order can be preserved when rewriting the + // basic block. 
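+  // For example, for a block that contains, in order,
+  //   %x = vp.load ..., vp.store A ..., %y = vp.load ..., vp.store B ...
+  // the chain %x -> store A -> %y -> store B is added below, so the
+  // scheduling phase can never swap the two stores or move a load across a
+  // store it originally followed.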
+  SmallVector ToBeChainedInstructions;
+
+  for (Instruction &I : BB) {
+    if (isa(I) || I.isTerminator())
+      continue;
+
+    SmallPtrSet IUsers;
+    for (User *IU : I.users()) {
+      assert(isa(IU) && "Unexpected behaviour.");
+      auto *IUInst = cast(IU);
+      if (IUInst->getParent() != I.getParent())
+        continue;
+      if (isa(IUInst) || IUInst->isTerminator())
+        continue;
+
+      IUsers.insert(IUInst);
+    }
+    BBInfo.TopologicalGraph.insert({&I, IUsers});
+
+    if (I.mayReadOrWriteMemory() || I.mayHaveSideEffects())
+      ToBeChainedInstructions.push_back(&I);
+
+    if (auto *CI = dyn_cast(&I)) {
+      if (auto *CF = CI->getCalledFunction()) {
+        Intrinsic::ID ID = CF->getIntrinsicID();
+        if (ID == Intrinsic::vp_store || ID == Intrinsic::vp_scatter) {
+          BBInfo.MemoryWritingVPInstructions.push_back(&I);
+        }
+      }
+    }
+  }
+
+  if (ToBeChainedInstructions.size() > 1) {
+    for (unsigned I = 0; I < ToBeChainedInstructions.size() - 1; I++) {
+      auto *Parent = ToBeChainedInstructions[I];
+      auto *Child = ToBeChainedInstructions[I + 1];
+      BBInfo.TopologicalGraph[Parent].insert(Child);
+    }
+  }
+}
+
+namespace {
+void findCandidateVectorOperation(BasicBlock &BB, Value *Op, Value *Mask,
+                                  Value *EVL, BlockData &BBInfo) {
+  auto *OpInst = dyn_cast(Op);
+  if (!OpInst)
+    return;
+
+  if (OpInst->getParent() != &BB)
+    return;
+
+  Intrinsic::ID VPID = Intrinsic::not_intrinsic;
+  unsigned Opcode = OpInst->getOpcode();
+  if (Opcode == Instruction::Call) {
+    if (auto *CF = cast(OpInst)->getCalledFunction())
+      VPID = VPIntrinsic::getForIntrinsicID(CF->getIntrinsicID());
+  } else
+    VPID = VPIntrinsic::getForOpcode(OpInst->getOpcode());
+  if (VPID == Intrinsic::not_intrinsic)
+    return;
+
+  // If the instruction is already present in the map, it means it was already
+  // visited starting from a previous memory-writing VP operation.
+  if (!BBInfo.VecOpsToTransform
+           .insert(std::make_pair(OpInst, std::make_pair(Mask, EVL)))
+           .second) {
+    // We need to check whether the new mask and EVL values differ from the
+    // old ones:
+    // - if they are the same, then there is nothing to do;
+    // - if only the mask differs, we use an all-ones mask;
+    // - otherwise, we remove the instruction from the map (i.e., no
+    //   transformation should happen).
+    // NOTE: maybe, instead of giving up, we could split case 3 into two
+    // more cases: if only the EVL differs, we use VLMAX with the mask; if
+    // both mask and EVL differ, we use an all-ones mask and VLMAX (even if
+    // semantically it means not doing anything).
+    auto It = BBInfo.VecOpsToTransform.find(OpInst);
+    assert(It != BBInfo.VecOpsToTransform.end());
+    Value *OldMask, *OldEVL;
+    std::tie(OldMask, OldEVL) = It->second;
+
+    if (Mask == OldMask && EVL == OldEVL)
+      return;
+
+    BBInfo.VecOpsToTransform.erase(OpInst);
+    if (EVL == OldEVL) {
+      BBInfo.VecOpsToTransform.insert(
+          std::make_pair(OpInst, std::make_pair(nullptr, EVL)));
+    }
+  }
+
+  // Recursively visit OpInst operands.
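+  // (Illustration of the merge rule above: if %v = fadd %a, %b is reached
+  // from two vp.stores that share the same EVL but carry different masks,
+  // %v stays a candidate with a null mask, i.e. it will be emitted with an
+  // all-ones mask; if the EVLs differ as well, %v is erased and left as a
+  // full-length operation.)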
+ switch (VPID) { + default: + for (auto *OpVal : OpInst->operand_values()) + findCandidateVectorOperation(BB, OpVal, Mask, EVL, BBInfo); + break; + case Intrinsic::vp_select: { + auto CanBackPropagateCondOpAsMask = [&](Value *CondOp) -> bool { + if (!CondOp->getType()->isVectorTy()) + return false; + + auto *CondInstr = dyn_cast(CondOp); + if (!CondInstr) + return false; + if (CondInstr->getParent() != &BB) + return false; + if (auto *ALM = dyn_cast(CondInstr); + ALM && ALM->getCalledFunction()->getIntrinsicID() == + Intrinsic::get_active_lane_mask) + return false; + + return true; + }; + + Value *Cond = OpInst->getOperand(0); + Value *TrueOp = OpInst->getOperand(1); + Value *FalseOp = OpInst->getOperand(2); + // If the condition argument is a vector, we backpropagate it as mask + // for the true branch and its negation as mask for the false one. + if (CanBackPropagateCondOpAsMask(Cond)) { + auto *CondInstr = cast(Cond); + IRBuilder<> Builder(CondInstr); + auto *CondNot = cast(Builder.CreateNot(Cond)); + SmallPtrSet CondNotUsers; + BBInfo.TopologicalGraph.insert({CondNot, CondNotUsers}); + BBInfo.TopologicalGraph[CondInstr].insert(CondNot); + + findCandidateVectorOperation(BB, Cond, nullptr, EVL, BBInfo); + findCandidateVectorOperation(BB, CondNot, nullptr, EVL, BBInfo); + + findCandidateVectorOperation(BB, TrueOp, Cond, EVL, BBInfo); + findCandidateVectorOperation(BB, FalseOp, CondNot, EVL, BBInfo); + } else { + findCandidateVectorOperation(BB, TrueOp, nullptr, EVL, BBInfo); + findCandidateVectorOperation(BB, FalseOp, nullptr, EVL, BBInfo); + } + break; + } + } +} +} // namespace + +// For each vector predicated memory writing operation of the basic block, go +// back to the stored vector defining instruction and verify it is a vector +// operation. Add it to the list of instructions to be transformed into vector +// predicated ones, then recursively repeat the process for its vector +// arguments. +void VectorPredicationPass::findCandidateVectorOperations(BasicBlock &BB, + BlockData &BBInfo) { + if (BBInfo.MemoryWritingVPInstructions.empty()) + return; + + for (Instruction *I : BBInfo.MemoryWritingVPInstructions) { + assert(I->getParent() == &BB && "This is not the right basic block"); + auto *VPI = cast(I); + Value *StoredOperand = VPI->getMemoryDataParam(); + Value *MaskOperand = VPI->getMaskParam(); + Value *EVLOperand = VPI->getVectorLengthParam(); + // First, visit the mask operand (assigning an allones mask to this branch) + // and only then visit the stored operand. + findCandidateVectorOperation(BB, MaskOperand, nullptr, EVLOperand, BBInfo); + findCandidateVectorOperation(BB, StoredOperand, MaskOperand, EVLOperand, + BBInfo); + } +} + +// Add the candidates as users of the mask and of the evl linked to each of +// them, but only if they belong to the same basic block. +void VectorPredicationPass::addNewUsersToMasksAndEVLs(BasicBlock &BB, + BlockData &BBInfo) { + if (BBInfo.VecOpsToTransform.empty()) + return; + + for (auto [K, V] : BBInfo.VecOpsToTransform) { + if (auto *MaskInst = dyn_cast_if_present(V.first); + MaskInst && MaskInst->getParent() == &BB) + BBInfo.TopologicalGraph[MaskInst].insert(K); + if (auto *EVLInst = dyn_cast(V.second); + EVLInst && EVLInst->getParent() == &BB) + BBInfo.TopologicalGraph[EVLInst].insert(K); + } +} + +// Topologically sort, preserving as much as possible the original order. 
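+// The graph maps each instruction to the set of in-block instructions that
+// must come after it (its users plus the ordering edges added in
+// analyseBasicBlock). Repeatedly extracting an instruction whose set is
+// empty builds the block bottom-up into NewBBReverseOrder; iterating the
+// MapVector in reverse prefers the latest original instruction among the
+// ready ones, which preserves the original order whenever there is a choice.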
+void VectorPredicationPass::buildNewBasicBlockSchedule(BasicBlock &BB, + BlockData &BBInfo) { + if (BBInfo.VecOpsToTransform.empty()) + return; + + while (!BBInfo.TopologicalGraph.empty()) { + Instruction *Inst = nullptr; + for (auto B = BBInfo.TopologicalGraph.rbegin(), + E = BBInfo.TopologicalGraph.rend(); + B != E; B++) { + if (B->second.empty()) { + Inst = B->first; + break; + } + } + assert(Inst && "Failed to empty topological graph!"); + + BBInfo.NewBBReverseOrder.push_back(Inst); + BBInfo.TopologicalGraph.erase(Inst); + + for (auto B = BBInfo.TopologicalGraph.begin(), + E = BBInfo.TopologicalGraph.end(); + B != E; B++) { + B->second.erase(Inst); + } + } +} + +// Modify the basic block based on the topological order generated. +void VectorPredicationPass::emitNewBasicBlockSchedule(BasicBlock &BB, + BlockData &BBInfo) { + if (BBInfo.VecOpsToTransform.empty()) + return; + + Instruction *InsertPoint = BB.getTerminator(); + for (Instruction *I : BBInfo.NewBBReverseOrder) { + I->moveBefore(InsertPoint); + InsertPoint = I; + } +} + +// Transform candidates to vector predicated instructions. +void VectorPredicationPass::transformCandidateVectorOperations( + BasicBlock &BB, BlockData &BBInfo) { + if (BBInfo.VecOpsToTransform.empty()) + return; + + for (auto [I, P] : BBInfo.VecOpsToTransform) { + Value *Mask, *EVL; + std::tie(Mask, EVL) = P; + + IRBuilder<> Builder(I); + unsigned int OpcodeOrIID = I->getOpcode(); + Type *RetTy = I->getType(); + SmallVector Operands(I->value_op_begin(), I->value_op_end()); + bool IsCall = false; + switch (OpcodeOrIID) { + case Instruction::Call: { + Operands.clear(); + auto *CI = cast(I); + for (auto &Op : CI->operands()) { + if (Op == CI->getCalledOperand()) + continue; + Operands.push_back(Op.get()); + } + OpcodeOrIID = CI->getCalledFunction()->getIntrinsicID(); + IsCall = true; + break; + } + case Instruction::FCmp: + case Instruction::ICmp: { + Operands.clear(); + auto *CmpI = cast(I); + Value *PredOp = MetadataAsValue::get( + Builder.getContext(), + MDString::get(Builder.getContext(), + CmpInst::getPredicateName(CmpI->getPredicate()))); + Operands = {CmpI->getOperand(0), CmpI->getOperand(1), PredOp}; + break; + } + case Instruction::Select: { + if (!I->getOperand(0)->getType()->isVectorTy()) { + Operands.clear(); + Value *Op1 = I->getOperand(1); + Value *Op2 = I->getOperand(2); + Value *Cond = Builder.CreateVectorSplat( + cast(Op1->getType())->getElementCount(), + I->getOperand(0), "select.cond.splat"); + Operands = {Cond, Op1, Op2}; + } else if (auto *ALM = dyn_cast(I->getOperand(0)); + ALM && ALM->getCalledFunction()->getIntrinsicID() == + Intrinsic::get_active_lane_mask) { + // Ignore the select: the vector length operand already takes care of + // keeping track of the active elements. + I->replaceAllUsesWith(I->getOperand(1)); + OldInstructionsToRemove.insert(std::make_pair(I, nullptr)); + + continue; + } + break; + } + default: + break; + } + + if (!Mask) + // nullptr means unmasked operation, hence we use an all-ones mask. 
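+      // RetTy is the candidate's vector result type, so getWithNewType
+      // produces the matching i1 vector type with the same element count and
+      // getTrue splats i1 true across it.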
+ Mask = ConstantInt::getTrue(RetTy->getWithNewType(Builder.getInt1Ty())); + + VectorBuilder VecBuilder(Builder); + VecBuilder.setMask(Mask).setEVL(EVL); + Value *NewVPOp = nullptr; + if (IsCall) + NewVPOp = VecBuilder.createVectorInstructionFromIntrinsicID( + OpcodeOrIID, RetTy, Operands, "vp.op"); + else + NewVPOp = VecBuilder.createVectorInstructionFromOpcode(OpcodeOrIID, RetTy, + Operands, "vp.op"); + + Transforms++; // Stats + OldInstructionsToRemove.insert(std::make_pair(I, NewVPOp)); + } +} + +// Remove old instructions, if possible. +void VectorPredicationPass::removeOldInstructions() { + for (auto [I, NewVPOp] : OldInstructionsToRemove) { + if (NewVPOp) + I->replaceAllUsesWith(NewVPOp); + if (isInstructionTriviallyDead(I)) + I->eraseFromParent(); + } +} + +PreservedAnalyses VectorPredicationPass::run(Function &F, + FunctionAnalysisManager &AM) { + assert(OldInstructionsToRemove.empty() && + "Map should be cleared at the end of each run of the pass."); + + for (BasicBlock &BB : F) { + BlockData BBInfo; + + analyseBasicBlock(BB, BBInfo); + findCandidateVectorOperations(BB, BBInfo); + addNewUsersToMasksAndEVLs(BB, BBInfo); + buildNewBasicBlockSchedule(BB, BBInfo); + emitNewBasicBlockSchedule(BB, BBInfo); + transformCandidateVectorOperations(BB, BBInfo); + } + + removeOldInstructions(); + OldInstructionsToRemove.clear(); + + // TODO: think about which analysis are preserved. + return PreservedAnalyses::none(); +} diff --git a/llvm/test/Transforms/VectorPredication/if-elif-else.ll b/llvm/test/Transforms/VectorPredication/if-elif-else.ll new file mode 100644 index 000000000000..8241f17102c4 --- /dev/null +++ b/llvm/test/Transforms/VectorPredication/if-elif-else.ll @@ -0,0 +1,270 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S --passes=vector-predication -o - < %s | FileCheck %s + +; ModuleID = 'custom/if-elif-else.c' +source_filename = "custom/if-elif-else.c" +target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128" +target triple = "riscv64-unknown-unknown" + +; Input C code: +; void addVec(long N, double *C, double *A, double *B) { +; long I; +; for (I = 0; I < N; I++) { +; if (N < 50) +; C[I] = A[I] + B[I]; +; else if (N > 75) +; C[I] = A[I] * B[I]; +; else +; C[I] = 2 * A[I]; +; } +; } + +; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite) vscale_range(2,1024) +define dso_local void @addVec(i64 noundef %N, ptr nocapture noundef writeonly %C, ptr nocapture noundef readonly %A, ptr nocapture noundef readonly %B) local_unnamed_addr #0 { +; CHECK-LABEL: @addVec( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP30:%.*]] = icmp sgt i64 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP30]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]] +; CHECK: for.body.lr.ph: +; CHECK-NEXT: [[CMP1:%.*]] = icmp ult i64 [[N]], 50 +; CHECK-NEXT: [[CMP4:%.*]] = icmp ugt i64 [[N]], 75 +; CHECK-NEXT: [[TMP0:%.*]] = xor i64 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 10) +; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt i64 [[TMP2]], [[TMP0]] +; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[TMP4:%.*]] = shl i64 [[N]], 3 +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[C:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[UGLYGEP32:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP4]] +; CHECK-NEXT: 
[[UGLYGEP33:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt ptr [[UGLYGEP32]], [[C]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt ptr [[UGLYGEP]], [[A]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND034:%.*]] = icmp ugt ptr [[UGLYGEP33]], [[C]] +; CHECK-NEXT: [[BOUND135:%.*]] = icmp ugt ptr [[UGLYGEP]], [[B]] +; CHECK-NEXT: [[FOUND_CONFLICT36:%.*]] = and i1 [[BOUND034]], [[BOUND135]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT36]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT37:%.*]] = insertelement poison, i1 [[CMP1]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT38:%.*]] = shufflevector [[BROADCAST_SPLATINSERT37]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT39:%.*]] = insertelement poison, i1 [[CMP4]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT40:%.*]] = shufflevector [[BROADCAST_SPLATINSERT39]], poison, zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = xor [[BROADCAST_SPLAT38]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP6:%.*]] = select [[TMP5]], [[BROADCAST_SPLAT40]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = select [[BROADCAST_SPLAT38]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[BROADCAST_SPLAT40]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[EVL_PHI:%.*]] = phi i64 [ [[N]], [[VECTOR_PH]] ], [ [[EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP8:%.*]] = and i64 [[EVL_PHI]], 4294967295 +; CHECK-NEXT: [[VL:%.*]] = call i64 @llvm.riscv.vsetvli.i64(i64 [[TMP8]], i64 3, i64 0) +; CHECK-NEXT: [[TMP9:%.*]] = trunc i64 [[VL]] to i32 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[VP_LOAD:%.*]] = call @llvm.vp.load.nxv1f64.p0(ptr [[TMP10]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP9]]), !tbaa [[TBAA4:![0-9]+]], !alias.scope !8 +; CHECK-NEXT: [[VP_OP:%.*]] = call @llvm.vp.fmul.nxv1f64( [[VP_LOAD]], shufflevector ( insertelement ( poison, double 2.000000e+00, i64 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP9]]) +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr double, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[VP_LOAD41:%.*]] = call @llvm.vp.load.nxv1f64.p0(ptr [[TMP11]], [[TMP6]], i32 [[TMP9]]), !tbaa [[TBAA4]], !alias.scope !11 +; CHECK-NEXT: [[VP_OP2:%.*]] = call @llvm.vp.fmul.nxv1f64( [[VP_LOAD]], [[VP_LOAD41]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP9]]) +; CHECK-NEXT: [[VP_LOAD42:%.*]] = call @llvm.vp.load.nxv1f64.p0(ptr [[TMP11]], [[BROADCAST_SPLAT38]], i32 [[TMP9]]), !tbaa [[TBAA4]], !alias.scope !11 +; CHECK-NEXT: [[VP_OP1:%.*]] = call @llvm.vp.fadd.nxv1f64( [[VP_LOAD]], [[VP_LOAD42]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP9]]) +; CHECK-NEXT: [[VP_OP4:%.*]] = call @llvm.vp.select.nxv1f64( [[TMP7]], [[VP_OP1]], [[VP_OP]], i32 [[TMP9]]) +; CHECK-NEXT: [[VP_OP3:%.*]] = call @llvm.vp.select.nxv1f64( [[TMP6]], [[VP_OP2]], [[VP_OP4]], i32 [[TMP9]]) +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, ptr [[C]], i64 
[[INDEX]] +; CHECK-NEXT: call void @llvm.vp.store.nxv1f64.p0( [[VP_OP3]], ptr [[TMP12]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP9]]), !tbaa [[TBAA4]], !alias.scope !13, !noalias !15 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[VL]] +; CHECK-NEXT: [[EVL_NEXT]] = sub i64 [[N]], [[INDEX_NEXT]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[TMP13]], label [[FOR_END_LOOPEXIT44:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK: for.body: +; CHECK-NEXT: [[I_031:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[I_031]] +; CHECK-NEXT: [[TMP14:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[I_031]] +; CHECK-NEXT: [[TMP15:%.*]] = load double, ptr [[ARRAYIDX2]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: [[ADD:%.*]] = fadd double [[TMP14]], [[TMP15]] +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: if.else: +; CHECK-NEXT: br i1 [[CMP4]], label [[IF_THEN5:%.*]], label [[IF_ELSE9:%.*]] +; CHECK: if.then5: +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[I_031]] +; CHECK-NEXT: [[TMP16:%.*]] = load double, ptr [[ARRAYIDX7]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: [[MUL:%.*]] = fmul double [[TMP14]], [[TMP16]] +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: if.else9: +; CHECK-NEXT: [[MUL11:%.*]] = fmul double [[TMP14]], 2.000000e+00 +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: for.inc: +; CHECK-NEXT: [[ADD_SINK:%.*]] = phi double [ [[ADD]], [[IF_THEN]] ], [ [[MUL11]], [[IF_ELSE9]] ], [ [[MUL]], [[IF_THEN5]] ] +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[C]], i64 [[I_031]] +; CHECK-NEXT: store double [[ADD_SINK]], ptr [[ARRAYIDX3]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_031]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK: for.end.loopexit: +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end.loopexit44: +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + %cmp30 = icmp sgt i64 %N, 0 + br i1 %cmp30, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + %cmp1 = icmp ult i64 %N, 50 + %cmp4 = icmp ugt i64 %N, 75 + %0 = xor i64 %N, -1 + %1 = call i64 @llvm.vscale.i64() + %2 = call i64 @llvm.umax.i64(i64 %1, i64 10) + %3 = icmp ugt i64 %2, %0 + br i1 %3, label %for.body.preheader, label %vector.memcheck + +for.body.preheader: ; preds = %vector.memcheck, %for.body.lr.ph + br label %for.body + +vector.memcheck: ; preds = %for.body.lr.ph + %4 = shl i64 %N, 3 + %uglygep = getelementptr i8, ptr %C, i64 %4 + %uglygep32 = getelementptr i8, ptr %A, i64 %4 + %uglygep33 = getelementptr i8, ptr %B, i64 %4 + %bound0 = icmp ugt ptr %uglygep32, %C + %bound1 = icmp ugt ptr %uglygep, %A + %found.conflict = and i1 %bound0, %bound1 + %bound034 = icmp ugt ptr %uglygep33, %C + %bound135 = icmp ugt ptr %uglygep, %B + %found.conflict36 = and i1 %bound034, %bound135 + %conflict.rdx = or i1 %found.conflict, %found.conflict36 + br i1 %conflict.rdx, label %for.body.preheader, label %vector.ph + +vector.ph: ; preds = 
%vector.memcheck + %broadcast.splatinsert37 = insertelement poison, i1 %cmp1, i64 0 + %broadcast.splat38 = shufflevector %broadcast.splatinsert37, poison, zeroinitializer + %broadcast.splatinsert39 = insertelement poison, i1 %cmp4, i64 0 + %broadcast.splat40 = shufflevector %broadcast.splatinsert39, poison, zeroinitializer + %5 = xor %broadcast.splat38, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) + %6 = select %5, %broadcast.splat40, zeroinitializer + %7 = select %broadcast.splat38, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), %broadcast.splat40 + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %evl.phi = phi i64 [ %N, %vector.ph ], [ %evl.next, %vector.body ] + %8 = and i64 %evl.phi, 4294967295 + %vl = call i64 @llvm.riscv.vsetvli.i64(i64 %8, i64 3, i64 0) + %9 = trunc i64 %vl to i32 + %10 = getelementptr inbounds double, ptr %A, i64 %index + %vp.load = call @llvm.vp.load.nxv1f64.p0(ptr %10, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 %9), !tbaa !4, !alias.scope !8 + %11 = fmul %vp.load, shufflevector ( insertelement ( poison, double 2.000000e+00, i64 0), poison, zeroinitializer) + %12 = getelementptr double, ptr %B, i64 %index + %vp.load41 = call @llvm.vp.load.nxv1f64.p0(ptr %12, %6, i32 %9), !tbaa !4, !alias.scope !11 + %13 = fmul %vp.load, %vp.load41 + %vp.load42 = call @llvm.vp.load.nxv1f64.p0(ptr %12, %broadcast.splat38, i32 %9), !tbaa !4, !alias.scope !11 + %14 = fadd %vp.load, %vp.load42 + %predphi = select %7, %14, %11 + %predphi43 = select %6, %13, %predphi + %15 = getelementptr inbounds double, ptr %C, i64 %index + call void @llvm.vp.store.nxv1f64.p0( %predphi43, ptr %15, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 %9), !tbaa !4, !alias.scope !13, !noalias !15 + %index.next = add i64 %index, %vl + %evl.next = sub i64 %N, %index.next + %16 = icmp eq i64 %index.next, %N + br i1 %16, label %for.end.loopexit44, label %vector.body, !llvm.loop !16 + +for.body: ; preds = %for.body.preheader, %for.inc + %I.031 = phi i64 [ %inc, %for.inc ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds double, ptr %A, i64 %I.031 + %17 = load double, ptr %arrayidx, align 8, !tbaa !4 + br i1 %cmp1, label %if.then, label %if.else + +if.then: ; preds = %for.body + %arrayidx2 = getelementptr inbounds double, ptr %B, i64 %I.031 + %18 = load double, ptr %arrayidx2, align 8, !tbaa !4 + %add = fadd double %17, %18 + br label %for.inc + +if.else: ; preds = %for.body + br i1 %cmp4, label %if.then5, label %if.else9 + +if.then5: ; preds = %if.else + %arrayidx7 = getelementptr inbounds double, ptr %B, i64 %I.031 + %19 = load double, ptr %arrayidx7, align 8, !tbaa !4 + %mul = fmul double %17, %19 + br label %for.inc + +if.else9: ; preds = %if.else + %mul11 = fmul double %17, 2.000000e+00 + br label %for.inc + +for.inc: ; preds = %if.then, %if.else9, %if.then5 + %add.sink = phi double [ %add, %if.then ], [ %mul11, %if.else9 ], [ %mul, %if.then5 ] + %arrayidx3 = getelementptr inbounds double, ptr %C, i64 %I.031 + store double %add.sink, ptr %arrayidx3, align 8, !tbaa !4 + %inc = add nuw nsw i64 %I.031, 1 + %exitcond.not = icmp eq i64 %inc, %N + br i1 %exitcond.not, label %for.end.loopexit, label %for.body, !llvm.loop !20 + +for.end.loopexit: ; preds = %for.inc + br label %for.end + +for.end.loopexit44: ; preds = %vector.body + br label %for.end + 
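+; Note on the input above: the vectorized loop already carries its EVL in
+; %evl.phi / %vl (computed through llvm.riscv.vsetvli) and applies it to the
+; vp.load / vp.store calls; the pass only propagates that same EVL, together
+; with each store's mask, onto the full-length fadd, fmul and select
+; operations in between, which the CHECK lines above show rewritten to
+; vp.fadd, vp.fmul and vp.select.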
+for.end: ; preds = %for.end.loopexit44, %for.end.loopexit, %entry + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) +declare i64 @llvm.vscale.i64() #1 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i64 @llvm.umax.i64(i64, i64) #2 + +; Function Attrs: nounwind memory(none) +declare i64 @llvm.riscv.vsetvli.i64(i64, i64 immarg, i64 immarg) #3 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) +declare @llvm.experimental.stepvector.nxv1i64() #1 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: read) +declare @llvm.vp.load.nxv1f64.p0(ptr nocapture, , i32) #4 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: write) +declare void @llvm.vp.store.nxv1f64.p0(, ptr nocapture, , i32) #5 + +attributes #0 = { nofree norecurse nosync nounwind memory(argmem: readwrite) vscale_range(2,1024) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic-rv64" "target-features"="+64bit,+a,+c,+d,+f,+m,+relax,+v,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvl128b,+zvl32b,+zvl64b,-e,-experimental-zawrs,-experimental-zca,-experimental-zcb,-experimental-zcd,-experimental-zcf,-experimental-zihintntl,-experimental-ztso,-experimental-zvfh,-h,-save-restore,-svinval,-svnapot,-svpbmt,-xtheadba,-xtheadvdot,-xventanacondops,-zba,-zbb,-zbc,-zbkb,-zbkc,-zbkx,-zbs,-zdinx,-zfh,-zfhmin,-zfinx,-zhinx,-zhinxmin,-zicbom,-zicbop,-zicboz,-zihintpause,-zk,-zkn,-zknd,-zkne,-zknh,-zkr,-zks,-zksed,-zksh,-zkt,-zmmul,-zvl1024b,-zvl16384b,-zvl2048b,-zvl256b,-zvl32768b,-zvl4096b,-zvl512b,-zvl65536b,-zvl8192b" } +attributes #1 = { nocallback nofree nosync nounwind willreturn memory(none) } +attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #3 = { nounwind memory(none) } +attributes #4 = { nocallback nofree nosync nounwind willreturn memory(argmem: read) } +attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: write) } + +!llvm.module.flags = !{!0, !1, !2} +!llvm.ident = !{!3} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, !"target-abi", !"lp64d"} +!2 = !{i32 8, !"SmallDataLimit", i32 8} +!3 = !{!"clang version 17.0.0"} +!4 = !{!5, !5, i64 0} +!5 = !{!"double", !6, i64 0} +!6 = !{!"omnipotent char", !7, i64 0} +!7 = !{!"Simple C/C++ TBAA"} +!8 = !{!9} +!9 = distinct !{!9, !10} +!10 = distinct !{!10, !"LVerDomain"} +!11 = !{!12} +!12 = distinct !{!12, !10} +!13 = !{!14} +!14 = distinct !{!14, !10} +!15 = !{!9, !12} +!16 = distinct !{!16, !17, !18, !19} +!17 = !{!"llvm.loop.mustprogress"} +!18 = !{!"llvm.loop.isvectorized", i32 1} +!19 = !{!"llvm.loop.unroll.runtime.disable"} +!20 = distinct !{!20, !17, !18} diff --git a/llvm/test/Transforms/VectorPredication/if-elif-else_not-uniform.ll b/llvm/test/Transforms/VectorPredication/if-elif-else_not-uniform.ll new file mode 100644 index 000000000000..071c42c5ed6b --- /dev/null +++ b/llvm/test/Transforms/VectorPredication/if-elif-else_not-uniform.ll @@ -0,0 +1,316 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 +; RUN: opt -S --passes=vector-predication -o - < %s | FileCheck %s + +; ModuleID = 'if-elif-else_not-uniform.c' +source_filename = "if-elif-else_not-uniform.c" +target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128" +target triple = "riscv64-unknown-linux-gnu" + +; Input C code: +; void addVec(long N, double *C, double *A, double *B, double *K) { +; 
long I; +; for (I = 0; I < N; I++) { +; if (K[I] < 50) +; C[I] = A[I] + B[I]; +; else if (K[I] > 75) +; C[I] = A[I] * B[I]; +; else +; C[I] = 2 * A[I]; +; } +; } + +; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite) uwtable vscale_range(2,1024) +define dso_local void @addVec(i64 noundef %N, ptr nocapture noundef writeonly %C, ptr nocapture noundef readonly %A, ptr nocapture noundef readonly %B, ptr nocapture noundef readonly %K) local_unnamed_addr #0 { +; CHECK-LABEL: define dso_local void @addVec +; CHECK-SAME: (i64 noundef [[N:%.*]], ptr nocapture noundef writeonly [[C:%.*]], ptr nocapture noundef readonly [[A:%.*]], ptr nocapture noundef readonly [[B:%.*]], ptr nocapture noundef readonly [[K:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP33:%.*]] = icmp sgt i64 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP33]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[TMP0:%.*]] = xor i64 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP2]], i64 12) +; CHECK-NEXT: [[TMP4:%.*]] = icmp ugt i64 [[TMP3]], [[TMP0]] +; CHECK-NEXT: br i1 [[TMP4]], label [[FOR_BODY_PREHEADER50:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: for.body.preheader50: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[N]], 3 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[C]], i64 [[TMP5]] +; CHECK-NEXT: [[SCEVGEP35:%.*]] = getelementptr i8, ptr [[K]], i64 [[TMP5]] +; CHECK-NEXT: [[SCEVGEP36:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP5]] +; CHECK-NEXT: [[SCEVGEP37:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP5]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt ptr [[SCEVGEP35]], [[C]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt ptr [[SCEVGEP]], [[K]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND038:%.*]] = icmp ugt ptr [[SCEVGEP36]], [[C]] +; CHECK-NEXT: [[BOUND139:%.*]] = icmp ugt ptr [[SCEVGEP]], [[A]] +; CHECK-NEXT: [[FOUND_CONFLICT40:%.*]] = and i1 [[BOUND038]], [[BOUND139]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT40]] +; CHECK-NEXT: [[BOUND041:%.*]] = icmp ugt ptr [[SCEVGEP37]], [[C]] +; CHECK-NEXT: [[BOUND142:%.*]] = icmp ugt ptr [[SCEVGEP]], [[B]] +; CHECK-NEXT: [[FOUND_CONFLICT43:%.*]] = and i1 [[BOUND041]], [[BOUND142]] +; CHECK-NEXT: [[CONFLICT_RDX44:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT43]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX44]], label [[FOR_BODY_PREHEADER50]], label [[VECTOR_BODY_PREHEADER:%.*]] +; CHECK: vector.body.preheader: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[VECTOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[EVL_PHI:%.*]] = phi i64 [ [[EVL_NEXT:%.*]], [[VECTOR_BODY]] ], [ [[N]], [[VECTOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[TMP6:%.*]] = and i64 [[EVL_PHI]], 4294967295 +; CHECK-NEXT: [[VL:%.*]] = call i64 @llvm.riscv.vsetvli.i64(i64 [[TMP6]], i64 3, i64 1) +; CHECK-NEXT: [[TMP7:%.*]] = trunc i64 [[VL]] to i32 +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[N]]) +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds double, ptr [[K]], i64 [[INDEX]] +; CHECK-NEXT: [[VP_LOAD:%.*]] = call @llvm.vp.load.nxv2f64.p0(ptr [[TMP8]], shufflevector ( 
insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP7]]), !tbaa [[TBAA7:![0-9]+]], !alias.scope !11 +; CHECK-NEXT: [[VP_OP7:%.*]] = call @llvm.vp.fcmp.nxv2f64( [[VP_LOAD]], shufflevector ( insertelement ( poison, double 5.000000e+01, i64 0), poison, zeroinitializer), metadata !"olt", shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP7]]) +; CHECK-NEXT: [[VP_OP3:%.*]] = call @llvm.vp.xor.nxv2i1( [[VP_OP7]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP7]]) +; CHECK-NEXT: [[TMP9:%.*]] = select [[ACTIVE_LANE_MASK]], [[VP_OP3]], zeroinitializer +; CHECK-NEXT: [[VP_OP16:%.*]] = call @llvm.vp.fcmp.nxv2f64( [[VP_LOAD]], shufflevector ( insertelement ( poison, double 7.500000e+01, i64 0), poison, zeroinitializer), metadata !"ogt", [[TMP9]], i32 [[TMP7]]) +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr double, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[VP_LOAD45:%.*]] = call @llvm.vp.load.nxv2f64.p0(ptr [[TMP10]], [[VP_OP3]], i32 [[TMP7]]), !tbaa [[TBAA7]], !alias.scope !14 +; CHECK-NEXT: [[VP_OP8:%.*]] = call @llvm.vp.xor.nxv2i1( [[VP_OP16]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP9]], i32 [[TMP7]]) +; CHECK-NEXT: [[VP_OP12:%.*]] = call @llvm.vp.select.nxv2i1( [[VP_OP3]], [[VP_OP8]], zeroinitializer, i32 [[TMP7]]) +; CHECK-NEXT: [[VP_OP14:%.*]] = call @llvm.vp.fmul.nxv2f64( [[VP_LOAD45]], shufflevector ( insertelement ( poison, double 2.000000e+00, i64 0), poison, zeroinitializer), [[VP_OP12]], i32 [[TMP7]]) +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr double, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[VP_OP5:%.*]] = call @llvm.vp.select.nxv2i1( [[VP_OP3]], [[VP_OP16]], zeroinitializer, i32 [[TMP7]]) +; CHECK-NEXT: [[VP_LOAD46:%.*]] = call @llvm.vp.load.nxv2f64.p0(ptr [[TMP11]], [[VP_OP5]], i32 [[TMP7]]), !tbaa [[TBAA7]], !alias.scope !16 +; CHECK-NEXT: [[VP_OP11:%.*]] = call @llvm.vp.fmul.nxv2f64( [[VP_LOAD45]], [[VP_LOAD46]], [[VP_OP5]], i32 [[TMP7]]) +; CHECK-NEXT: [[VP_LOAD47:%.*]] = call @llvm.vp.load.nxv2f64.p0(ptr [[TMP10]], [[VP_OP7]], i32 [[TMP7]]), !tbaa [[TBAA7]], !alias.scope !14 +; CHECK-NEXT: [[VP_LOAD48:%.*]] = call @llvm.vp.load.nxv2f64.p0(ptr [[TMP11]], [[VP_OP7]], i32 [[TMP7]]), !tbaa [[TBAA7]], !alias.scope !16 +; CHECK-NEXT: [[VP_OP4:%.*]] = call @llvm.vp.xor.nxv2i1( [[VP_OP12]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP7]]) +; CHECK-NEXT: [[VP_OP6:%.*]] = call @llvm.vp.fadd.nxv2f64( [[VP_LOAD47]], [[VP_LOAD48]], [[VP_OP4]], i32 [[TMP7]]) +; CHECK-NEXT: [[VP_OP13:%.*]] = call @llvm.vp.xor.nxv2i1( [[VP_OP5]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP7]]) +; CHECK-NEXT: [[VP_OP15:%.*]] = call @llvm.vp.select.nxv2f64( [[VP_OP12]], [[VP_OP14]], [[VP_OP6]], i32 [[TMP7]]) +; CHECK-NEXT: [[VP_OP2:%.*]] = call @llvm.vp.or.nxv2i1( [[VP_OP7]], [[VP_OP12]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP7]]) +; CHECK-NEXT: [[VP_OP9:%.*]] = call @llvm.vp.or.nxv2i1( [[VP_OP2]], [[VP_OP5]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP7]]) +; CHECK-NEXT: [[VP_OP:%.*]] = call @llvm.vp.select.nxv2f64( 
[[VP_OP5]], [[VP_OP11]], [[VP_OP15]], i32 [[TMP7]]) +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, ptr [[C]], i64 [[INDEX]] +; CHECK-NEXT: call void @llvm.vp.store.nxv2f64.p0( [[VP_OP]], ptr [[TMP12]], [[VP_OP9]], i32 [[TMP7]]), !tbaa [[TBAA7]], !alias.scope !18, !noalias !20 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[VL]] +; CHECK-NEXT: [[EVL_NEXT]] = sub i64 [[N]], [[INDEX_NEXT]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N]] +; CHECK-NEXT: [[VP_OP10:%.*]] = call @llvm.vp.xor.nxv2i1( [[VP_OP3]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP7]]) +; CHECK-NEXT: [[VP_OP1:%.*]] = call @llvm.vp.xor.nxv2i1( [[VP_OP3]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP7]]) +; CHECK-NEXT: br i1 [[TMP13]], label [[FOR_END_LOOPEXIT51:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] +; CHECK: for.body: +; CHECK-NEXT: [[I_034:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ 0, [[FOR_BODY_PREHEADER50]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[K]], i64 [[I_034]] +; CHECK-NEXT: [[TMP14:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA7]] +; CHECK-NEXT: [[CMP1:%.*]] = fcmp olt double [[TMP14]], 5.000000e+01 +; CHECK-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[I_034]] +; CHECK-NEXT: [[TMP15:%.*]] = load double, ptr [[ARRAYIDX2]], align 8, !tbaa [[TBAA7]] +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[I_034]] +; CHECK-NEXT: [[TMP16:%.*]] = load double, ptr [[ARRAYIDX3]], align 8, !tbaa [[TBAA7]] +; CHECK-NEXT: [[ADD:%.*]] = fadd double [[TMP15]], [[TMP16]] +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: if.else: +; CHECK-NEXT: [[CMP6:%.*]] = fcmp ogt double [[TMP14]], 7.500000e+01 +; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[I_034]] +; CHECK-NEXT: [[TMP17:%.*]] = load double, ptr [[ARRAYIDX8]], align 8, !tbaa [[TBAA7]] +; CHECK-NEXT: br i1 [[CMP6]], label [[IF_THEN7:%.*]], label [[IF_ELSE11:%.*]] +; CHECK: if.then7: +; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[I_034]] +; CHECK-NEXT: [[TMP18:%.*]] = load double, ptr [[ARRAYIDX9]], align 8, !tbaa [[TBAA7]] +; CHECK-NEXT: [[MUL:%.*]] = fmul double [[TMP17]], [[TMP18]] +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: if.else11: +; CHECK-NEXT: [[MUL13:%.*]] = fmul double [[TMP17]], 2.000000e+00 +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: for.inc: +; CHECK-NEXT: [[ADD_SINK:%.*]] = phi double [ [[ADD]], [[IF_THEN]] ], [ [[MUL13]], [[IF_ELSE11]] ], [ [[MUL]], [[IF_THEN7]] ] +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, ptr [[C]], i64 [[I_034]] +; CHECK-NEXT: store double [[ADD_SINK]], ptr [[ARRAYIDX4]], align 8, !tbaa [[TBAA7]] +; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_034]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]] +; CHECK: for.end.loopexit: +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end.loopexit51: +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + %cmp33 = icmp sgt 
i64 %N, 0 + br i1 %cmp33, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + %0 = xor i64 %N, -1 + %1 = call i64 @llvm.vscale.i64() + %2 = shl nuw nsw i64 %1, 1 + %3 = call i64 @llvm.umax.i64(i64 %2, i64 12) + %4 = icmp ugt i64 %3, %0 + br i1 %4, label %for.body.preheader50, label %vector.memcheck + +for.body.preheader50: ; preds = %vector.memcheck, %for.body.preheader + br label %for.body + +vector.memcheck: ; preds = %for.body.preheader + %5 = shl i64 %N, 3 + %scevgep = getelementptr i8, ptr %C, i64 %5 + %scevgep35 = getelementptr i8, ptr %K, i64 %5 + %scevgep36 = getelementptr i8, ptr %A, i64 %5 + %scevgep37 = getelementptr i8, ptr %B, i64 %5 + %bound0 = icmp ugt ptr %scevgep35, %C + %bound1 = icmp ugt ptr %scevgep, %K + %found.conflict = and i1 %bound0, %bound1 + %bound038 = icmp ugt ptr %scevgep36, %C + %bound139 = icmp ugt ptr %scevgep, %A + %found.conflict40 = and i1 %bound038, %bound139 + %conflict.rdx = or i1 %found.conflict, %found.conflict40 + %bound041 = icmp ugt ptr %scevgep37, %C + %bound142 = icmp ugt ptr %scevgep, %B + %found.conflict43 = and i1 %bound041, %bound142 + %conflict.rdx44 = or i1 %conflict.rdx, %found.conflict43 + br i1 %conflict.rdx44, label %for.body.preheader50, label %vector.body.preheader + +vector.body.preheader: ; preds = %vector.memcheck + br label %vector.body + +vector.body: ; preds = %vector.body.preheader, %vector.body + %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.body.preheader ] + %evl.phi = phi i64 [ %evl.next, %vector.body ], [ %N, %vector.body.preheader ] + %6 = and i64 %evl.phi, 4294967295 + %vl = call i64 @llvm.riscv.vsetvli.i64(i64 %6, i64 3, i64 1) + %7 = trunc i64 %vl to i32 + %active.lane.mask = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 %index, i64 %N) + %8 = getelementptr inbounds double, ptr %K, i64 %index + %vp.load = call @llvm.vp.load.nxv2f64.p0(ptr %8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 %7), !tbaa !7, !alias.scope !11 + %9 = fcmp olt %vp.load, shufflevector ( insertelement ( poison, double 5.000000e+01, i64 0), poison, zeroinitializer) + %10 = fcmp ogt %vp.load, shufflevector ( insertelement ( poison, double 7.500000e+01, i64 0), poison, zeroinitializer) + %11 = getelementptr double, ptr %A, i64 %index + %12 = xor %9, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) + %13 = select %active.lane.mask, %12, zeroinitializer + %vp.load45 = call @llvm.vp.load.nxv2f64.p0(ptr %11, %13, i32 %7), !tbaa !7, !alias.scope !14 + %14 = fmul %vp.load45, shufflevector ( insertelement ( poison, double 2.000000e+00, i64 0), poison, zeroinitializer) + %15 = getelementptr double, ptr %B, i64 %index + %16 = select %13, %10, zeroinitializer + %vp.load46 = call @llvm.vp.load.nxv2f64.p0(ptr %15, %16, i32 %7), !tbaa !7, !alias.scope !16 + %17 = fmul %vp.load45, %vp.load46 + %18 = select %active.lane.mask, %9, zeroinitializer + %vp.load47 = call @llvm.vp.load.nxv2f64.p0(ptr %11, %18, i32 %7), !tbaa !7, !alias.scope !14 + %vp.load48 = call @llvm.vp.load.nxv2f64.p0(ptr %15, %18, i32 %7), !tbaa !7, !alias.scope !16 + %19 = fadd %vp.load47, %vp.load48 + %20 = xor %10, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) + %21 = select %13, %20, zeroinitializer + %predphi = select %21, %14, %19 + %predphi49 = select %16, %17, %predphi + %22 = getelementptr inbounds double, ptr %C, i64 %index + %23 = or %18, %21 + %24 = or %23, %16 + call void @llvm.vp.store.nxv2f64.p0( %predphi49, ptr %22, %24, i32 
%7), !tbaa !7, !alias.scope !18, !noalias !20 + %index.next = add i64 %index, %vl + %evl.next = sub i64 %N, %index.next + %25 = icmp eq i64 %index.next, %N + br i1 %25, label %for.end.loopexit51, label %vector.body, !llvm.loop !21 + +for.body: ; preds = %for.body.preheader50, %for.inc + %I.034 = phi i64 [ %inc, %for.inc ], [ 0, %for.body.preheader50 ] + %arrayidx = getelementptr inbounds double, ptr %K, i64 %I.034 + %26 = load double, ptr %arrayidx, align 8, !tbaa !7 + %cmp1 = fcmp olt double %26, 5.000000e+01 + br i1 %cmp1, label %if.then, label %if.else + +if.then: ; preds = %for.body + %arrayidx2 = getelementptr inbounds double, ptr %A, i64 %I.034 + %27 = load double, ptr %arrayidx2, align 8, !tbaa !7 + %arrayidx3 = getelementptr inbounds double, ptr %B, i64 %I.034 + %28 = load double, ptr %arrayidx3, align 8, !tbaa !7 + %add = fadd double %27, %28 + br label %for.inc + +if.else: ; preds = %for.body + %cmp6 = fcmp ogt double %26, 7.500000e+01 + %arrayidx8 = getelementptr inbounds double, ptr %A, i64 %I.034 + %29 = load double, ptr %arrayidx8, align 8, !tbaa !7 + br i1 %cmp6, label %if.then7, label %if.else11 + +if.then7: ; preds = %if.else + %arrayidx9 = getelementptr inbounds double, ptr %B, i64 %I.034 + %30 = load double, ptr %arrayidx9, align 8, !tbaa !7 + %mul = fmul double %29, %30 + br label %for.inc + +if.else11: ; preds = %if.else + %mul13 = fmul double %29, 2.000000e+00 + br label %for.inc + +for.inc: ; preds = %if.then, %if.else11, %if.then7 + %add.sink = phi double [ %add, %if.then ], [ %mul13, %if.else11 ], [ %mul, %if.then7 ] + %arrayidx4 = getelementptr inbounds double, ptr %C, i64 %I.034 + store double %add.sink, ptr %arrayidx4, align 8, !tbaa !7 + %inc = add nuw nsw i64 %I.034, 1 + %exitcond.not = icmp eq i64 %inc, %N + br i1 %exitcond.not, label %for.end.loopexit, label %for.body, !llvm.loop !25 + +for.end.loopexit: ; preds = %for.inc + br label %for.end + +for.end.loopexit51: ; preds = %vector.body + br label %for.end + +for.end: ; preds = %for.end.loopexit51, %for.end.loopexit, %entry + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) +declare i64 @llvm.vscale.i64() #1 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i64 @llvm.umax.i64(i64, i64) #2 + +; Function Attrs: nounwind memory(none) +declare i64 @llvm.riscv.vsetvli.i64(i64, i64 immarg, i64 immarg) #3 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) +declare @llvm.experimental.stepvector.nxv2i64() #1 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) +declare @llvm.get.active.lane.mask.nxv2i1.i64(i64, i64) #1 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: read) +declare @llvm.vp.load.nxv2f64.p0(ptr nocapture, , i32) #4 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: write) +declare void @llvm.vp.store.nxv2f64.p0(, ptr nocapture, , i32) #5 + +attributes #0 = { nofree norecurse nosync nounwind memory(argmem: readwrite) uwtable vscale_range(2,1024) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic-rv64" 
"target-features"="+64bit,+a,+c,+d,+f,+m,+relax,+v,+zicsr,+zifencei,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvl128b,+zvl32b,+zvl64b,-e,-experimental-smaia,-experimental-ssaia,-experimental-zca,-experimental-zcb,-experimental-zcd,-experimental-zcf,-experimental-zcmt,-experimental-zfa,-experimental-zicond,-experimental-zihintntl,-experimental-ztso,-experimental-zvbb,-experimental-zvbc,-experimental-zvfh,-experimental-zvkg,-experimental-zvkn,-experimental-zvkned,-experimental-zvkng,-experimental-zvknha,-experimental-zvknhb,-experimental-zvks,-experimental-zvksed,-experimental-zvksg,-experimental-zvksh,-experimental-zvkt,-h,-save-restore,-svinval,-svnapot,-svpbmt,-xsfvcp,-xtheadba,-xtheadbb,-xtheadbs,-xtheadcmo,-xtheadcondmov,-xtheadfmemidx,-xtheadmac,-xtheadmemidx,-xtheadmempair,-xtheadsync,-xtheadvdot,-xventanacondops,-zawrs,-zba,-zbb,-zbc,-zbkb,-zbkc,-zbkx,-zbs,-zdinx,-zfh,-zfhmin,-zfinx,-zhinx,-zhinxmin,-zicbom,-zicbop,-zicboz,-zicntr,-zihintpause,-zihpm,-zk,-zkn,-zknd,-zkne,-zknh,-zkr,-zks,-zksed,-zksh,-zkt,-zmmul,-zvl1024b,-zvl16384b,-zvl2048b,-zvl256b,-zvl32768b,-zvl4096b,-zvl512b,-zvl65536b,-zvl8192b" } +attributes #1 = { nocallback nofree nosync nounwind willreturn memory(none) } +attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #3 = { nounwind memory(none) } +attributes #4 = { nocallback nofree nosync nounwind willreturn memory(argmem: read) } +attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: write) } + +!llvm.module.flags = !{!0, !1, !2, !3, !4, !5} +!llvm.ident = !{!6} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, !"target-abi", !"lp64d"} +!2 = !{i32 8, !"PIC Level", i32 2} +!3 = !{i32 7, !"PIE Level", i32 2} +!4 = !{i32 7, !"uwtable", i32 2} +!5 = !{i32 8, !"SmallDataLimit", i32 8} +!6 = !{!"clang version 17.0.0"} +!7 = !{!8, !8, i64 0} +!8 = !{!"double", !9, i64 0} +!9 = !{!"omnipotent char", !10, i64 0} +!10 = !{!"Simple C/C++ TBAA"} +!11 = !{!12} +!12 = distinct !{!12, !13} +!13 = distinct !{!13, !"LVerDomain"} +!14 = !{!15} +!15 = distinct !{!15, !13} +!16 = !{!17} +!17 = distinct !{!17, !13} +!18 = !{!19} +!19 = distinct !{!19, !13} +!20 = !{!12, !15, !17} +!21 = distinct !{!21, !22, !23, !24} +!22 = !{!"llvm.loop.mustprogress"} +!23 = !{!"llvm.loop.isvectorized", i32 1} +!24 = !{!"llvm.loop.unroll.runtime.disable"} +!25 = distinct !{!25, !22, !23} diff --git a/llvm/test/Transforms/VectorPredication/if-else_scalar-cond.ll b/llvm/test/Transforms/VectorPredication/if-else_scalar-cond.ll new file mode 100644 index 000000000000..ed8f28feeffc --- /dev/null +++ b/llvm/test/Transforms/VectorPredication/if-else_scalar-cond.ll @@ -0,0 +1,209 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S --passes=vector-predication -o - < %s | FileCheck %s + +; ModuleID = 'custom/if-else2.c' +source_filename = "custom/if-else2.c" +target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128" +target triple = "riscv64-unknown-unknown" + +; Input C code: +; void addVec(long N, double *C, double *A, double *B) { +; long I; +; for (I = 0; I < N; I++) { +; if (N < 50) +; C[I] = A[I] + B[I]; +; else +; C[I] = A[I] * B[I]; +; } +; } + +; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite) vscale_range(2,1024) +define dso_local void @addVec(i64 noundef %N, ptr nocapture noundef writeonly %C, ptr nocapture noundef readonly %A, ptr nocapture noundef readonly %B) local_unnamed_addr #0 { +; CHECK-LABEL: @addVec( +; CHECK-NEXT: entry: +; CHECK-NEXT: 
[[B22:%.*]] = ptrtoint ptr [[B:%.*]] to i64 +; CHECK-NEXT: [[A21:%.*]] = ptrtoint ptr [[A:%.*]] to i64 +; CHECK-NEXT: [[C20:%.*]] = ptrtoint ptr [[C:%.*]] to i64 +; CHECK-NEXT: [[CMP18:%.*]] = icmp sgt i64 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP18]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]] +; CHECK: for.body.lr.ph: +; CHECK-NEXT: [[CMP1:%.*]] = icmp ult i64 [[N]], 50 +; CHECK-NEXT: [[TMP0:%.*]] = xor i64 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 8) +; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt i64 [[TMP2]], [[TMP0]] +; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 3 +; CHECK-NEXT: [[TMP6:%.*]] = sub i64 [[C20]], [[A21]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP4]], 3 +; CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[C20]], [[B22]] +; CHECK-NEXT: [[DIFF_CHECK23:%.*]] = icmp ult i64 [[TMP8]], [[TMP7]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK23]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER]], label [[VECTOR_BODY_PREHEADER:%.*]] +; CHECK: vector.body.preheader: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[VECTOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[EVL_PHI:%.*]] = phi i64 [ [[EVL_NEXT:%.*]], [[VECTOR_BODY]] ], [ [[N]], [[VECTOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[TMP9:%.*]] = and i64 [[EVL_PHI]], 4294967295 +; CHECK-NEXT: [[VL:%.*]] = call i64 @llvm.riscv.vsetvli.i64(i64 [[TMP9]], i64 3, i64 0) +; CHECK-NEXT: [[TMP10:%.*]] = trunc i64 [[VL]] to i32 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[VP_LOAD:%.*]] = call @llvm.vp.load.nxv1f64.p0(ptr [[TMP11]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]), !tbaa [[TBAA4:![0-9]+]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[VP_LOAD24:%.*]] = call @llvm.vp.load.nxv1f64.p0(ptr [[TMP12]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]), !tbaa [[TBAA4]] +; CHECK-NEXT: [[VP_OP:%.*]] = call @llvm.vp.fadd.nxv1f64( [[VP_LOAD]], [[VP_LOAD24]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; CHECK-NEXT: [[VP_OP2:%.*]] = call @llvm.vp.fmul.nxv1f64( [[VP_LOAD]], [[VP_LOAD24]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; CHECK-NEXT: [[SELECT_COND_SPLAT_SPLATINSERT:%.*]] = insertelement poison, i1 [[CMP1]], i64 0 +; CHECK-NEXT: [[SELECT_COND_SPLAT_SPLAT:%.*]] = shufflevector [[SELECT_COND_SPLAT_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[VP_OP1:%.*]] = call @llvm.vp.select.nxv1f64( [[SELECT_COND_SPLAT_SPLAT]], [[VP_OP]], [[VP_OP2]], i32 [[TMP10]]) +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds double, ptr [[C]], i64 [[INDEX]] +; CHECK-NEXT: call void @llvm.vp.store.nxv1f64.p0( [[VP_OP1]], ptr [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 
[[INDEX]], [[VL]] +; CHECK-NEXT: [[EVL_NEXT]] = sub i64 [[N]], [[INDEX_NEXT]] +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[TMP14]], label [[FOR_END_LOOPEXIT25:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: for.body: +; CHECK-NEXT: [[I_019:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[I_019]] +; CHECK-NEXT: [[TMP15:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[I_019]] +; CHECK-NEXT: [[TMP16:%.*]] = load double, ptr [[ARRAYIDX2]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: [[ADD:%.*]] = fadd double [[TMP15]], [[TMP16]] +; CHECK-NEXT: [[MUL:%.*]] = fmul double [[TMP15]], [[TMP16]] +; CHECK-NEXT: [[MUL_SINK:%.*]] = select i1 [[CMP1]], double [[ADD]], double [[MUL]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds double, ptr [[C]], i64 [[I_019]] +; CHECK-NEXT: store double [[MUL_SINK]], ptr [[TMP17]], align 8 +; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_019]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK: for.end.loopexit: +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end.loopexit25: +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + %B22 = ptrtoint ptr %B to i64 + %A21 = ptrtoint ptr %A to i64 + %C20 = ptrtoint ptr %C to i64 + %cmp18 = icmp sgt i64 %N, 0 + br i1 %cmp18, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + %cmp1 = icmp ult i64 %N, 50 + %0 = xor i64 %N, -1 + %1 = call i64 @llvm.vscale.i64() + %2 = call i64 @llvm.umax.i64(i64 %1, i64 8) + %3 = icmp ugt i64 %2, %0 + br i1 %3, label %for.body.preheader, label %vector.memcheck + +for.body.preheader: ; preds = %vector.memcheck, %for.body.lr.ph + br label %for.body + +vector.memcheck: ; preds = %for.body.lr.ph + %4 = call i64 @llvm.vscale.i64() + %5 = shl nuw nsw i64 %4, 3 + %6 = sub i64 %C20, %A21 + %diff.check = icmp ult i64 %6, %5 + %7 = shl nuw nsw i64 %4, 3 + %8 = sub i64 %C20, %B22 + %diff.check23 = icmp ult i64 %8, %7 + %conflict.rdx = or i1 %diff.check, %diff.check23 + br i1 %conflict.rdx, label %for.body.preheader, label %vector.body.preheader + +vector.body.preheader: ; preds = %vector.memcheck + br label %vector.body + +vector.body: ; preds = %vector.body.preheader, %vector.body + %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.body.preheader ] + %evl.phi = phi i64 [ %evl.next, %vector.body ], [ %N, %vector.body.preheader ] + %9 = and i64 %evl.phi, 4294967295 + %vl = call i64 @llvm.riscv.vsetvli.i64(i64 %9, i64 3, i64 0) + %10 = trunc i64 %vl to i32 + %11 = getelementptr inbounds double, ptr %A, i64 %index + %vp.load = call @llvm.vp.load.nxv1f64.p0(ptr %11, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 %10), !tbaa !4 + %12 = getelementptr inbounds double, ptr %B, i64 %index + %vp.load24 = call @llvm.vp.load.nxv1f64.p0(ptr %12, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 %10), !tbaa !4 + %13 = fadd %vp.load, %vp.load24 + %14 = fmul %vp.load, %vp.load24 + %15 = select i1 %cmp1, %13, %14 + %16 = getelementptr inbounds double, ptr %C, i64 %index + call void @llvm.vp.store.nxv1f64.p0( %15, ptr %16, shufflevector ( insertelement ( poison, i1 
true, i64 0), poison, zeroinitializer), i32 %10) + %index.next = add i64 %index, %vl + %evl.next = sub i64 %N, %index.next + %17 = icmp eq i64 %index.next, %N + br i1 %17, label %for.end.loopexit25, label %vector.body, !llvm.loop !8 + +for.body: ; preds = %for.body.preheader, %for.body + %I.019 = phi i64 [ %inc, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds double, ptr %A, i64 %I.019 + %18 = load double, ptr %arrayidx, align 8, !tbaa !4 + %arrayidx2 = getelementptr inbounds double, ptr %B, i64 %I.019 + %19 = load double, ptr %arrayidx2, align 8, !tbaa !4 + %add = fadd double %18, %19 + %mul = fmul double %18, %19 + %mul.sink = select i1 %cmp1, double %add, double %mul + %20 = getelementptr inbounds double, ptr %C, i64 %I.019 + store double %mul.sink, ptr %20, align 8 + %inc = add nuw nsw i64 %I.019, 1 + %exitcond.not = icmp eq i64 %inc, %N + br i1 %exitcond.not, label %for.end.loopexit, label %for.body, !llvm.loop !12 + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end.loopexit25: ; preds = %vector.body + br label %for.end + +for.end: ; preds = %for.end.loopexit25, %for.end.loopexit, %entry + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) +declare i64 @llvm.vscale.i64() #1 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i64 @llvm.umax.i64(i64, i64) #2 + +; Function Attrs: nounwind memory(none) +declare i64 @llvm.riscv.vsetvli.i64(i64, i64 immarg, i64 immarg) #3 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) +declare @llvm.experimental.stepvector.nxv1i64() #1 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: read) +declare @llvm.vp.load.nxv1f64.p0(ptr nocapture, , i32) #4 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: write) +declare void @llvm.vp.store.nxv1f64.p0(, ptr nocapture, , i32) #5 + +attributes #0 = { nofree norecurse nosync nounwind memory(argmem: readwrite) vscale_range(2,1024) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic-rv64" "target-features"="+64bit,+a,+c,+d,+f,+m,+relax,+v,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvl128b,+zvl32b,+zvl64b,-e,-experimental-zawrs,-experimental-zca,-experimental-zcb,-experimental-zcd,-experimental-zcf,-experimental-zihintntl,-experimental-ztso,-experimental-zvfh,-h,-save-restore,-svinval,-svnapot,-svpbmt,-xtheadba,-xtheadvdot,-xventanacondops,-zba,-zbb,-zbc,-zbkb,-zbkc,-zbkx,-zbs,-zdinx,-zfh,-zfhmin,-zfinx,-zhinx,-zhinxmin,-zicbom,-zicbop,-zicboz,-zihintpause,-zk,-zkn,-zknd,-zkne,-zknh,-zkr,-zks,-zksed,-zksh,-zkt,-zmmul,-zvl1024b,-zvl16384b,-zvl2048b,-zvl256b,-zvl32768b,-zvl4096b,-zvl512b,-zvl65536b,-zvl8192b" } +attributes #1 = { nocallback nofree nosync nounwind willreturn memory(none) } +attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #3 = { nounwind memory(none) } +attributes #4 = { nocallback nofree nosync nounwind willreturn memory(argmem: read) } +attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: write) } + +!llvm.module.flags = !{!0, !1, !2} +!llvm.ident = !{!3} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, !"target-abi", !"lp64d"} +!2 = !{i32 8, !"SmallDataLimit", i32 8} +!3 = !{!"clang version 17.0.0"} +!4 = !{!5, !5, i64 0} +!5 = !{!"double", !6, i64 0} +!6 = !{!"omnipotent char", !7, i64 0} +!7 = !{!"Simple C/C++ TBAA"} +!8 = distinct !{!8, !9, !10, !11} +!9 = 
!{!"llvm.loop.mustprogress"} +!10 = !{!"llvm.loop.isvectorized", i32 1} +!11 = !{!"llvm.loop.unroll.runtime.disable"} +!12 = distinct !{!12, !9, !10} diff --git a/llvm/test/Transforms/VectorPredication/if-else_vec-cond.ll b/llvm/test/Transforms/VectorPredication/if-else_vec-cond.ll new file mode 100644 index 000000000000..34e4c63c12af --- /dev/null +++ b/llvm/test/Transforms/VectorPredication/if-else_vec-cond.ll @@ -0,0 +1,220 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S --passes=vector-predication -o - < %s | FileCheck %s + +; ModuleID = 'custom/if-else1.c' +source_filename = "custom/if-else1.c" +target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128" +target triple = "riscv64-unknown-unknown" + +; Input C code: +; void addVec(long N, double *C, double *A, double *B) { +; long I; +; for (I = 0; I < N; I++) { +; if (I < 50) +; C[I] = A[I] + B[I]; +; else +; C[I] = A[I] * B[I]; +; } +; } + +; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite) vscale_range(2,1024) +define dso_local void @addVec(i64 noundef %N, ptr nocapture noundef writeonly %C, ptr nocapture noundef readonly %A, ptr nocapture noundef readonly %B) local_unnamed_addr #0 { +; CHECK-LABEL: @addVec( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[B22:%.*]] = ptrtoint ptr [[B:%.*]] to i64 +; CHECK-NEXT: [[A21:%.*]] = ptrtoint ptr [[A:%.*]] to i64 +; CHECK-NEXT: [[C20:%.*]] = ptrtoint ptr [[C:%.*]] to i64 +; CHECK-NEXT: [[CMP18:%.*]] = icmp sgt i64 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP18]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[TMP0:%.*]] = xor i64 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 8) +; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt i64 [[TMP2]], [[TMP0]] +; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_BODY_PREHEADER25:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: for.body.preheader25: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 3 +; CHECK-NEXT: [[TMP6:%.*]] = sub i64 [[C20]], [[A21]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP4]], 3 +; CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[C20]], [[B22]] +; CHECK-NEXT: [[DIFF_CHECK23:%.*]] = icmp ult i64 [[TMP8]], [[TMP7]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK23]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER25]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP9:%.*]] = call @llvm.experimental.stepvector.nxv1i64() +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[EVL_PHI:%.*]] = phi i64 [ [[N]], [[VECTOR_PH]] ], [ [[EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP9]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP10:%.*]] = and i64 [[EVL_PHI]], 4294967295 +; CHECK-NEXT: [[VL:%.*]] = call i64 @llvm.riscv.vsetvli.i64(i64 [[TMP10]], i64 3, i64 0) +; CHECK-NEXT: [[TMP11:%.*]] = trunc i64 [[VL]] to i32 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[VL]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[VP_OP2:%.*]] 
= call @llvm.vp.icmp.nxv1i64( [[VEC_IND]], shufflevector ( insertelement ( poison, i64 50, i64 0), poison, zeroinitializer), metadata !"ult", shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[VP_LOAD:%.*]] = call @llvm.vp.load.nxv1f64.p0(ptr [[TMP12]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]), !tbaa [[TBAA4:![0-9]+]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[VP_LOAD24:%.*]] = call @llvm.vp.load.nxv1f64.p0(ptr [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]), !tbaa [[TBAA4]] +; CHECK-NEXT: [[VP_OP3:%.*]] = call @llvm.vp.fadd.nxv1f64( [[VP_LOAD]], [[VP_LOAD24]], [[VP_OP2]], i32 [[TMP11]]) +; CHECK-NEXT: [[VP_OP4:%.*]] = call @llvm.vp.xor.nxv1i1( [[VP_OP2]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; CHECK-NEXT: [[VP_OP1:%.*]] = call @llvm.vp.fmul.nxv1f64( [[VP_LOAD]], [[VP_LOAD24]], [[VP_OP4]], i32 [[TMP11]]) +; CHECK-NEXT: [[VP_OP:%.*]] = call @llvm.vp.select.nxv1f64( [[VP_OP2]], [[VP_OP3]], [[VP_OP1]], i32 [[TMP11]]) +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, ptr [[C]], i64 [[INDEX]] +; CHECK-NEXT: call void @llvm.vp.store.nxv1f64.p0( [[VP_OP]], ptr [[TMP14]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[VL]] +; CHECK-NEXT: [[EVL_NEXT]] = sub i64 [[N]], [[INDEX_NEXT]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[TMP15]], label [[FOR_END_LOOPEXIT26:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: for.body: +; CHECK-NEXT: [[I_019:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER25]] ] +; CHECK-NEXT: [[CMP1:%.*]] = icmp ult i64 [[I_019]], 50 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[I_019]] +; CHECK-NEXT: [[TMP16:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[I_019]] +; CHECK-NEXT: [[TMP17:%.*]] = load double, ptr [[ARRAYIDX2]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: [[ADD:%.*]] = fadd double [[TMP16]], [[TMP17]] +; CHECK-NEXT: [[MUL:%.*]] = fmul double [[TMP16]], [[TMP17]] +; CHECK-NEXT: [[MUL_SINK:%.*]] = select i1 [[CMP1]], double [[ADD]], double [[MUL]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds double, ptr [[C]], i64 [[I_019]] +; CHECK-NEXT: store double [[MUL_SINK]], ptr [[TMP18]], align 8 +; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_019]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK: for.end.loopexit: +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end.loopexit26: +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + %B22 = ptrtoint ptr %B to i64 + %A21 = ptrtoint ptr %A to i64 + %C20 = ptrtoint ptr %C to i64 + %cmp18 = icmp sgt i64 %N, 0 + br i1 %cmp18, label %for.body.preheader, label %for.end + 
+for.body.preheader: ; preds = %entry + %0 = xor i64 %N, -1 + %1 = call i64 @llvm.vscale.i64() + %2 = call i64 @llvm.umax.i64(i64 %1, i64 8) + %3 = icmp ugt i64 %2, %0 + br i1 %3, label %for.body.preheader25, label %vector.memcheck + +for.body.preheader25: ; preds = %vector.memcheck, %for.body.preheader + br label %for.body + +vector.memcheck: ; preds = %for.body.preheader + %4 = call i64 @llvm.vscale.i64() + %5 = shl nuw nsw i64 %4, 3 + %6 = sub i64 %C20, %A21 + %diff.check = icmp ult i64 %6, %5 + %7 = shl nuw nsw i64 %4, 3 + %8 = sub i64 %C20, %B22 + %diff.check23 = icmp ult i64 %8, %7 + %conflict.rdx = or i1 %diff.check, %diff.check23 + br i1 %conflict.rdx, label %for.body.preheader25, label %vector.ph + +vector.ph: ; preds = %vector.memcheck + %9 = call @llvm.experimental.stepvector.nxv1i64() + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %evl.phi = phi i64 [ %N, %vector.ph ], [ %evl.next, %vector.body ] + %vec.ind = phi [ %9, %vector.ph ], [ %vec.ind.next, %vector.body ] + %10 = and i64 %evl.phi, 4294967295 + %vl = call i64 @llvm.riscv.vsetvli.i64(i64 %10, i64 3, i64 0) + %11 = trunc i64 %vl to i32 + %.splatinsert = insertelement poison, i64 %vl, i64 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + %12 = icmp ult %vec.ind, shufflevector ( insertelement ( poison, i64 50, i64 0), poison, zeroinitializer) + %13 = getelementptr inbounds double, ptr %A, i64 %index + %vp.load = call @llvm.vp.load.nxv1f64.p0(ptr %13, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 %11), !tbaa !4 + %14 = getelementptr inbounds double, ptr %B, i64 %index + %vp.load24 = call @llvm.vp.load.nxv1f64.p0(ptr %14, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 %11), !tbaa !4 + %15 = fadd %vp.load, %vp.load24 + %16 = fmul %vp.load, %vp.load24 + %17 = select %12, %15, %16 + %18 = getelementptr inbounds double, ptr %C, i64 %index + call void @llvm.vp.store.nxv1f64.p0( %17, ptr %18, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 %11) + %index.next = add i64 %index, %vl + %evl.next = sub i64 %N, %index.next + %vec.ind.next = add %vec.ind, %.splat + %19 = icmp eq i64 %index.next, %N + br i1 %19, label %for.end.loopexit26, label %vector.body, !llvm.loop !8 + +for.body: ; preds = %for.body.preheader25, %for.body + %I.019 = phi i64 [ %inc, %for.body ], [ 0, %for.body.preheader25 ] + %cmp1 = icmp ult i64 %I.019, 50 + %arrayidx = getelementptr inbounds double, ptr %A, i64 %I.019 + %20 = load double, ptr %arrayidx, align 8, !tbaa !4 + %arrayidx2 = getelementptr inbounds double, ptr %B, i64 %I.019 + %21 = load double, ptr %arrayidx2, align 8, !tbaa !4 + %add = fadd double %20, %21 + %mul = fmul double %20, %21 + %mul.sink = select i1 %cmp1, double %add, double %mul + %22 = getelementptr inbounds double, ptr %C, i64 %I.019 + store double %mul.sink, ptr %22, align 8 + %inc = add nuw nsw i64 %I.019, 1 + %exitcond.not = icmp eq i64 %inc, %N + br i1 %exitcond.not, label %for.end.loopexit, label %for.body, !llvm.loop !12 + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end.loopexit26: ; preds = %vector.body + br label %for.end + +for.end: ; preds = %for.end.loopexit26, %for.end.loopexit, %entry + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) +declare i64 @llvm.vscale.i64() #1 + +; Function Attrs: nocallback nofree nosync nounwind 
speculatable willreturn memory(none) +declare i64 @llvm.umax.i64(i64, i64) #2 + +; Function Attrs: nounwind memory(none) +declare i64 @llvm.riscv.vsetvli.i64(i64, i64 immarg, i64 immarg) #3 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) +declare @llvm.experimental.stepvector.nxv1i64() #1 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: read) +declare @llvm.vp.load.nxv1f64.p0(ptr nocapture, , i32) #4 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: write) +declare void @llvm.vp.store.nxv1f64.p0(, ptr nocapture, , i32) #5 + +attributes #0 = { nofree norecurse nosync nounwind memory(argmem: readwrite) vscale_range(2,1024) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic-rv64" "target-features"="+64bit,+a,+c,+d,+f,+m,+relax,+v,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvl128b,+zvl32b,+zvl64b,-e,-experimental-zawrs,-experimental-zca,-experimental-zcb,-experimental-zcd,-experimental-zcf,-experimental-zihintntl,-experimental-ztso,-experimental-zvfh,-h,-save-restore,-svinval,-svnapot,-svpbmt,-xtheadba,-xtheadvdot,-xventanacondops,-zba,-zbb,-zbc,-zbkb,-zbkc,-zbkx,-zbs,-zdinx,-zfh,-zfhmin,-zfinx,-zhinx,-zhinxmin,-zicbom,-zicbop,-zicboz,-zihintpause,-zk,-zkn,-zknd,-zkne,-zknh,-zkr,-zks,-zksed,-zksh,-zkt,-zmmul,-zvl1024b,-zvl16384b,-zvl2048b,-zvl256b,-zvl32768b,-zvl4096b,-zvl512b,-zvl65536b,-zvl8192b" } +attributes #1 = { nocallback nofree nosync nounwind willreturn memory(none) } +attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #3 = { nounwind memory(none) } +attributes #4 = { nocallback nofree nosync nounwind willreturn memory(argmem: read) } +attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: write) } + +!llvm.module.flags = !{!0, !1, !2} +!llvm.ident = !{!3} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, !"target-abi", !"lp64d"} +!2 = !{i32 8, !"SmallDataLimit", i32 8} +!3 = !{!"clang version 17.0.0"} +!4 = !{!5, !5, i64 0} +!5 = !{!"double", !6, i64 0} +!6 = !{!"omnipotent char", !7, i64 0} +!7 = !{!"Simple C/C++ TBAA"} +!8 = distinct !{!8, !9, !10, !11} +!9 = !{!"llvm.loop.mustprogress"} +!10 = !{!"llvm.loop.isvectorized", i32 1} +!11 = !{!"llvm.loop.unroll.runtime.disable"} +!12 = distinct !{!12, !9, !10} diff --git a/llvm/test/Transforms/VectorPredication/simple_vector_sum.ll b/llvm/test/Transforms/VectorPredication/simple_vector_sum.ll new file mode 100644 index 000000000000..116d883572ee --- /dev/null +++ b/llvm/test/Transforms/VectorPredication/simple_vector_sum.ll @@ -0,0 +1,193 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S --passes=vector-predication -o - < %s | FileCheck %s + +; ModuleID = 'custom/simple.c' +source_filename = "custom/simple.c" +target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128" +target triple = "riscv64-unknown-unknown" + +; Input C code: +; void addVec(long N, double *C, double *A, double *B) { +; long I; +; for (I = 0; I < N; I++) +; C[I] = A[I] + B[I]; +; } + +; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite) vscale_range(2,1024) +define dso_local void @addVec(i64 noundef %N, ptr nocapture noundef writeonly %C, ptr nocapture noundef readonly %A, ptr nocapture noundef readonly %B) local_unnamed_addr #0 { +; CHECK-LABEL: @addVec( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[B11:%.*]] = ptrtoint ptr [[B:%.*]] to i64 +; CHECK-NEXT: [[A10:%.*]] = ptrtoint ptr [[A:%.*]] to i64 +; 
CHECK-NEXT: [[C9:%.*]] = ptrtoint ptr [[C:%.*]] to i64 +; CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i64 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[TMP0:%.*]] = xor i64 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 10) +; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt i64 [[TMP2]], [[TMP0]] +; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_BODY_PREHEADER14:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: for.body.preheader14: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 3 +; CHECK-NEXT: [[TMP6:%.*]] = sub i64 [[C9]], [[A10]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP4]], 3 +; CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[C9]], [[B11]] +; CHECK-NEXT: [[DIFF_CHECK12:%.*]] = icmp ult i64 [[TMP8]], [[TMP7]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK12]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER14]], label [[VECTOR_BODY_PREHEADER:%.*]] +; CHECK: vector.body.preheader: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[VECTOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[EVL_PHI:%.*]] = phi i64 [ [[EVL_NEXT:%.*]], [[VECTOR_BODY]] ], [ [[N]], [[VECTOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[TMP9:%.*]] = and i64 [[EVL_PHI]], 4294967295 +; CHECK-NEXT: [[VL:%.*]] = call i64 @llvm.riscv.vsetvli.i64(i64 [[TMP9]], i64 3, i64 0) +; CHECK-NEXT: [[TMP10:%.*]] = trunc i64 [[VL]] to i32 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[VP_LOAD:%.*]] = call @llvm.vp.load.nxv1f64.p0(ptr [[TMP11]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]), !tbaa [[TBAA4:![0-9]+]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[VP_LOAD13:%.*]] = call @llvm.vp.load.nxv1f64.p0(ptr [[TMP12]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]), !tbaa [[TBAA4]] +; CHECK-NEXT: [[VP_OP:%.*]] = call @llvm.vp.fadd.nxv1f64( [[VP_LOAD]], [[VP_LOAD13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds double, ptr [[C]], i64 [[INDEX]] +; CHECK-NEXT: call void @llvm.vp.store.nxv1f64.p0( [[VP_OP]], ptr [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]), !tbaa [[TBAA4]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[VL]] +; CHECK-NEXT: [[EVL_NEXT]] = sub i64 [[N]], [[INDEX_NEXT]] +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[TMP14]], label [[FOR_END_LOOPEXIT15:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: for.body: +; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER14]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[I_08]] +; CHECK-NEXT: [[TMP15:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[I_08]] +; CHECK-NEXT: 
[[TMP16:%.*]] = load double, ptr [[ARRAYIDX1]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: [[ADD:%.*]] = fadd double [[TMP15]], [[TMP16]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[C]], i64 [[I_08]] +; CHECK-NEXT: store double [[ADD]], ptr [[ARRAYIDX2]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK: for.end.loopexit: +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end.loopexit15: +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + %B11 = ptrtoint ptr %B to i64 + %A10 = ptrtoint ptr %A to i64 + %C9 = ptrtoint ptr %C to i64 + %cmp7 = icmp sgt i64 %N, 0 + br i1 %cmp7, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + %0 = xor i64 %N, -1 + %1 = call i64 @llvm.vscale.i64() + %2 = call i64 @llvm.umax.i64(i64 %1, i64 10) + %3 = icmp ugt i64 %2, %0 + br i1 %3, label %for.body.preheader14, label %vector.memcheck + +for.body.preheader14: ; preds = %vector.memcheck, %for.body.preheader + br label %for.body + +vector.memcheck: ; preds = %for.body.preheader + %4 = call i64 @llvm.vscale.i64() + %5 = shl nuw nsw i64 %4, 3 + %6 = sub i64 %C9, %A10 + %diff.check = icmp ult i64 %6, %5 + %7 = shl nuw nsw i64 %4, 3 + %8 = sub i64 %C9, %B11 + %diff.check12 = icmp ult i64 %8, %7 + %conflict.rdx = or i1 %diff.check, %diff.check12 + br i1 %conflict.rdx, label %for.body.preheader14, label %vector.body.preheader + +vector.body.preheader: ; preds = %vector.memcheck + br label %vector.body + +vector.body: ; preds = %vector.body.preheader, %vector.body + %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.body.preheader ] + %evl.phi = phi i64 [ %evl.next, %vector.body ], [ %N, %vector.body.preheader ] + %9 = and i64 %evl.phi, 4294967295 + %vl = call i64 @llvm.riscv.vsetvli.i64(i64 %9, i64 3, i64 0) + %10 = trunc i64 %vl to i32 + %11 = getelementptr inbounds double, ptr %A, i64 %index + %vp.load = call @llvm.vp.load.nxv1f64.p0(ptr %11, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 %10), !tbaa !4 + %12 = getelementptr inbounds double, ptr %B, i64 %index + %vp.load13 = call @llvm.vp.load.nxv1f64.p0(ptr %12, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 %10), !tbaa !4 + %13 = fadd %vp.load, %vp.load13 + %14 = getelementptr inbounds double, ptr %C, i64 %index + call void @llvm.vp.store.nxv1f64.p0( %13, ptr %14, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 %10), !tbaa !4 + %index.next = add i64 %index, %vl + %evl.next = sub i64 %N, %index.next + %15 = icmp eq i64 %index.next, %N + br i1 %15, label %for.end.loopexit15, label %vector.body, !llvm.loop !8 + +for.body: ; preds = %for.body.preheader14, %for.body + %I.08 = phi i64 [ %inc, %for.body ], [ 0, %for.body.preheader14 ] + %arrayidx = getelementptr inbounds double, ptr %A, i64 %I.08 + %16 = load double, ptr %arrayidx, align 8, !tbaa !4 + %arrayidx1 = getelementptr inbounds double, ptr %B, i64 %I.08 + %17 = load double, ptr %arrayidx1, align 8, !tbaa !4 + %add = fadd double %16, %17 + %arrayidx2 = getelementptr inbounds double, ptr %C, i64 %I.08 + store double %add, ptr %arrayidx2, align 8, !tbaa !4 + %inc = add nuw nsw i64 %I.08, 1 + %exitcond.not = icmp eq i64 %inc, %N + br i1 %exitcond.not, label %for.end.loopexit, 
label %for.body, !llvm.loop !12 + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end.loopexit15: ; preds = %vector.body + br label %for.end + +for.end: ; preds = %for.end.loopexit15, %for.end.loopexit, %entry + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) +declare i64 @llvm.vscale.i64() #1 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i64 @llvm.umax.i64(i64, i64) #2 + +; Function Attrs: nounwind memory(none) +declare i64 @llvm.riscv.vsetvli.i64(i64, i64 immarg, i64 immarg) #3 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) +declare @llvm.experimental.stepvector.nxv1i64() #1 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: read) +declare @llvm.vp.load.nxv1f64.p0(ptr nocapture, , i32) #4 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: write) +declare void @llvm.vp.store.nxv1f64.p0(, ptr nocapture, , i32) #5 + +attributes #0 = { nofree norecurse nosync nounwind memory(argmem: readwrite) vscale_range(2,1024) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic-rv64" "target-features"="+64bit,+a,+c,+d,+f,+m,+relax,+v,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvl128b,+zvl32b,+zvl64b,-e,-experimental-zawrs,-experimental-zca,-experimental-zcb,-experimental-zcd,-experimental-zcf,-experimental-zihintntl,-experimental-ztso,-experimental-zvfh,-h,-save-restore,-svinval,-svnapot,-svpbmt,-xtheadba,-xtheadvdot,-xventanacondops,-zba,-zbb,-zbc,-zbkb,-zbkc,-zbkx,-zbs,-zdinx,-zfh,-zfhmin,-zfinx,-zhinx,-zhinxmin,-zicbom,-zicbop,-zicboz,-zihintpause,-zk,-zkn,-zknd,-zkne,-zknh,-zkr,-zks,-zksed,-zksh,-zkt,-zmmul,-zvl1024b,-zvl16384b,-zvl2048b,-zvl256b,-zvl32768b,-zvl4096b,-zvl512b,-zvl65536b,-zvl8192b" } +attributes #1 = { nocallback nofree nosync nounwind willreturn memory(none) } +attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #3 = { nounwind memory(none) } +attributes #4 = { nocallback nofree nosync nounwind willreturn memory(argmem: read) } +attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: write) } + +!llvm.module.flags = !{!0, !1, !2} +!llvm.ident = !{!3} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, !"target-abi", !"lp64d"} +!2 = !{i32 8, !"SmallDataLimit", i32 8} +!3 = !{!"clang version 17.0.0"} +!4 = !{!5, !5, i64 0} +!5 = !{!"double", !6, i64 0} +!6 = !{!"omnipotent char", !7, i64 0} +!7 = !{!"Simple C/C++ TBAA"} +!8 = distinct !{!8, !9, !10, !11} +!9 = !{!"llvm.loop.mustprogress"} +!10 = !{!"llvm.loop.isvectorized", i32 1} +!11 = !{!"llvm.loop.unroll.runtime.disable"} +!12 = distinct !{!12, !9, !10} diff --git a/llvm/unittests/IR/VectorBuilderTest.cpp b/llvm/unittests/IR/VectorBuilderTest.cpp index 4f9e9d7c494d..7b0109a77b3e 100644 --- a/llvm/unittests/IR/VectorBuilderTest.cpp +++ b/llvm/unittests/IR/VectorBuilderTest.cpp @@ -66,8 +66,8 @@ TEST_F(VectorBuilderTest, TestCreateBinaryInstructions) { bool IsFP = (#INSTCLASS)[0] == 'F'; \ auto *ValueTy = IsFP ? 
FloatVecTy : IntVecTy; \ Value *Op = UndefValue::get(ValueTy); \ - auto *I = VBuild.createVectorInstruction(Instruction::OPCODE, ValueTy, \ - {Op, Op}); \ + auto *I = VBuild.createVectorInstructionFromOpcode(Instruction::OPCODE, \ + ValueTy, {Op, Op}); \ ASSERT_TRUE(isa<VPIntrinsic>(I)); \ auto *VPIntrin = cast<VPIntrinsic>(I); \ ASSERT_EQ(VPIntrin->getIntrinsicID(), VPID); \ @@ -116,8 +116,8 @@ TEST_F(VectorBuilderTest, TestCreateBinaryInstructions_FixedVector_NoMask) { bool IsFP = (#INSTCLASS)[0] == 'F'; \ Type *ValueTy = IsFP ? FloatVecTy : IntVecTy; \ Value *Op = UndefValue::get(ValueTy); \ - auto *I = VBuild.createVectorInstruction(Instruction::OPCODE, ValueTy, \ - {Op, Op}); \ + auto *I = VBuild.createVectorInstructionFromOpcode(Instruction::OPCODE, \ + ValueTy, {Op, Op}); \ ASSERT_TRUE(isa<VPIntrinsic>(I)); \ auto *VPIntrin = cast<VPIntrinsic>(I); \ ASSERT_EQ(VPIntrin->getIntrinsicID(), VPID); \ @@ -162,8 +162,8 @@ TEST_F(VectorBuilderTest, TestCreateBinaryInstructions_FixedVector_NoEVL) { bool IsFP = (#INSTCLASS)[0] == 'F'; \ Type *ValueTy = IsFP ? FloatVecTy : IntVecTy; \ Value *Op = UndefValue::get(ValueTy); \ - auto *I = VBuild.createVectorInstruction(Instruction::OPCODE, ValueTy, \ - {Op, Op}); \ + auto *I = VBuild.createVectorInstructionFromOpcode(Instruction::OPCODE, \ + ValueTy, {Op, Op}); \ ASSERT_TRUE(isa<VPIntrinsic>(I)); \ auto *VPIntrin = cast<VPIntrinsic>(I); \ ASSERT_EQ(VPIntrin->getIntrinsicID(), VPID); \ @@ -197,8 +197,8 @@ TEST_F(VectorBuilderTest, bool IsFP = (#INSTCLASS)[0] == 'F'; \ Type *ValueTy = IsFP ? FloatVecTy : IntVecTy; \ Value *Op = UndefValue::get(ValueTy); \ - auto *I = VBuild.createVectorInstruction(Instruction::OPCODE, ValueTy, \ - {Op, Op}); \ + auto *I = VBuild.createVectorInstructionFromOpcode(Instruction::OPCODE, \ + ValueTy, {Op, Op}); \ ASSERT_TRUE(isa<VPIntrinsic>(I)); \ auto *VPIntrin = cast<VPIntrinsic>(I); \ ASSERT_EQ(VPIntrin->getIntrinsicID(), VPID); \ @@ -227,8 +227,8 @@ TEST_F(VectorBuilderTest, TestCreateLoadStore) { // vp.load auto LoadVPID = VPIntrinsic::getForOpcode(Instruction::Load); - auto *LoadIntrin = VBuild.createVectorInstruction(Instruction::Load, - FloatVecTy, {FloatVecPtr}); + auto *LoadIntrin = VBuild.createVectorInstructionFromOpcode( + Instruction::Load, FloatVecTy, {FloatVecPtr}); ASSERT_TRUE(isa<VPIntrinsic>(LoadIntrin)); auto *VPLoad = cast<VPIntrinsic>(LoadIntrin); ASSERT_EQ(VPLoad->getIntrinsicID(), LoadVPID); @@ -237,8 +237,8 @@ TEST_F(VectorBuilderTest, TestCreateLoadStore) { // vp.store auto *VoidTy = Builder.getVoidTy(); auto StoreVPID = VPIntrinsic::getForOpcode(Instruction::Store); - auto *StoreIntrin = VBuild.createVectorInstruction(Instruction::Store, VoidTy, - {FloatVec, FloatVecPtr}); + auto *StoreIntrin = VBuild.createVectorInstructionFromOpcode( + Instruction::Store, VoidTy, {FloatVec, FloatVecPtr}); ASSERT_TRUE(isa<VPIntrinsic>(LoadIntrin)); auto *VPStore = cast<VPIntrinsic>(StoreIntrin); ASSERT_EQ(VPStore->getIntrinsicID(), StoreVPID); @@ -257,7 +257,8 @@ TEST_F(VectorBuilderTest, TestFail_SilentlyReturnNone) { auto *VoidTy = Builder.getVoidTy(); VectorBuilder VBuild(Builder, VectorBuilder::Behavior::SilentlyReturnNone); VBuild.setMask(Mask).setEVL(EVL); - auto *Val = VBuild.createVectorInstruction(Instruction::Br, VoidTy, {}); + auto *Val = + VBuild.createVectorInstructionFromOpcode(Instruction::Br, VoidTy, {}); ASSERT_EQ(Val, nullptr); } @@ -272,8 +273,11 @@ TEST_F(VectorBuilderTest, TestFail_ReportAndAbort) { auto *VoidTy = Builder.getVoidTy(); VectorBuilder VBuild(Builder, VectorBuilder::Behavior::ReportAndAbort); VBuild.setMask(Mask).setEVL(EVL); - ASSERT_DEATH({ VBuild.createVectorInstruction(Instruction::Br, VoidTy, {}); }, - "No VPIntrinsic for
this opcode"); + ASSERT_DEATH( + { + VBuild.createVectorInstructionFromOpcode(Instruction::Br, VoidTy, {}); + }, + "No VPIntrinsic for this opcode"); } } // end anonymous namespace From f89fe470ebab83be9887c8a794c2b8fecc9413bf Mon Sep 17 00:00:00 2001 From: AinsleySnow Date: Fri, 8 Mar 2024 10:36:59 +0800 Subject: [PATCH 3/4] [VPlan] Fix issue caused by #74761 and D147964 --- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 25 +++++++++++++------ 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 0f4cbe097064..25658b278648 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -275,12 +275,25 @@ Value *VPInstruction::generateInstruction(VPTransformState &State, IRBuilderBase &Builder = State.Builder; Builder.SetCurrentDebugLocation(getDebugLoc()); - if (Instruction::isBinaryOp(getOpcode())) { + unsigned Opc = getOpcode(); + if (Instruction::isBinaryOp(Opc)) { if (Part != 0 && vputils::onlyFirstPartUsed(this)) return State.get(this, 0); Value *A = State.get(getOperand(0), Part); - Value *B = State.get(getOperand(1), Part); + Value *B = nullptr; + + if (UseVectorPredicationIntrinsics && Opc == Instruction::Add) { + // We have the EVL value available to use. + VPValue *VPEVL = getOperand(1); + Value *Step = State.get(VPEVL, 0); + for (unsigned P = 1; P < State.UF; P++) + Step = Builder.CreateAdd(Step, State.get(VPEVL, P)); + + B = Builder.CreateZExtOrTrunc(Step, A->getType()); + } else + B = State.get(getOperand(1), Part); + auto *Res = Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name); if (auto *I = dyn_cast<Instruction>(Res)) @@ -444,16 +457,14 @@ Value *VPInstruction::generateInstruction(VPTransformState &State, Value *Next = nullptr; if (Part == 0) { auto *EVLRecipe = cast<VPEVLPHIRecipe>(getOperand(0)); - Value *StartEVL = State.get(EVLRecipe->getOperand(0), 0); + Value *StartEVL = EVLRecipe->getOperand(0)->getUnderlyingValue(); Value *IVIncrement = State.get(getOperand(1), 0); Next = Builder.CreateSub(StartEVL, IVIncrement, "evl.next"); } else { Next = State.get(this, 0); } - - State.set(this, Next, Part); - break; + return Next; } default: llvm_unreachable("Unsupported opcode for instruction"); @@ -1815,7 +1826,7 @@ void VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent, #endif void VPEVLPHIRecipe::execute(VPTransformState &State) { - Value *StartEVL = State.get(getOperand(0), 0); + Value *StartEVL = getOperand(0)->getUnderlyingValue(); BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); this->Phi = State.Builder.CreatePHI(StartEVL->getType(), 2, "evl.phi"); this->Phi->addIncoming(StartEVL, VectorPH); From f86030c39d8f5216f171123e3106e01e1733679d Mon Sep 17 00:00:00 2001 From: AinsleySnow Date: Fri, 8 Mar 2024 10:49:31 +0800 Subject: [PATCH 4/4] [LV] Update vp intrinsic tests.
--- .../LoopVectorize/RISCV/vp_intrinsics.ll | 54 ++++++++++--------- 1 file changed, 28 insertions(+), 26 deletions(-) diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vp_intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vp_intrinsics.ll index 01d20d24675d..03134f36c6ab 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vp_intrinsics.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vp_intrinsics.ll @@ -26,7 +26,7 @@ define dso_local void @addVec(i64 noundef %N, ptr nocapture noundef writeonly %C ; CHECK-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N]] ; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 2 -; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.umax.i64(i64 12, i64 [[TMP2]]) +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.umax.i64(i64 13, i64 [[TMP2]]) ; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP0]], [[TMP3]] ; CHECK-NEXT: br i1 [[TMP4]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: @@ -41,37 +41,39 @@ define dso_local void @addVec(i64 noundef %N, ptr nocapture noundef writeonly %C ; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]] ; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: +; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 2 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[EVL_PHI:%.*]] = phi i64 [ [[N]], [[VECTOR_PH]] ], [ [[EVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[EVL_PHI_CAST:%.*]] = trunc i64 [[EVL_PHI]] to i32 -; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[EVL_PHI_CAST]] to i64 -; CHECK-NEXT: [[VL:%.*]] = call i64 @llvm.riscv.vsetvli.i64(i64 [[TMP11]], i64 3, i64 1) -; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[VL]] to i32 -; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP13:%.*]] = zext i32 [[EVL_PHI_CAST]] to i64 +; CHECK-NEXT: [[VL:%.*]] = call i64 @llvm.riscv.vsetvli.i64(i64 [[TMP13]], i64 3, i64 1) +; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[VL]] to i32 +; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[INDEX]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = call @llvm.experimental.stepvector.nxv2i64() -; CHECK-NEXT: [[TMP15:%.*]] = add zeroinitializer, [[TMP14]] -; CHECK-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP15]] -; CHECK-NEXT: [[TMP16:%.*]] = extractelement [[VEC_IV]], i32 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP16]], i64 [[N]]) -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds double, ptr [[TMP17]], i32 0 -; CHECK-NEXT: [[VP_LOAD:%.*]] = call @llvm.vp.load.nxv2f64.p0(ptr [[TMP18]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]), !tbaa [[TBAA4:![0-9]+]] -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP16:%.*]] = call @llvm.experimental.stepvector.nxv2i64() +; CHECK-NEXT: [[TMP17:%.*]] = add zeroinitializer, [[TMP16]] +; CHECK-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP17]] +; CHECK-NEXT: [[TMP18:%.*]] = extractelement [[VEC_IV]], i32 0 +; 
CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP18]], i64 [[N]]) +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[TMP15]] ; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds double, ptr [[TMP19]], i32 0 -; CHECK-NEXT: [[VP_LOAD5:%.*]] = call @llvm.vp.load.nxv2f64.p0(ptr [[TMP20]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]), !tbaa [[TBAA4]] -; CHECK-NEXT: [[TMP21:%.*]] = fadd [[VP_LOAD]], [[VP_LOAD5]] -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds double, ptr [[C]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds double, ptr [[TMP22]], i32 0 -; CHECK-NEXT: call void @llvm.vp.store.nxv2f64.p0( [[TMP21]], ptr [[TMP23]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]), !tbaa [[TBAA4]] -; CHECK-NEXT: [[TMP24:%.*]] = zext i32 [[TMP12]] to i64 -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP24]] +; CHECK-NEXT: [[VP_LOAD:%.*]] = call @llvm.vp.load.nxv2f64.p0(ptr [[TMP20]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP14]]), !tbaa [[TBAA4:![0-9]+]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds double, ptr [[TMP21]], i32 0 +; CHECK-NEXT: [[VP_LOAD5:%.*]] = call @llvm.vp.load.nxv2f64.p0(ptr [[TMP22]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP14]]), !tbaa [[TBAA4]] +; CHECK-NEXT: [[TMP23:%.*]] = fadd [[VP_LOAD]], [[VP_LOAD5]] +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds double, ptr [[C]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds double, ptr [[TMP24]], i32 0 +; CHECK-NEXT: call void @llvm.vp.store.nxv2f64.p0( [[TMP23]], ptr [[TMP25]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP14]]), !tbaa [[TBAA4]] +; CHECK-NEXT: [[TMP26:%.*]] = zext i32 [[TMP14]] to i64 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP26]] ; CHECK-NEXT: [[EVL_NEXT]] = sub i64 [[N]], [[INDEX_NEXT]] -; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -80,10 +82,10 @@ define dso_local void @addVec(i64 noundef %N, ptr nocapture noundef writeonly %C ; CHECK: for.body: ; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[I_08]] -; CHECK-NEXT: [[TMP26:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: [[TMP28:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA4]] ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[I_08]] -; CHECK-NEXT: [[TMP27:%.*]] = load double, ptr [[ARRAYIDX1]], align 8, !tbaa [[TBAA4]] -; CHECK-NEXT: [[ADD:%.*]] = fadd double [[TMP26]], [[TMP27]] +; CHECK-NEXT: [[TMP29:%.*]] = load double, ptr [[ARRAYIDX1]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: [[ADD:%.*]] = fadd double [[TMP28]], [[TMP29]] ; CHECK-NEXT: 
[[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[C]], i64 [[I_08]] ; CHECK-NEXT: store double [[ADD]], ptr [[ARRAYIDX2]], align 8, !tbaa [[TBAA4]] ; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1