diff --git a/docs/LangRef.rst b/docs/LangRef.rst
index c7ea1c1bf236..cf1ceab1f1c6 100644
--- a/docs/LangRef.rst
+++ b/docs/LangRef.rst
@@ -1289,9 +1289,9 @@ example:
     that are recognized by LLVM to handle asynchronous exceptions, such as
     SEH, will still provide their implementation defined semantics.
 ``optnone``
-    This function attribute indicates that the function is not optimized
-    by any optimization or code generator passes with the
-    exception of interprocedural optimization passes.
+    This function attribute indicates that most optimization passes will skip
+    this function, with the exception of interprocedural optimization passes.
+    Code generation defaults to the "fast" instruction selector.
     This attribute cannot be used together with the ``alwaysinline``
     attribute; this attribute is also incompatible
     with the ``minsize`` attribute and the ``optsize`` attribute.
@@ -9337,6 +9337,48 @@ Semantics:
 
 See the description for :ref:`llvm.stacksave <int_stacksave>`.
 
+.. _int_get_dynamic_area_offset:
+
+'``llvm.get.dynamic.area.offset``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare i32 @llvm.get.dynamic.area.offset.i32()
+      declare i64 @llvm.get.dynamic.area.offset.i64()
+
+Overview:
+"""""""""
+
+The '``llvm.get.dynamic.area.offset.*``' intrinsic family is used to
+get the offset from the native stack pointer to the address of the most
+recent dynamic alloca on the caller's stack. These intrinsics are
+intended for use in combination with
+:ref:`llvm.stacksave <int_stacksave>` to get a
+pointer to the most recent dynamic alloca. This is useful, for example,
+for AddressSanitizer's stack unpoisoning routines.
+
+Semantics:
+""""""""""
+
+These intrinsics return a non-negative integer value that can be used to
+get the address of the most recent dynamic alloca, allocated by :ref:`alloca <i_alloca>`
+on the caller's stack. In particular, for targets where the stack grows downwards,
+adding this offset to the native stack pointer gives the address of the most
+recent dynamic alloca. For targets where the stack grows upwards, the situation is a bit more
+complicated, because subtracting this value from the stack pointer gives the address
+one past the end of the most recent dynamic alloca.
+
+Although for most targets `llvm.get.dynamic.area.offset <int_get_dynamic_area_offset>`
+returns just a zero, for others, such as PowerPC and PowerPC64, it returns a
+compile-time-known constant value.
+
+The return value type of :ref:`llvm.get.dynamic.area.offset <int_get_dynamic_area_offset>`
+must match the target's generic address space's (address space 0) pointer type.
+
 '``llvm.prefetch``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
diff --git a/include/llvm/ADT/STLExtras.h b/include/llvm/ADT/STLExtras.h
index 1bd3b291e0ef..d4360fa8d218 100644
--- a/include/llvm/ADT/STLExtras.h
+++ b/include/llvm/ADT/STLExtras.h
@@ -371,6 +371,21 @@ bool all_of(R &&Range, UnaryPredicate &&P) {
                      std::forward<UnaryPredicate>(P));
 }
 
+/// Provide wrappers to std::any_of which take ranges instead of having to pass
+/// begin/end explicitly.
+template <typename R, typename UnaryPredicate>
+bool any_of(R &&Range, UnaryPredicate &&P) {
+  return std::any_of(Range.begin(), Range.end(),
+                     std::forward<UnaryPredicate>(P));
+}
+
+/// Provide wrappers to std::find which take ranges instead of having to pass
+/// begin/end explicitly.
+template +auto find(R &&Range, const T &val) -> decltype(Range.begin()) { + return std::find(Range.begin(), Range.end(), val); +} + //===----------------------------------------------------------------------===// // Extra additions to //===----------------------------------------------------------------------===// diff --git a/include/llvm/ADT/StringMap.h b/include/llvm/ADT/StringMap.h index 194235fac570..700bb9e10ef7 100644 --- a/include/llvm/ADT/StringMap.h +++ b/include/llvm/ADT/StringMap.h @@ -232,6 +232,13 @@ class StringMap : public StringMapImpl { : StringMapImpl(InitialSize, static_cast(sizeof(MapEntryTy))), Allocator(A) {} + StringMap(std::initializer_list> List) + : StringMapImpl(static_cast(sizeof(MapEntryTy))) { + for (const auto &P : List) { + insert(P); + } + } + StringMap(StringMap &&RHS) : StringMapImpl(std::move(RHS)), Allocator(std::move(RHS.Allocator)) {} diff --git a/include/llvm/ADT/StringSet.h b/include/llvm/ADT/StringSet.h index 3e0cc200b6dd..08626dc7af84 100644 --- a/include/llvm/ADT/StringSet.h +++ b/include/llvm/ADT/StringSet.h @@ -23,6 +23,11 @@ namespace llvm { class StringSet : public llvm::StringMap { typedef llvm::StringMap base; public: + StringSet() = default; + StringSet(std::initializer_list S) { + for (StringRef X : S) + insert(X); + } std::pair insert(StringRef Key) { assert(!Key.empty()); diff --git a/include/llvm/ADT/Triple.h b/include/llvm/ADT/Triple.h index e50cec1f5e80..e01db0a61fd5 100644 --- a/include/llvm/ADT/Triple.h +++ b/include/llvm/ADT/Triple.h @@ -93,6 +93,7 @@ class Triple { enum SubArchType { NoSubArch, + ARMSubArch_v8_2a, ARMSubArch_v8_1a, ARMSubArch_v8, ARMSubArch_v7, diff --git a/include/llvm/Analysis/BranchProbabilityInfo.h b/include/llvm/Analysis/BranchProbabilityInfo.h index 89dec14b2b3e..69dae5e90785 100644 --- a/include/llvm/Analysis/BranchProbabilityInfo.h +++ b/include/llvm/Analysis/BranchProbabilityInfo.h @@ -61,6 +61,9 @@ class BranchProbabilityInfo { BranchProbability getEdgeProbability(const BasicBlock *Src, const BasicBlock *Dst) const; + BranchProbability getEdgeProbability(const BasicBlock *Src, + succ_const_iterator Dst) const; + /// \brief Test if an edge is hot relative to other out-edges of the Src. /// /// Check whether this edge out of the source block is 'hot'. We define hot diff --git a/include/llvm/Analysis/EHPersonalities.h b/include/llvm/Analysis/EHPersonalities.h new file mode 100644 index 000000000000..4a56728fbb4a --- /dev/null +++ b/include/llvm/Analysis/EHPersonalities.h @@ -0,0 +1,83 @@ +//===- EHPersonalities.h - Compute EH-related information -----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_ANALYSIS_EHPERSONALITIES_H +#define LLVM_ANALYSIS_EHPERSONALITIES_H + +#include "llvm/Support/ErrorHandling.h" + +namespace llvm { +class Function; +class Value; + +enum class EHPersonality { + Unknown, + GNU_Ada, + GNU_C, + GNU_CXX, + GNU_ObjC, + MSVC_X86SEH, + MSVC_Win64SEH, + MSVC_CXX, + CoreCLR +}; + +/// \brief See if the given exception handling personality function is one +/// that we understand. If so, return a description of it; otherwise return +/// Unknown. +EHPersonality classifyEHPersonality(const Value *Pers); + +/// \brief Returns true if this personality function catches asynchronous +/// exceptions. 
+inline bool isAsynchronousEHPersonality(EHPersonality Pers) { + // The two SEH personality functions can catch asynch exceptions. We assume + // unknown personalities don't catch asynch exceptions. + switch (Pers) { + case EHPersonality::MSVC_X86SEH: + case EHPersonality::MSVC_Win64SEH: + return true; + default: + return false; + } + llvm_unreachable("invalid enum"); +} + +/// \brief Returns true if this is a personality function that invokes +/// handler funclets (which must return to it). +inline bool isFuncletEHPersonality(EHPersonality Pers) { + switch (Pers) { + case EHPersonality::MSVC_CXX: + case EHPersonality::MSVC_X86SEH: + case EHPersonality::MSVC_Win64SEH: + case EHPersonality::CoreCLR: + return true; + default: + return false; + } + llvm_unreachable("invalid enum"); +} + +/// \brief Return true if this personality may be safely removed if there +/// are no invoke instructions remaining in the current function. +inline bool isNoOpWithoutInvoke(EHPersonality Pers) { + switch (Pers) { + case EHPersonality::Unknown: + return false; + // All known personalities currently have this behavior + default: + return true; + } + llvm_unreachable("invalid enum"); +} + +bool canSimplifyInvokeNoUnwind(const Function *F); + +} // end namespace llvm + +#endif diff --git a/include/llvm/Analysis/LibCallSemantics.h b/include/llvm/Analysis/LibCallSemantics.h deleted file mode 100644 index 14ecb55f340b..000000000000 --- a/include/llvm/Analysis/LibCallSemantics.h +++ /dev/null @@ -1,84 +0,0 @@ -//===- LibCallSemantics.h - Describe library semantics --------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file defines interfaces that can be used to describe language specific -// runtime library interfaces (e.g. libc, libm, etc) to LLVM optimizers. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_ANALYSIS_LIBCALLSEMANTICS_H -#define LLVM_ANALYSIS_LIBCALLSEMANTICS_H - -#include "llvm/Analysis/AliasAnalysis.h" - -namespace llvm { -class InvokeInst; - - enum class EHPersonality { - Unknown, - GNU_Ada, - GNU_C, - GNU_CXX, - GNU_ObjC, - MSVC_X86SEH, - MSVC_Win64SEH, - MSVC_CXX, - CoreCLR - }; - - /// \brief See if the given exception handling personality function is one - /// that we understand. If so, return a description of it; otherwise return - /// Unknown. - EHPersonality classifyEHPersonality(const Value *Pers); - - /// \brief Returns true if this personality function catches asynchronous - /// exceptions. - inline bool isAsynchronousEHPersonality(EHPersonality Pers) { - // The two SEH personality functions can catch asynch exceptions. We assume - // unknown personalities don't catch asynch exceptions. - switch (Pers) { - case EHPersonality::MSVC_X86SEH: - case EHPersonality::MSVC_Win64SEH: - return true; - default: return false; - } - llvm_unreachable("invalid enum"); - } - - /// \brief Returns true if this is a personality function that invokes - /// handler funclets (which must return to it). 
- inline bool isFuncletEHPersonality(EHPersonality Pers) { - switch (Pers) { - case EHPersonality::MSVC_CXX: - case EHPersonality::MSVC_X86SEH: - case EHPersonality::MSVC_Win64SEH: - case EHPersonality::CoreCLR: - return true; - default: return false; - } - llvm_unreachable("invalid enum"); - } - - /// \brief Return true if this personality may be safely removed if there - /// are no invoke instructions remaining in the current function. - inline bool isNoOpWithoutInvoke(EHPersonality Pers) { - switch (Pers) { - case EHPersonality::Unknown: - return false; - // All known personalities currently have this behavior - default: return true; - } - llvm_unreachable("invalid enum"); - } - - bool canSimplifyInvokeNoUnwind(const Function *F); - -} // end namespace llvm - -#endif diff --git a/include/llvm/Analysis/ScalarEvolution.h b/include/llvm/Analysis/ScalarEvolution.h index da8c68aa6832..f674cc7ee56f 100644 --- a/include/llvm/Analysis/ScalarEvolution.h +++ b/include/llvm/Analysis/ScalarEvolution.h @@ -183,7 +183,7 @@ namespace llvm { protected: SCEVPredicateKind Kind; - ~SCEVPredicate() = default; + virtual ~SCEVPredicate(); SCEVPredicate(const SCEVPredicate&) = default; SCEVPredicate &operator=(const SCEVPredicate&) = default; diff --git a/include/llvm/Analysis/TargetTransformInfoImpl.h b/include/llvm/Analysis/TargetTransformInfoImpl.h index 415a85e99069..eedf1a61ba82 100644 --- a/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -22,6 +22,7 @@ #include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/Operator.h" #include "llvm/IR/Type.h" +#include "llvm/Analysis/VectorUtils.h" namespace llvm { @@ -415,21 +416,28 @@ class TargetTransformInfoImplCRTPBase : public TargetTransformInfoImplBase { (Ptr == nullptr ? 0 : Ptr->getType()->getPointerAddressSpace()); auto GTI = gep_type_begin(PointerType::get(PointeeType, AS), Operands); for (auto I = Operands.begin(); I != Operands.end(); ++I, ++GTI) { + // We assume that the cost of Scalar GEP with constant index and the + // cost of Vector GEP with splat constant index are the same. + const ConstantInt *ConstIdx = dyn_cast(*I); + if (!ConstIdx) + if (auto Splat = getSplatValue(*I)) + ConstIdx = dyn_cast(Splat); if (isa(*GTI)) { int64_t ElementSize = DL.getTypeAllocSize(GTI.getIndexedType()); - if (const ConstantInt *ConstIdx = dyn_cast(*I)) { + if (ConstIdx) BaseOffset += ConstIdx->getSExtValue() * ElementSize; - } else { + else { // Needs scale register. - if (Scale != 0) { + if (Scale != 0) // No addressing mode takes two scale registers. return TTI::TCC_Basic; - } Scale = ElementSize; } } else { StructType *STy = cast(*GTI); - uint64_t Field = cast(*I)->getZExtValue(); + // For structures the index is always splat or scalar constant + assert(ConstIdx && "Unexpected GEP index"); + uint64_t Field = ConstIdx->getZExtValue(); BaseOffset += DL.getStructLayout(STy)->getElementOffset(Field); } } diff --git a/include/llvm/Analysis/VectorUtils.h b/include/llvm/Analysis/VectorUtils.h index 48ef76a9c8da..531803adf5e4 100644 --- a/include/llvm/Analysis/VectorUtils.h +++ b/include/llvm/Analysis/VectorUtils.h @@ -86,7 +86,7 @@ Value *findScalarElement(Value *V, unsigned EltNo); /// \brief Get splat value if the input is a splat vector or return nullptr. /// The value may be extracted from a splat constants vector or from /// a sequence of instructions that broadcast a single value into a vector. 
-Value *getSplatValue(Value *V); +const Value *getSplatValue(const Value *V); /// \brief Compute a map of integer instructions to their minimum legal type /// size. diff --git a/include/llvm/Bitcode/ReaderWriter.h b/include/llvm/Bitcode/ReaderWriter.h index 1c08ded1656a..3e127290f378 100644 --- a/include/llvm/Bitcode/ReaderWriter.h +++ b/include/llvm/Bitcode/ReaderWriter.h @@ -71,15 +71,13 @@ namespace llvm { DiagnosticHandlerFunction DiagnosticHandler); /// Parse the specified bitcode buffer, returning the function info index. - /// If ExportingModule is true, check for functions in the index from this - /// module when the combined index is built during parsing and set flag. /// If IsLazy is true, parse the entire function summary into /// the index. Otherwise skip the function summary section, and only create /// an index object with a map from function name to function summary offset. /// The index is used to perform lazy function summary reading later. ErrorOr> getFunctionInfoIndex( MemoryBufferRef Buffer, DiagnosticHandlerFunction DiagnosticHandler, - const Module *ExportingModule = nullptr, bool IsLazy = false); + bool IsLazy = false); /// This method supports lazy reading of function summary data from the /// combined index during function importing. When reading the combined index diff --git a/include/llvm/CodeGen/ISDOpcodes.h b/include/llvm/CodeGen/ISDOpcodes.h index aaf08e14f57d..4be993a9fbbb 100644 --- a/include/llvm/CodeGen/ISDOpcodes.h +++ b/include/llvm/CodeGen/ISDOpcodes.h @@ -754,6 +754,12 @@ namespace ISD { GC_TRANSITION_START, GC_TRANSITION_END, + /// GET_DYNAMIC_AREA_OFFSET - get offset from native SP to the address of + /// the most recent dynamic alloca. For most targets that would be 0, but + /// for some others (e.g. PowerPC, PowerPC64) that would be compile-time + /// known nonzero constant. The only operand here is the chain. + GET_DYNAMIC_AREA_OFFSET, + /// BUILTIN_OP_END - This must be the last enum value in this list. /// The target-specific pre-isel opcode values start here. BUILTIN_OP_END diff --git a/include/llvm/CodeGen/MachineBasicBlock.h b/include/llvm/CodeGen/MachineBasicBlock.h index a2b1a850ec76..ac87f4f901f5 100644 --- a/include/llvm/CodeGen/MachineBasicBlock.h +++ b/include/llvm/CodeGen/MachineBasicBlock.h @@ -91,13 +91,6 @@ class MachineBasicBlock std::vector Predecessors; std::vector Successors; - /// Keep track of the weights to the successors. This vector has the same - /// order as Successors, or it is empty if we don't use it (disable - /// optimization). - std::vector Weights; - typedef std::vector::iterator weight_iterator; - typedef std::vector::const_iterator const_weight_iterator; - /// Keep track of the probabilities to the successors. This vector has the /// same order as Successors, or it is empty if we don't use it (disable /// optimization). @@ -440,26 +433,16 @@ class MachineBasicBlock // Machine-CFG mutators - /// Add Succ as a successor of this MachineBasicBlock. The Predecessors list - /// of Succ is automatically updated. WEIGHT parameter is stored in Weights - /// list and it may be used by MachineBranchProbabilityInfo analysis to - /// calculate branch probability. - /// - /// Note that duplicate Machine CFG edges are not allowed. - void addSuccessor(MachineBasicBlock *Succ, uint32_t Weight = 0); - - /// Add Succ as a successor of this MachineBasicBlock. The Predecessors list - /// of Succ is automatically updated. The weight is not provided because BPI - /// is not available (e.g. 
-O0 is used), in which case edge weights won't be - /// used. Using this interface can save some space. - void addSuccessorWithoutWeight(MachineBasicBlock *Succ); - /// Add Succ as a successor of this MachineBasicBlock. The Predecessors list /// of Succ is automatically updated. PROB parameter is stored in - /// Probabilities list. + /// Probabilities list. The default probability is set as unknown. Mixing + /// known and unknown probabilities in successor list is not allowed. When all + /// successors have unknown probabilities, 1 / N is returned as the + /// probability for each successor, where N is the number of successors. /// /// Note that duplicate Machine CFG edges are not allowed. - void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob); + void addSuccessor(MachineBasicBlock *Succ, + BranchProbability Prob = BranchProbability::getUnknown()); /// Add Succ as a successor of this MachineBasicBlock. The Predecessors list /// of Succ is automatically updated. The probability is not provided because @@ -467,9 +450,6 @@ class MachineBasicBlock /// won't be used. Using this interface can save some space. void addSuccessorWithoutProb(MachineBasicBlock *Succ); - /// Set successor weight of a given iterator. - void setSuccWeight(succ_iterator I, uint32_t Weight); - /// Set successor probability of a given iterator. void setSuccProbability(succ_iterator I, BranchProbability Prob); @@ -488,7 +468,7 @@ class MachineBasicBlock /// Return the iterator to the element after the one removed. succ_iterator removeSuccessor(succ_iterator I); - /// Replace successor OLD with NEW and update weight info. + /// Replace successor OLD with NEW and update probability info. void replaceSuccessor(MachineBasicBlock *Old, MachineBasicBlock *New); /// Transfers all the successors from MBB to this machine basic block (i.e., @@ -500,9 +480,6 @@ class MachineBasicBlock /// operands in the successor blocks which refer to FromMBB to refer to this. void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB); - /// Return true if any of the successors have weights attached to them. - bool hasSuccessorWeights() const { return !Weights.empty(); } - /// Return true if any of the successors have probabilities attached to them. bool hasSuccessorProbabilities() const { return !Probs.empty(); } @@ -759,10 +736,6 @@ class MachineBasicBlock private: - /// Return weight iterator corresponding to the I successor iterator. - weight_iterator getWeightIterator(succ_iterator I); - const_weight_iterator getWeightIterator(const_succ_iterator I) const; - /// Return probability iterator corresponding to the I successor iterator. probability_iterator getProbabilityIterator(succ_iterator I); const_probability_iterator @@ -771,11 +744,6 @@ class MachineBasicBlock friend class MachineBranchProbabilityInfo; friend class MIPrinter; - /// Return weight of the edge from this block to MBB. This method should NOT - /// be called directly, but by using getEdgeWeight method from - /// MachineBranchProbabilityInfo class. - uint32_t getSuccWeight(const_succ_iterator Succ) const; - /// Return probability of the edge from this block to MBB. This method should /// NOT be called directly, but by using getEdgeProbability method from /// MachineBranchProbabilityInfo class. 
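A minimal sketch of how a pass might use the reworked addSuccessor() interface above, assuming a two-way conditional branch; the helper name, the blocks, and the 3/4 probability are illustrative and not part of the patch:

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/Support/BranchProbability.h"
using namespace llvm;

// Attach both outgoing edges of a conditional branch. Either every edge gets
// an explicit probability, or none does: mixing known and unknown
// probabilities in the successor list is not allowed.
void addCondBrSuccessors(MachineBasicBlock &MBB, MachineBasicBlock &Taken,
                         MachineBasicBlock &Fallthrough, bool HaveEstimate) {
  if (HaveEstimate) {
    BranchProbability TakenProb(3, 4); // e.g. branch assumed 75% taken
    MBB.addSuccessor(&Taken, TakenProb);
    MBB.addSuccessor(&Fallthrough, TakenProb.getCompl());
  } else {
    // The default argument leaves the probabilities unknown; queries then
    // report 1 / N for each of the N successors.
    MBB.addSuccessor(&Taken);
    MBB.addSuccessor(&Fallthrough);
  }
}

Keeping the two edges consistent (both known or both unknown) is what lets the probability list stay either fully populated or empty, mirroring the old weight-list behaviour without storing raw weights.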
diff --git a/include/llvm/CodeGen/MachineBranchProbabilityInfo.h b/include/llvm/CodeGen/MachineBranchProbabilityInfo.h index 058ab32f3aa9..608e8d257874 100644 --- a/include/llvm/CodeGen/MachineBranchProbabilityInfo.h +++ b/include/llvm/CodeGen/MachineBranchProbabilityInfo.h @@ -55,10 +55,15 @@ class MachineBranchProbabilityInfo : public ImmutablePass { uint32_t getEdgeWeight(const MachineBasicBlock *Src, MachineBasicBlock::const_succ_iterator Dst) const; - // Get sum of the block successors' weights, potentially scaling them to fit - // within 32-bits. If scaling is required, sets Scale based on the necessary - // adjustment. Any edge weights used with the sum should be divided by Scale. - uint32_t getSumForBlock(const MachineBasicBlock *MBB, uint32_t &Scale) const; + // Return edge probability. + BranchProbability getEdgeProbability(const MachineBasicBlock *Src, + const MachineBasicBlock *Dst) const; + + // Same as above, but using a const_succ_iterator from Src. This is faster + // when the iterator is already available. + BranchProbability + getEdgeProbability(const MachineBasicBlock *Src, + MachineBasicBlock::const_succ_iterator Dst) const; // A 'Hot' edge is an edge which probability is >= 80%. bool isEdgeHot(const MachineBasicBlock *Src, @@ -68,15 +73,6 @@ class MachineBranchProbabilityInfo : public ImmutablePass { // NB: This routine's complexity is linear on the number of successors. MachineBasicBlock *getHotSucc(MachineBasicBlock *MBB) const; - // Return a probability as a fraction between 0 (0% probability) and - // 1 (100% probability), however the value is never equal to 0, and can be 1 - // only iff SRC block has only one successor. - // NB: This routine's complexity is linear on the number of successors of - // Src. Querying sequentially for each successor's probability is a quadratic - // query pattern. - BranchProbability getEdgeProbability(const MachineBasicBlock *Src, - const MachineBasicBlock *Dst) const; - // Print value between 0 (0% probability) and 1 (100% probability), // however the value is never equal to 0, and can be 1 only iff SRC block // has only one successor. diff --git a/include/llvm/CodeGen/MachineModuleInfo.h b/include/llvm/CodeGen/MachineModuleInfo.h index fd42b46476c5..43b9f5203c50 100644 --- a/include/llvm/CodeGen/MachineModuleInfo.h +++ b/include/llvm/CodeGen/MachineModuleInfo.h @@ -35,7 +35,7 @@ #include "llvm/ADT/PointerIntPair.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/Analysis/LibCallSemantics.h" +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/ValueHandle.h" diff --git a/include/llvm/CodeGen/MachineRegisterInfo.h b/include/llvm/CodeGen/MachineRegisterInfo.h index cbfd8a37eaa6..0a1f62006327 100644 --- a/include/llvm/CodeGen/MachineRegisterInfo.h +++ b/include/llvm/CodeGen/MachineRegisterInfo.h @@ -73,7 +73,7 @@ class MachineRegisterInfo { /// PhysRegUseDefLists - This is an array of the head of the use/def list for /// physical registers. - std::vector PhysRegUseDefLists; + std::unique_ptr PhysRegUseDefLists; /// getRegUseDefListHead - Return the head pointer for the register use/def /// list for the specified virtual or physical register. 
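To illustrate the iterator-based getEdgeProbability() overload added above, here is a rough sketch, loosely modelled on what getHotSucc() does; the function name and selection policy are made up for illustration:

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
#include "llvm/Support/BranchProbability.h"
using namespace llvm;

// Walk MBB's successors once, querying each edge through the successor
// iterator so the analysis does not have to re-search the successor list
// for every query.
static MachineBasicBlock *
findLikeliestSuccessor(const MachineBasicBlock &MBB,
                       const MachineBranchProbabilityInfo &MBPI) {
  MachineBasicBlock *Best = nullptr;
  BranchProbability BestProb = BranchProbability::getZero();
  for (auto I = MBB.succ_begin(), E = MBB.succ_end(); I != E; ++I) {
    BranchProbability Prob = MBPI.getEdgeProbability(&MBB, I);
    if (!Best || BestProb < Prob) {
      Best = *I;
      BestProb = Prob;
    }
  }
  return Best;
}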
diff --git a/include/llvm/CodeGen/RegisterPressure.h b/include/llvm/CodeGen/RegisterPressure.h index 166bd8686891..e296701d8e8c 100644 --- a/include/llvm/CodeGen/RegisterPressure.h +++ b/include/llvm/CodeGen/RegisterPressure.h @@ -328,16 +328,12 @@ class RegPressureTracker { // position changes while pressure does not. void setPos(MachineBasicBlock::const_iterator Pos) { CurrPos = Pos; } - /// \brief Get the SlotIndex for the first nondebug instruction including or - /// after the current position. - SlotIndex getCurrSlot() const; - /// Recede across the previous instruction. - bool recede(SmallVectorImpl *LiveUses = nullptr, + void recede(SmallVectorImpl *LiveUses = nullptr, PressureDiff *PDiff = nullptr); /// Advance across the current instruction. - bool advance(); + void advance(); /// Finalize the region boundaries and recored live ins and live outs. void closeRegion(); @@ -354,8 +350,7 @@ class RegPressureTracker { ArrayRef getLiveThru() const { return LiveThruPressure; } /// Get the resulting register pressure over the traversed region. - /// This result is complete if either advance() or recede() has returned true, - /// or if closeRegion() was explicitly invoked. + /// This result is complete if closeRegion() was explicitly invoked. RegisterPressure &getPressure() { return P; } const RegisterPressure &getPressure() const { return P; } @@ -365,9 +360,6 @@ class RegPressureTracker { return CurrSetPressure; } - void discoverLiveOut(unsigned Reg); - void discoverLiveIn(unsigned Reg); - bool isTopClosed() const; bool isBottomClosed() const; @@ -442,6 +434,13 @@ class RegPressureTracker { void dump() const; protected: + void discoverLiveOut(unsigned Reg); + void discoverLiveIn(unsigned Reg); + + /// \brief Get the SlotIndex for the first nondebug instruction including or + /// after the current position. + SlotIndex getCurrSlot() const; + const LiveRange *getLiveRange(unsigned Reg) const; void increaseRegPressure(ArrayRef Regs); diff --git a/include/llvm/CodeGen/RegisterScavenging.h b/include/llvm/CodeGen/RegisterScavenging.h index 6517a58f5c07..122c78534253 100644 --- a/include/llvm/CodeGen/RegisterScavenging.h +++ b/include/llvm/CodeGen/RegisterScavenging.h @@ -74,10 +74,6 @@ class RegScavenger { /// Start tracking liveness from the begin of the specific basic block. void enterBasicBlock(MachineBasicBlock *mbb); - /// Allow resetting register state info for multiple - /// passes over/within the same function. - void initRegState(); - /// Move the internal MBB iterator and update register states. void forward(); @@ -180,6 +176,9 @@ class RegScavenger { unsigned InstrLimit, MachineBasicBlock::iterator &UseMI); + /// Allow resetting register state info for multiple + /// passes over/within the same function. 
+ void initRegState(); }; } // End llvm namespace diff --git a/include/llvm/CodeGen/WinEHFuncInfo.h b/include/llvm/CodeGen/WinEHFuncInfo.h index 5def70692ba5..5e8bb56eb617 100644 --- a/include/llvm/CodeGen/WinEHFuncInfo.h +++ b/include/llvm/CodeGen/WinEHFuncInfo.h @@ -22,6 +22,7 @@ namespace llvm { class AllocaInst; class BasicBlock; +class CatchReturnInst; class Constant; class Function; class GlobalVariable; diff --git a/include/llvm/DebugInfo/DWARF/DWARFUnitIndex.h b/include/llvm/DebugInfo/DWARF/DWARFUnitIndex.h index 0d49d7c0cf82..a85c2f9f0a23 100644 --- a/include/llvm/DebugInfo/DWARF/DWARFUnitIndex.h +++ b/include/llvm/DebugInfo/DWARF/DWARFUnitIndex.h @@ -41,12 +41,15 @@ class DWARFUnitIndex { public: class Entry { - const DWARFUnitIndex *Index; - uint64_t Signature; + public: struct SectionContribution { uint32_t Offset; uint32_t Length; }; + + private: + const DWARFUnitIndex *Index; + uint64_t Signature; std::unique_ptr Contributions; friend class DWARFUnitIndex; @@ -55,16 +58,21 @@ class DWARFUnitIndex { const SectionContribution *getOffset() const; }; +private: struct Header Header; + DWARFSectionKind InfoColumnKind; int InfoColumn = -1; std::unique_ptr ColumnKinds; std::unique_ptr Rows; static StringRef getColumnHeader(DWARFSectionKind DS); + bool parseImpl(DataExtractor IndexData); public: bool parse(DataExtractor IndexData); + DWARFUnitIndex(DWARFSectionKind InfoColumnKind) + : InfoColumnKind(InfoColumnKind) {} void dump(raw_ostream &OS) const; const Entry *getFromOffset(uint32_t Offset) const; }; diff --git a/include/llvm/IR/Attributes.h b/include/llvm/IR/Attributes.h index 52a9ca83013b..286a6dae37d8 100644 --- a/include/llvm/IR/Attributes.h +++ b/include/llvm/IR/Attributes.h @@ -227,6 +227,11 @@ class AttributeSet { AttributeSet addAttribute(LLVMContext &C, unsigned Index, StringRef Kind, StringRef Value) const; + /// Add an attribute to the attribute set at the given indices. Because + /// attribute sets are immutable, this returns a new set. + AttributeSet addAttribute(LLVMContext &C, ArrayRef Indices, + Attribute A) const; + /// \brief Add attributes to the attribute set at the given index. Because /// attribute sets are immutable, this returns a new set. AttributeSet addAttributes(LLVMContext &C, unsigned Index, diff --git a/include/llvm/IR/FunctionInfo.h b/include/llvm/IR/FunctionInfo.h index b8801693ab51..eba088a61bc0 100644 --- a/include/llvm/IR/FunctionInfo.h +++ b/include/llvm/IR/FunctionInfo.h @@ -165,19 +165,8 @@ class FunctionInfoIndex { /// Holds strings for combined index, mapping to the corresponding module ID. ModulePathStringTableTy ModulePathStringTable; - /// The main module being compiled, that we are importing into, if applicable. - /// Used to check if any of its functions are in the index and therefore - /// potentially exported. - const Module *ExportingModule; - - /// Flag indicating whether the exporting module has any functions in the - /// index and therefore potentially exported (imported into another module). - bool HasExportedFunctions; - public: - FunctionInfoIndex(const Module *M = nullptr) - : ExportingModule(M), HasExportedFunctions(false){}; - ~FunctionInfoIndex() = default; + FunctionInfoIndex() = default; // Disable the copy constructor and assignment operators, so // no unexpected copying/moving occurs. @@ -201,14 +190,6 @@ class FunctionInfoIndex { /// Add a function info for a function of the given name. 
void addFunctionInfo(StringRef FuncName, std::unique_ptr Info) { - // Update the HasExportedFunctions flag, but only if we had a function - // summary (i.e. we aren't parsing them lazily or have a bitcode file - // without a function summary section). - if (ExportingModule && Info->functionSummary()) { - if (ExportingModule->getModuleIdentifier() == - Info->functionSummary()->modulePath()) - HasExportedFunctions = true; - } FunctionMap[FuncName].push_back(std::move(Info)); } @@ -248,11 +229,10 @@ class FunctionInfoIndex { } /// Check if the given Module has any functions available for exporting - /// in the index. - bool hasExportedFunctions(const Module *M) const { - assert(M == ExportingModule && - "Checking for exported functions on unexpected module"); - return HasExportedFunctions; + /// in the index. We consider any module present in the ModulePathStringTable + /// to have exported functions. + bool hasExportedFunctions(const Module &M) const { + return ModulePathStringTable.count(M.getModuleIdentifier()); } }; diff --git a/include/llvm/IR/Intrinsics.td b/include/llvm/IR/Intrinsics.td index 917cf56e2e88..e838fb332de9 100644 --- a/include/llvm/IR/Intrinsics.td +++ b/include/llvm/IR/Intrinsics.td @@ -304,6 +304,8 @@ def int_stacksave : Intrinsic<[llvm_ptr_ty]>, def int_stackrestore : Intrinsic<[], [llvm_ptr_ty]>, GCCBuiltin<"__builtin_stack_restore">; +def int_get_dynamic_area_offset : Intrinsic<[llvm_anyint_ty]>; + // IntrReadWriteArgMem is more pessimistic than strictly necessary for prefetch, // however it does conveniently prevent the prefetch from being reordered // with respect to nearby accesses to the same memory. diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td index 7ec0f4b86f11..57ad278a68bd 100644 --- a/include/llvm/IR/IntrinsicsX86.td +++ b/include/llvm/IR/IntrinsicsX86.td @@ -6304,6 +6304,12 @@ let TargetPrefix = "x86" in { // Compares let TargetPrefix = "x86" in { // 512-bit + def int_x86_avx512_vcomi_sd : GCCBuiltin<"__builtin_ia32_vcomisd">, + Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty, + llvm_v2f64_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_x86_avx512_vcomi_ss : GCCBuiltin<"__builtin_ia32_vcomiss">, + Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty, + llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_x86_avx512_mask_pcmpeq_b_512 : GCCBuiltin<"__builtin_ia32_pcmpeqb512_mask">, Intrinsic<[llvm_i64_ty], [llvm_v64i8_ty, llvm_v64i8_ty, llvm_i64_ty], [IntrNoMem]>; diff --git a/include/llvm/LTO/LTOCodeGenerator.h b/include/llvm/LTO/LTOCodeGenerator.h index 0d3c79bf5e84..c322288a1ae9 100644 --- a/include/llvm/LTO/LTOCodeGenerator.h +++ b/include/llvm/LTO/LTOCodeGenerator.h @@ -39,7 +39,6 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/StringMap.h" -#include "llvm/Linker/Linker.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" #include @@ -49,6 +48,7 @@ namespace llvm { class LLVMContext; class DiagnosticInfo; class GlobalValue; + class Linker; class Mangler; class MemoryBuffer; class TargetLibraryInfo; @@ -171,7 +171,7 @@ struct LTOCodeGenerator { std::unique_ptr OwnedContext; LLVMContext &Context; std::unique_ptr MergedModule; - Linker IRLinker; + std::unique_ptr IRLinker; std::unique_ptr TargetMach; bool EmitDwarfDebugInfo = false; bool ScopeRestrictionsDone = false; diff --git a/include/llvm/Linker/Linker.h b/include/llvm/Linker/Linker.h index 610b1ddf9893..0c7dc910a65c 100644 --- a/include/llvm/Linker/Linker.h +++ b/include/llvm/Linker/Linker.h @@ -68,36 
+68,33 @@ class Linker { InternalizeLinkedSymbols = (1 << 2) }; - Linker(Module *M, DiagnosticHandlerFunction DiagnosticHandler); - Linker(Module *M); + Linker(Module &M, DiagnosticHandlerFunction DiagnosticHandler); + Linker(Module &M); - Module *getModule() const { return Composite; } - void deleteModule(); + Module &getModule() const { return Composite; } /// \brief Link \p Src into the composite. The source is destroyed. + /// /// Passing OverrideSymbols as true will have symbols from Src /// shadow those in the Dest. /// For ThinLTO function importing/exporting the \p FunctionInfoIndex - /// is passed. If a \p FuncToImport is provided, only that single - /// function is imported from the source module. + /// is passed. If \p FunctionsToImport is provided, only the functions that + /// are part of the set will be imported from the source module. + /// /// Returns true on error. - bool linkInModule(Module *Src, unsigned Flags = Flags::None, + bool linkInModule(Module &Src, unsigned Flags = Flags::None, const FunctionInfoIndex *Index = nullptr, - Function *FuncToImport = nullptr); + DenseSet *FunctionsToImport = nullptr); - /// \brief Set the composite to the passed-in module. - void setModule(Module *Dst); - - static bool LinkModules(Module *Dest, Module *Src, + static bool linkModules(Module &Dest, Module &Src, DiagnosticHandlerFunction DiagnosticHandler, unsigned Flags = Flags::None); - static bool LinkModules(Module *Dest, Module *Src, + static bool linkModules(Module &Dest, Module &Src, unsigned Flags = Flags::None); private: - void init(Module *M, DiagnosticHandlerFunction DiagnosticHandler); - Module *Composite; + Module &Composite; IdentifiedStructTypeSet IdentifiedStructTypes; diff --git a/include/llvm/MC/MCObjectFileInfo.h b/include/llvm/MC/MCObjectFileInfo.h index eaca3833dc89..388a208fb4a0 100644 --- a/include/llvm/MC/MCObjectFileInfo.h +++ b/include/llvm/MC/MCObjectFileInfo.h @@ -116,6 +116,9 @@ class MCObjectFileInfo { MCSection *DwarfStrOffDWOSection; MCSection *DwarfAddrSection; + // These are for Fission DWP files. + MCSection *DwarfCUIndexSection; + /// Section for newer gnu pubnames. MCSection *DwarfGnuPubNamesSection; /// Section for newer gnu pubtypes. @@ -262,6 +265,7 @@ class MCObjectFileInfo { MCSection *getDwarfLocDWOSection() const { return DwarfLocDWOSection; } MCSection *getDwarfStrOffDWOSection() const { return DwarfStrOffDWOSection; } MCSection *getDwarfAddrSection() const { return DwarfAddrSection; } + MCSection *getDwarfCUIndexSection() const { return DwarfCUIndexSection; } MCSection *getCOFFDebugSymbolsSection() const { return COFFDebugSymbolsSection; diff --git a/include/llvm/Object/FunctionIndexObjectFile.h b/include/llvm/Object/FunctionIndexObjectFile.h index 511a237881ed..74b461dc7cc7 100644 --- a/include/llvm/Object/FunctionIndexObjectFile.h +++ b/include/llvm/Object/FunctionIndexObjectFile.h @@ -88,7 +88,7 @@ class FunctionIndexObjectFile : public SymbolicFile { /// summary/index. static ErrorOr> create(MemoryBufferRef Object, DiagnosticHandlerFunction DiagnosticHandler, - const Module *ExportingModule = nullptr, bool IsLazy = false); + bool IsLazy = false); /// \brief Parse the function summary information for function with the /// given name out of the given buffer. Parsed information is @@ -104,8 +104,7 @@ class FunctionIndexObjectFile : public SymbolicFile { /// index object if found, or nullptr if not. 
ErrorOr> getFunctionIndexForFile(StringRef Path, - DiagnosticHandlerFunction DiagnosticHandler, - const Module *ExportingModule = nullptr); + DiagnosticHandlerFunction DiagnosticHandler); } #endif diff --git a/include/llvm/ProfileData/InstrProf.h b/include/llvm/ProfileData/InstrProf.h index 13f6c70b3e83..956485119102 100644 --- a/include/llvm/ProfileData/InstrProf.h +++ b/include/llvm/ProfileData/InstrProf.h @@ -428,19 +428,22 @@ instrprof_error InstrProfRecord::merge(InstrProfRecord &Other) { if (Counts.size() != Other.Counts.size()) return instrprof_error::count_mismatch; + instrprof_error Result = instrprof_error::success; + for (size_t I = 0, E = Other.Counts.size(); I < E; ++I) { - if (Counts[I] + Other.Counts[I] < Counts[I]) - return instrprof_error::counter_overflow; - Counts[I] += Other.Counts[I]; + bool ResultOverflowed; + Counts[I] = SaturatingAdd(Counts[I], Other.Counts[I], ResultOverflowed); + if (ResultOverflowed) + Result = instrprof_error::counter_overflow; } for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind) { - instrprof_error result = mergeValueProfData(Kind, Other); - if (result != instrprof_error::success) - return result; + instrprof_error MergeValueResult = mergeValueProfData(Kind, Other); + if (MergeValueResult != instrprof_error::success) + Result = MergeValueResult; } - return instrprof_error::success; + return Result; } inline support::endianness getHostEndianness() { diff --git a/include/llvm/ProfileData/InstrProfData.inc b/include/llvm/ProfileData/InstrProfData.inc index 8ff7003a0b2d..aefdbc1b3e47 100644 --- a/include/llvm/ProfileData/InstrProfData.inc +++ b/include/llvm/ProfileData/InstrProfData.inc @@ -291,6 +291,7 @@ typedef struct ValueProfData { */ void deserializeTo(InstrProfRecord &Record, InstrProfRecord::ValueMapType *VMap); + void operator delete(void *ptr) { ::operator delete(ptr); } #endif } ValueProfData; @@ -537,12 +538,13 @@ int initializeValueProfRuntimeRecord(ValueProfRuntimeRecord *RuntimeRecord, } NumValueKinds++; RuntimeRecord->SiteCountArray[I] = (uint8_t *)calloc(N, 1); - RuntimeRecord->NodesKind[I] = &RuntimeRecord->Nodes[S]; - if (!RuntimeRecord->NodesKind[I]) + if (!RuntimeRecord->SiteCountArray[I]) return 1; + RuntimeRecord->NodesKind[I] = Nodes ? &Nodes[S] : NULL; for (J = 0; J < N; J++) { + /* Compute value count for each site. */ uint32_t C = 0; - ValueProfNode *Site = RuntimeRecord->Nodes[S + J]; + ValueProfNode *Site = Nodes ? RuntimeRecord->NodesKind[I][J] : NULL; while (Site) { C++; Site = Site->Next; @@ -595,6 +597,8 @@ void getValueForSiteRT(const void *R, InstrProfValueData *Dst, uint32_t VK, unsigned I, N = 0; const ValueProfRuntimeRecord *Record = (const ValueProfRuntimeRecord *)R; N = getNumValueDataForSiteRT(R, VK, S); + if (N == 0) + return; ValueProfNode *VNode = Record->NodesKind[VK][S]; for (I = 0; I < N; I++) { Dst[I] = VNode->VData; diff --git a/include/llvm/ProfileData/InstrProfReader.h b/include/llvm/ProfileData/InstrProfReader.h index e0a93cec3208..49233366e164 100644 --- a/include/llvm/ProfileData/InstrProfReader.h +++ b/include/llvm/ProfileData/InstrProfReader.h @@ -259,36 +259,50 @@ class InstrProfLookupTrait { } }; -class InstrProfReaderIndex { - private: - typedef OnDiskIterableChainedHashTable IndexType; +struct InstrProfReaderIndexBase { + // Read all the profile records with the same key pointed to the current + // iterator. 
+ virtual std::error_code getRecords(ArrayRef &Data) = 0; + // Read all the profile records with the key equal to FuncName + virtual std::error_code getRecords(StringRef FuncName, + ArrayRef &Data) = 0; + virtual void advanceToNextKey() = 0; + virtual bool atEnd() const = 0; + virtual void setValueProfDataEndianness(support::endianness Endianness) = 0; + virtual ~InstrProfReaderIndexBase() {} +}; + +typedef OnDiskIterableChainedHashTable + OnDiskHashTableImplV3; - std::unique_ptr Index; - IndexType::data_iterator RecordIterator; +template +class InstrProfReaderIndex : public InstrProfReaderIndexBase { + +private: + std::unique_ptr HashTable; + typename HashTableImpl::data_iterator RecordIterator; uint64_t FormatVersion; // String table for holding a unique copy of all the strings in the profile. InstrProfStringTable StringTable; - public: - InstrProfReaderIndex() : Index(nullptr) {} - void Init(const unsigned char *Buckets, const unsigned char *const Payload, - const unsigned char *const Base, IndexedInstrProf::HashT HashType, - uint64_t Version); +public: + InstrProfReaderIndex(const unsigned char *Buckets, + const unsigned char *const Payload, + const unsigned char *const Base, + IndexedInstrProf::HashT HashType, uint64_t Version); - // Read all the pofile records with the same key pointed to the current - // iterator. - std::error_code getRecords(ArrayRef &Data); - // Read all the profile records with the key equal to FuncName + std::error_code getRecords(ArrayRef &Data) override; std::error_code getRecords(StringRef FuncName, - ArrayRef &Data); - - void advanceToNextKey() { RecordIterator++; } - bool atEnd() const { return RecordIterator == Index->data_end(); } - // Used for testing purpose only. - void setValueProfDataEndianness(support::endianness Endianness) { - Index->getInfoObj().setValueProfDataEndianness(Endianness); + ArrayRef &Data) override; + void advanceToNextKey() override { RecordIterator++; } + bool atEnd() const override { + return RecordIterator == HashTable->data_end(); + } + void setValueProfDataEndianness(support::endianness Endianness) override { + HashTable->getInfoObj().setValueProfDataEndianness(Endianness); } + ~InstrProfReaderIndex() override {} }; /// Reader for the indexed binary instrprof format. @@ -297,16 +311,16 @@ class IndexedInstrProfReader : public InstrProfReader { /// The profile data file contents. std::unique_ptr DataBuffer; /// The index into the profile data. - InstrProfReaderIndex Index; + std::unique_ptr Index; /// The maximal execution count among all functions. uint64_t MaxFunctionCount; IndexedInstrProfReader(const IndexedInstrProfReader &) = delete; IndexedInstrProfReader &operator=(const IndexedInstrProfReader &) = delete; - public: +public: IndexedInstrProfReader(std::unique_ptr DataBuffer) - : DataBuffer(std::move(DataBuffer)), Index() {} + : DataBuffer(std::move(DataBuffer)), Index(nullptr) {} /// Return true if the given buffer is in an indexed instrprof format. static bool hasFormat(const MemoryBuffer &DataBuffer); @@ -337,7 +351,7 @@ class IndexedInstrProfReader : public InstrProfReader { // Used for testing purpose only. 
void setValueProfDataEndianness(support::endianness Endianness) { - Index.setValueProfDataEndianness(Endianness); + Index->setValueProfDataEndianness(Endianness); } }; diff --git a/include/llvm/Support/ARMTargetParser.def b/include/llvm/Support/ARMTargetParser.def index f76ac8899359..2f99b0717adf 100644 --- a/include/llvm/Support/ARMTargetParser.def +++ b/include/llvm/Support/ARMTargetParser.def @@ -88,6 +88,9 @@ ARM_ARCH("armv8-a", AK_ARMV8A, "8-A", "v8", ARMBuildAttrs::CPUArch::v8, ARM_ARCH("armv8.1-a", AK_ARMV8_1A, "8.1-A", "v8.1a", ARMBuildAttrs::CPUArch::v8, FK_CRYPTO_NEON_FP_ARMV8, (AEK_SEC | AEK_MP | AEK_VIRT | AEK_HWDIVARM | AEK_HWDIV | AEK_DSP | AEK_CRC)) +ARM_ARCH("armv8.2-a", AK_ARMV8_2A, "8.2-A", "v8.2a", ARMBuildAttrs::CPUArch::v8, + FK_CRYPTO_NEON_FP_ARMV8, (AEK_SEC | AEK_MP | AEK_VIRT | AEK_HWDIVARM | + AEK_HWDIV | AEK_DSP | AEK_CRC)) // Non-standard Arch names. ARM_ARCH("iwmmxt", AK_IWMMXT, "iwmmxt", "", ARMBuildAttrs::CPUArch::v5TE, FK_NONE, AEK_NONE) @@ -115,6 +118,7 @@ ARM_ARCH_EXT_NAME("mp", AEK_MP, nullptr, nullptr) ARM_ARCH_EXT_NAME("simd", AEK_SIMD, nullptr, nullptr) ARM_ARCH_EXT_NAME("sec", AEK_SEC, nullptr, nullptr) ARM_ARCH_EXT_NAME("virt", AEK_VIRT, nullptr, nullptr) +ARM_ARCH_EXT_NAME("fp16", AEK_FP16, "+fullfp16", "-fullfp16") ARM_ARCH_EXT_NAME("os", AEK_OS, nullptr, nullptr) ARM_ARCH_EXT_NAME("iwmmxt", AEK_IWMMXT, nullptr, nullptr) ARM_ARCH_EXT_NAME("iwmmxt2", AEK_IWMMXT2, nullptr, nullptr) @@ -204,6 +208,7 @@ ARM_CPU_NAME("sc300", AK_ARMV7M, FK_NONE, false, AEK_NONE) ARM_CPU_NAME("cortex-m3", AK_ARMV7M, FK_NONE, true, AEK_NONE) ARM_CPU_NAME("cortex-m4", AK_ARMV7EM, FK_FPV4_SP_D16, true, AEK_NONE) ARM_CPU_NAME("cortex-m7", AK_ARMV7EM, FK_FPV5_D16, false, AEK_NONE) +ARM_CPU_NAME("cortex-a35", AK_ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false, AEK_CRC) ARM_CPU_NAME("cortex-a53", AK_ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, true, AEK_CRC) ARM_CPU_NAME("cortex-a57", AK_ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false, AEK_CRC) ARM_CPU_NAME("cortex-a72", AK_ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false, AEK_CRC) diff --git a/include/llvm/Support/BranchProbability.h b/include/llvm/Support/BranchProbability.h index 3620d4d5d772..2548384f346b 100644 --- a/include/llvm/Support/BranchProbability.h +++ b/include/llvm/Support/BranchProbability.h @@ -53,6 +53,9 @@ class BranchProbability { // Create a BranchProbability object with the given numerator and 1<<31 // as denominator. static BranchProbability getRaw(uint32_t N) { return BranchProbability(N); } + // Create a BranchProbability object from 64-bit integers. + static BranchProbability getBranchProbability(uint64_t Numerator, + uint64_t Denominator); // Normalize given probabilties so that the sum of them becomes approximate // one. 
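A small sketch of where the new 64-bit getBranchProbability() factory above might be used, assuming raw profile counters as input; the helper name is hypothetical:

#include "llvm/Support/BranchProbability.h"
#include <cstdint>
using namespace llvm;

// Profile counters are 64-bit and may exceed UINT32_MAX, so they are scaled
// into BranchProbability's fixed 1<<31 denominator instead of being truncated.
BranchProbability probabilityFromCounts(uint64_t EdgeCount,
                                        uint64_t TotalCount) {
  if (TotalCount == 0)
    return BranchProbability::getUnknown(); // no data recorded for this branch
  return BranchProbability::getBranchProbability(EdgeCount, TotalCount);
}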
@@ -131,10 +134,30 @@ class BranchProbability { bool operator==(BranchProbability RHS) const { return N == RHS.N; } bool operator!=(BranchProbability RHS) const { return !(*this == RHS); } - bool operator<(BranchProbability RHS) const { return N < RHS.N; } - bool operator>(BranchProbability RHS) const { return RHS < *this; } - bool operator<=(BranchProbability RHS) const { return !(RHS < *this); } - bool operator>=(BranchProbability RHS) const { return !(*this < RHS); } + + bool operator<(BranchProbability RHS) const { + assert(N != UnknownN && RHS.N != UnknownN && + "Unknown probability cannot participate in comparisons."); + return N < RHS.N; + } + + bool operator>(BranchProbability RHS) const { + assert(N != UnknownN && RHS.N != UnknownN && + "Unknown probability cannot participate in comparisons."); + return RHS < *this; + } + + bool operator<=(BranchProbability RHS) const { + assert(N != UnknownN && RHS.N != UnknownN && + "Unknown probability cannot participate in comparisons."); + return !(RHS < *this); + } + + bool operator>=(BranchProbability RHS) const { + assert(N != UnknownN && RHS.N != UnknownN && + "Unknown probability cannot participate in comparisons."); + return !(*this < RHS); + } }; inline raw_ostream &operator<<(raw_ostream &OS, BranchProbability Prob) { diff --git a/include/llvm/Support/OnDiskHashTable.h b/include/llvm/Support/OnDiskHashTable.h index c47134f46c8d..ac978d4c242c 100644 --- a/include/llvm/Support/OnDiskHashTable.h +++ b/include/llvm/Support/OnDiskHashTable.h @@ -263,11 +263,12 @@ template class OnDiskChainedHashTable { Info InfoObj; public: + typedef Info InfoType; typedef typename Info::internal_key_type internal_key_type; typedef typename Info::external_key_type external_key_type; - typedef typename Info::data_type data_type; - typedef typename Info::hash_value_type hash_value_type; - typedef typename Info::offset_type offset_type; + typedef typename Info::data_type data_type; + typedef typename Info::hash_value_type hash_value_type; + typedef typename Info::offset_type offset_type; OnDiskChainedHashTable(offset_type NumBuckets, offset_type NumEntries, const unsigned char *Buckets, diff --git a/include/llvm/Support/TargetParser.h b/include/llvm/Support/TargetParser.h index 6ca0281515e2..c21019d0c5b8 100644 --- a/include/llvm/Support/TargetParser.h +++ b/include/llvm/Support/TargetParser.h @@ -82,6 +82,7 @@ enum ArchExtKind : unsigned { AEK_SEC = 0x100, AEK_VIRT = 0x200, AEK_DSP = 0x400, + AEK_FP16 = 0x800, // Unsupported extensions. AEK_OS = 0x8000000, AEK_IWMMXT = 0x10000000, diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h index cb5a5796e983..819458dbb0f0 100644 --- a/include/llvm/Target/TargetLowering.h +++ b/include/llvm/Target/TargetLowering.h @@ -130,10 +130,12 @@ class TargetLoweringBase { /// support for these atomic instructions, and also have different options /// w.r.t. what they should expand to. enum class AtomicExpansionKind { - None, // Don't expand the instruction. - LLSC, // Expand the instruction into loadlinked/storeconditional; used - // by ARM/AArch64. - CmpXChg, // Expand the instruction into cmpxchg; used by at least X86. + None, // Don't expand the instruction. + LLSC, // Expand the instruction into loadlinked/storeconditional; used + // by ARM/AArch64. + LLOnly, // Expand the (load) instruction into just a load-linked, which has + // greater atomic guarantees than a normal load. + CmpXChg, // Expand the instruction into cmpxchg; used by at least X86. 
}; static ISD::NodeType getExtendForContent(BooleanContent Content) { diff --git a/include/llvm/Target/TargetMachine.h b/include/llvm/Target/TargetMachine.h index b40e4a69a4d2..b7760a61806f 100644 --- a/include/llvm/Target/TargetMachine.h +++ b/include/llvm/Target/TargetMachine.h @@ -102,6 +102,7 @@ class TargetMachine { const MCSubtargetInfo *STI; unsigned RequireStructuredCFG : 1; + unsigned O0WantsFastISel : 1; /// This API is here to support the C API, deprecated in 3.7 release. /// This should never be used outside of legacy existing client. @@ -190,6 +191,8 @@ class TargetMachine { void setOptLevel(CodeGenOpt::Level Level) const; void setFastISel(bool Enable) { Options.EnableFastISel = Enable; } + bool getO0WantsFastISel() { return O0WantsFastISel; } + void setO0WantsFastISel(bool Enable) { O0WantsFastISel = Enable; } bool shouldPrintMachineCode() const { return Options.PrintMachineCode; } diff --git a/include/llvm/Transforms/IPO/FunctionImport.h b/include/llvm/Transforms/IPO/FunctionImport.h index f06a19021750..0315c72811c1 100644 --- a/include/llvm/Transforms/IPO/FunctionImport.h +++ b/include/llvm/Transforms/IPO/FunctionImport.h @@ -18,15 +18,26 @@ class LLVMContext; class Module; class FunctionInfoIndex; -/// The function importer is automatically importing function from other modules -/// based on the provided summary informations. -class FunctionImporter { +/// Helper to load on demand a Module from file and cache it for subsequent +/// queries. It can be used with the FunctionImporter. +class ModuleLazyLoaderCache { + /// The context that will be used for importing. + LLVMContext &Context; /// Cache of lazily loaded module for import. StringMap> ModuleMap; - /// The context that will be used for importing. - LLVMContext &Context; +public: + /// Create the loader, Module will be initialized in \p Context. + ModuleLazyLoaderCache(LLVMContext &Context) : Context(Context) {} + + /// Retrieve a Module from the cache or lazily load it on demand. + Module &operator()(StringRef FileName); +}; + +/// The function importer is automatically importing function from other modules +/// based on the provided summary informations. +class FunctionImporter { /// The summaries index used to trigger importing. const FunctionInfoIndex &Index; @@ -35,13 +46,15 @@ class FunctionImporter { DiagnosticHandlerFunction DiagnosticHandler; /// Retrieve a Module from the cache or lazily load it on demand. - Module &getOrLoadModule(StringRef FileName); + std::function getLazyModule; public: /// Create a Function Importer. - FunctionImporter(LLVMContext &Context, const FunctionInfoIndex &Index, - DiagnosticHandlerFunction DiagnosticHandler) - : Context(Context), Index(Index), DiagnosticHandler(DiagnosticHandler) {} + FunctionImporter(const FunctionInfoIndex &Index, + DiagnosticHandlerFunction DiagnosticHandler, + std::function ModuleLoader) + : Index(Index), DiagnosticHandler(DiagnosticHandler), + getLazyModule(ModuleLoader) {} /// Import functions in Module \p M based on the summary informations. bool importFunctions(Module &M); diff --git a/include/llvm/Transforms/Utils/Local.h b/include/llvm/Transforms/Utils/Local.h index b7d67eaea3a0..1d707a1e5307 100644 --- a/include/llvm/Transforms/Utils/Local.h +++ b/include/llvm/Transforms/Utils/Local.h @@ -271,10 +271,20 @@ bool LowerDbgDeclare(Function &F); /// an alloca, if any. DbgDeclareInst *FindAllocaDbgDeclare(Value *V); -/// \brief Replaces llvm.dbg.declare instruction when an alloca is replaced with -/// a new value. 
If Deref is true, an additional DW_OP_deref is prepended to the -/// expression. If Offset is non-zero, a constant displacement is added to the -/// expression (after the optional Deref). Offset can be negative. +/// \brief Replaces llvm.dbg.declare instruction when the address it describes +/// is replaced with a new value. If Deref is true, an additional DW_OP_deref is +/// prepended to the expression. If Offset is non-zero, a constant displacement +/// is added to the expression (after the optional Deref). Offset can be +/// negative. +bool replaceDbgDeclare(Value *Address, Value *NewAddress, + Instruction *InsertBefore, DIBuilder &Builder, + bool Deref, int Offset); + +/// \brief Replaces llvm.dbg.declare instruction when the alloca it describes +/// is replaced with a new value. If Deref is true, an additional DW_OP_deref is +/// prepended to the expression. If Offset is non-zero, a constant displacement +/// is added to the expression (after the optional Deref). Offset can be +/// negative. New llvm.dbg.declare is inserted immediately before AI. bool replaceDbgDeclareForAlloca(AllocaInst *AI, Value *NewAllocaAddress, DIBuilder &Builder, bool Deref, int Offset = 0); diff --git a/lib/Analysis/BranchProbabilityInfo.cpp b/lib/Analysis/BranchProbabilityInfo.cpp index f48394698699..6cdf43a06a9f 100644 --- a/lib/Analysis/BranchProbabilityInfo.cpp +++ b/lib/Analysis/BranchProbabilityInfo.cpp @@ -647,6 +647,12 @@ getEdgeProbability(const BasicBlock *Src, const BasicBlock *Dst) const { return BranchProbability(N, D); } +BranchProbability +BranchProbabilityInfo::getEdgeProbability(const BasicBlock *Src, + succ_const_iterator Dst) const { + return getEdgeProbability(Src, Dst.getSuccessorIndex()); +} + raw_ostream & BranchProbabilityInfo::printEdgeProbability(raw_ostream &OS, const BasicBlock *Src, diff --git a/lib/Analysis/CMakeLists.txt b/lib/Analysis/CMakeLists.txt index cb5cd07493b6..69623619a8b0 100644 --- a/lib/Analysis/CMakeLists.txt +++ b/lib/Analysis/CMakeLists.txt @@ -25,6 +25,7 @@ add_llvm_library(LLVMAnalysis DivergenceAnalysis.cpp DomPrinter.cpp DominanceFrontier.cpp + EHPersonalities.cpp GlobalsModRef.cpp IVUsers.cpp InlineCost.cpp @@ -35,7 +36,6 @@ add_llvm_library(LLVMAnalysis IteratedDominanceFrontier.cpp LazyCallGraph.cpp LazyValueInfo.cpp - LibCallSemantics.cpp Lint.cpp Loads.cpp LoopAccessAnalysis.cpp diff --git a/lib/Analysis/LibCallSemantics.cpp b/lib/Analysis/EHPersonalities.cpp similarity index 83% rename from lib/Analysis/LibCallSemantics.cpp rename to lib/Analysis/EHPersonalities.cpp index b91ff20aee25..1d1b5fe11f67 100644 --- a/lib/Analysis/LibCallSemantics.cpp +++ b/lib/Analysis/EHPersonalities.cpp @@ -1,4 +1,4 @@ -//===- LibCallSemantics.cpp - Describe library semantics ------------------===// +//===- EHPersonalities.cpp - Compute EH-related information ---------------===// // // The LLVM Compiler Infrastructure // @@ -6,14 +6,8 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// -// -// This file implements interfaces that can be used to describe language -// specific runtime library interfaces (e.g. libc, libm, etc) to LLVM -// optimizers. 
-// -//===----------------------------------------------------------------------===// -#include "llvm/Analysis/LibCallSemantics.h" +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/IR/Function.h" using namespace llvm; diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp index 23daeb67d653..9a0570d47f02 100644 --- a/lib/Analysis/ScalarEvolution.cpp +++ b/lib/Analysis/ScalarEvolution.cpp @@ -7964,8 +7964,7 @@ static bool IsMaxConsistingOf(const SCEV *MaybeMaxExpr, const MaxExprType *MaxExpr = dyn_cast(MaybeMaxExpr); if (!MaxExpr) return false; - auto It = std::find(MaxExpr->op_begin(), MaxExpr->op_end(), Candidate); - return It != MaxExpr->op_end(); + return find(MaxExpr->operands(), Candidate) != MaxExpr->op_end(); } @@ -8403,8 +8402,7 @@ const SCEV *SCEVAddRecExpr::getNumIterationsInRange(ConstantRange Range, // The only time we can solve this is when we have all constant indices. // Otherwise, we cannot determine the overflow conditions. - if (std::any_of(op_begin(), op_end(), - [](const SCEV *Op) { return !isa(Op);})) + if (any_of(operands(), [](const SCEV *Op) { return !isa(Op); })) return SE.getCouldNotCompute(); // Okay at this point we know that all elements of the chrec are constants and @@ -9645,6 +9643,8 @@ SCEVPredicate::SCEVPredicate(const FoldingSetNodeIDRef ID, SCEVPredicateKind Kind) : FastID(ID), Kind(Kind) {} +SCEVPredicate::~SCEVPredicate() {} + SCEVEqualPredicate::SCEVEqualPredicate(const FoldingSetNodeIDRef ID, const SCEVUnknown *LHS, const SCEVConstant *RHS) @@ -9694,8 +9694,8 @@ bool SCEVUnionPredicate::implies(const SCEVPredicate *N) const { return false; auto &SCEVPreds = ScevPredsIt->second; - return std::any_of(SCEVPreds.begin(), SCEVPreds.end(), - [N](const SCEVPredicate *I) { return I->implies(N); }); + return any_of(SCEVPreds, + [N](const SCEVPredicate *I) { return I->implies(N); }); } const SCEV *SCEVUnionPredicate::getExpr() const { return nullptr; } diff --git a/lib/Analysis/VectorUtils.cpp b/lib/Analysis/VectorUtils.cpp index fbf387b3ee20..5fb517e8edb5 100644 --- a/lib/Analysis/VectorUtils.cpp +++ b/lib/Analysis/VectorUtils.cpp @@ -417,9 +417,11 @@ Value *llvm::findScalarElement(Value *V, unsigned EltNo) { /// the input value is (1) a splat constants vector or (2) a sequence /// of instructions that broadcast a single value into a vector. /// -llvm::Value *llvm::getSplatValue(Value *V) { - if (auto *CV = dyn_cast(V)) - return CV->getSplatValue(); +const llvm::Value *llvm::getSplatValue(const Value *V) { + + if (auto *C = dyn_cast(V)) + if (isa(V->getType())) + return C->getSplatValue(); auto *ShuffleInst = dyn_cast(V); if (!ShuffleInst) diff --git a/lib/Bitcode/Reader/BitcodeReader.cpp b/lib/Bitcode/Reader/BitcodeReader.cpp index 11c9b131da70..e95aba771b9c 100644 --- a/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/lib/Bitcode/Reader/BitcodeReader.cpp @@ -5991,12 +5991,11 @@ llvm::getBitcodeProducerString(MemoryBufferRef Buffer, LLVMContext &Context, ErrorOr> llvm::getFunctionInfoIndex(MemoryBufferRef Buffer, DiagnosticHandlerFunction DiagnosticHandler, - const Module *ExportingModule, bool IsLazy) { + bool IsLazy) { std::unique_ptr Buf = MemoryBuffer::getMemBuffer(Buffer, false); FunctionIndexBitcodeReader R(Buf.get(), DiagnosticHandler, IsLazy); - std::unique_ptr Index = - llvm::make_unique(ExportingModule); + auto Index = llvm::make_unique(); auto cleanupOnError = [&](std::error_code EC) { R.releaseBuffer(); // Never take ownership on error. 
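The ScalarEvolution changes above rely on the range-based any_of/find wrappers added to STLExtras.h earlier in this patch. A self-contained sketch of how they read in isolation; the functions and data below are illustrative only:

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
using namespace llvm;

// any_of over the whole range, without spelling out begin()/end().
bool containsNegative(const SmallVectorImpl<int> &Values) {
  return any_of(Values, [](int V) { return V < 0; });
}

// find returns the range's own iterator type, so it composes with end().
bool containsValue(const SmallVectorImpl<int> &Values, int Target) {
  return find(Values, Target) != Values.end();
}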
diff --git a/lib/CodeGen/AggressiveAntiDepBreaker.cpp b/lib/CodeGen/AggressiveAntiDepBreaker.cpp index 845408f15f5c..4060db74a9b7 100644 --- a/lib/CodeGen/AggressiveAntiDepBreaker.cpp +++ b/lib/CodeGen/AggressiveAntiDepBreaker.cpp @@ -364,9 +364,11 @@ void AggressiveAntiDepBreaker::PrescanInstruction(MachineInstr *MI, // If MI's defs have a special allocation requirement, don't allow // any def registers to be changed. Also assume all registers - // defined in a call must not be changed (ABI). + // defined in a call must not be changed (ABI). Inline assembly may + // reference either system calls or the register directly. Skip it until we + // can tell user specified registers from compiler-specified. if (MI->isCall() || MI->hasExtraDefRegAllocReq() || - TII->isPredicated(MI)) { + TII->isPredicated(MI) || MI->isInlineAsm()) { DEBUG(if (State->GetGroup(Reg) != 0) dbgs() << "->g0(alloc-req)"); State->UnionGroups(Reg, 0); } @@ -428,6 +430,7 @@ void AggressiveAntiDepBreaker::ScanInstruction(MachineInstr *MI, // If MI's uses have special allocation requirement, don't allow // any use registers to be changed. Also assume all registers // used in a call must not be changed (ABI). + // Inline Assembly register uses also cannot be safely changed. // FIXME: The issue with predicated instruction is more complex. We are being // conservatively here because the kill markers cannot be trusted after // if-conversion: @@ -443,7 +446,7 @@ void AggressiveAntiDepBreaker::ScanInstruction(MachineInstr *MI, // changed. bool Special = MI->isCall() || MI->hasExtraSrcRegAllocReq() || - TII->isPredicated(MI); + TII->isPredicated(MI) || MI->isInlineAsm(); // Scan the register uses for this instruction and update // live-ranges, groups and RegRefs. diff --git a/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.cpp b/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.cpp index 8b1bea8049e4..c2c0f84e5c92 100644 --- a/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.cpp +++ b/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.cpp @@ -27,15 +27,15 @@ StringRef WinCodeViewLineTables::getFullFilepath(const MDNode *S) { auto *Scope = cast(S); StringRef Dir = Scope->getDirectory(), Filename = Scope->getFilename(); - char *&Result = DirAndFilenameToFilepathMap[std::make_pair(Dir, Filename)]; - if (Result) - return Result; + std::string &Filepath = + DirAndFilenameToFilepathMap[std::make_pair(Dir, Filename)]; + if (!Filepath.empty()) + return Filepath; // Clang emits directory and relative filename info into the IR, but CodeView // operates on full paths. We could change Clang to emit full paths too, but // that would increase the IR size and probably not needed for other users. // For now, just concatenate and canonicalize the path here. 
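The WinCodeViewLineTables change that starts above replaces the strdup'd char* cache entries with std::string values owned by the map, which is why the manual free() loop in the destructor can be dropped in the header hunk that follows. A self-contained sketch of the ownership pattern (the canonicalization step is simplified to a plain join):

    #include <iostream>
    #include <map>
    #include <string>
    #include <utility>

    // The cache maps (directory, filename) to a std::string it owns, so a
    // reference to the cached entry can be handed out with no strdup()/free()
    // pair and no custom destructor.
    std::map<std::pair<std::string, std::string>, std::string> FilepathCache;

    const std::string &getFullFilepath(const std::string &Dir,
                                       const std::string &Filename) {
      std::string &Filepath = FilepathCache[{Dir, Filename}];
      if (!Filepath.empty())
        return Filepath; // already computed and cached
      // Simplified canonicalization: just join the two components.
      Filepath = Dir + "\\" + Filename;
      return Filepath;
    }

    int main() {
      std::cout << getFullFilepath("c:\\src", "a.cpp") << "\n";
      std::cout << getFullFilepath("c:\\src", "a.cpp") << "\n"; // cache hit
    }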
- std::string Filepath; if (Filename.find(':') == 1) Filepath = Filename; else @@ -74,8 +74,7 @@ StringRef WinCodeViewLineTables::getFullFilepath(const MDNode *S) { while ((Cursor = Filepath.find("\\\\", Cursor)) != std::string::npos) Filepath.erase(Cursor, 1); - Result = strdup(Filepath.c_str()); - return StringRef(Result); + return Filepath; } void WinCodeViewLineTables::maybeRecordLocation(DebugLoc DL, diff --git a/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.h b/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.h index 43d1a432712e..78068e07c16f 100644 --- a/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.h +++ b/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.h @@ -98,7 +98,7 @@ class LLVM_LIBRARY_VISIBILITY WinCodeViewLineTables : public AsmPrinterHandler { } } FileNameRegistry; - typedef std::map, char *> + typedef std::map, std::string> DirAndFilenameToFilepathMapTy; DirAndFilenameToFilepathMapTy DirAndFilenameToFilepathMap; StringRef getFullFilepath(const MDNode *S); @@ -116,14 +116,6 @@ class LLVM_LIBRARY_VISIBILITY WinCodeViewLineTables : public AsmPrinterHandler { public: WinCodeViewLineTables(AsmPrinter *Asm); - ~WinCodeViewLineTables() override { - for (DirAndFilenameToFilepathMapTy::iterator - I = DirAndFilenameToFilepathMap.begin(), - E = DirAndFilenameToFilepathMap.end(); - I != E; ++I) - free(I->second); - } - void setSymbolSize(const llvm::MCSymbol *, uint64_t) override {} /// \brief Emit the COFF section that holds the line table information. diff --git a/lib/CodeGen/AtomicExpandPass.cpp b/lib/CodeGen/AtomicExpandPass.cpp index e7998db4a7c1..e4b7c5a62780 100644 --- a/lib/CodeGen/AtomicExpandPass.cpp +++ b/lib/CodeGen/AtomicExpandPass.cpp @@ -51,7 +51,9 @@ namespace { bool expandAtomicLoadToCmpXchg(LoadInst *LI); bool expandAtomicStore(StoreInst *SI); bool tryExpandAtomicRMW(AtomicRMWInst *AI); - bool expandAtomicRMWToLLSC(AtomicRMWInst *AI); + bool expandAtomicOpToLLSC( + Instruction *I, Value *Addr, AtomicOrdering MemOpOrder, + std::function &, Value *)> PerformOp); bool expandAtomicCmpXchg(AtomicCmpXchgInst *CI); bool isIdempotentRMW(AtomicRMWInst *AI); bool simplifyIdempotentRMW(AtomicRMWInst *AI); @@ -174,13 +176,15 @@ bool AtomicExpand::tryExpandAtomicLoad(LoadInst *LI) { switch (TLI->shouldExpandAtomicLoadInIR(LI)) { case TargetLoweringBase::AtomicExpansionKind::None: return false; - case TargetLoweringBase::AtomicExpansionKind::LLSC: { + case TargetLoweringBase::AtomicExpansionKind::LLSC: + return expandAtomicOpToLLSC( + LI, LI->getPointerOperand(), LI->getOrdering(), + [](IRBuilder<> &Builder, Value *Loaded) { return Loaded; }); + case TargetLoweringBase::AtomicExpansionKind::LLOnly: return expandAtomicLoadToLL(LI); - } - case TargetLoweringBase::AtomicExpansionKind::CmpXChg: { + case TargetLoweringBase::AtomicExpansionKind::CmpXChg: return expandAtomicLoadToCmpXchg(LI); } - } llvm_unreachable("Unhandled case in tryExpandAtomicLoad"); } @@ -192,6 +196,7 @@ bool AtomicExpand::expandAtomicLoadToLL(LoadInst *LI) { // to be single-copy atomic by ARM is an ldrexd (A3.5.3). 
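The AtomicExpandPass refactoring above replaces the RMW-specific expandAtomicRMWToLLSC with expandAtomicOpToLLSC, which takes a PerformOp callback so that atomic loads and atomicrmw share one load-linked/store-conditional loop. A conceptual model in portable C++, with compare_exchange emulating the LL/SC loop (real targets emit ldrex/strex-style instructions, and the real code builds IR rather than executing the loop):

    #include <atomic>
    #include <functional>
    #include <iostream>

    // The loop structure is shared; only the operation applied to the loaded
    // value differs, and it is supplied as a callback.
    int expandAtomicOpToLLSC(std::atomic<int> &Addr,
                             const std::function<int(int)> &PerformOp) {
      int Loaded = Addr.load();
      // On failure, compare_exchange_weak reloads Loaded, mirroring the
      // "store-conditional failed, branch back to the loop header" edge.
      while (!Addr.compare_exchange_weak(Loaded, PerformOp(Loaded))) {
      }
      return Loaded; // the instruction's result is the previously loaded value
    }

    int main() {
      std::atomic<int> X{5};
      // An atomic load is the identity callback ...
      std::cout << expandAtomicOpToLLSC(X, [](int L) { return L; }) << "\n";
      // ... while an atomicrmw add supplies the arithmetic.
      std::cout << expandAtomicOpToLLSC(X, [](int L) { return L + 3; }) << "\n";
      std::cout << X.load() << "\n"; // 8
    }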
Value *Val = TLI->emitLoadLinked(Builder, LI->getPointerOperand(), LI->getOrdering()); + TLI->emitAtomicCmpXchgNoStoreLLBalance(Builder); LI->replaceAllUsesWith(Val); LI->eraseFromParent(); @@ -245,20 +250,6 @@ static void createCmpXchgInstFun(IRBuilder<> &Builder, Value *Addr, NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded"); } -bool AtomicExpand::tryExpandAtomicRMW(AtomicRMWInst *AI) { - switch (TLI->shouldExpandAtomicRMWInIR(AI)) { - case TargetLoweringBase::AtomicExpansionKind::None: - return false; - case TargetLoweringBase::AtomicExpansionKind::LLSC: { - return expandAtomicRMWToLLSC(AI); - } - case TargetLoweringBase::AtomicExpansionKind::CmpXChg: { - return expandAtomicRMWToCmpXchg(AI, createCmpXchgInstFun); - } - } - llvm_unreachable("Unhandled case in tryExpandAtomicRMW"); -} - /// Emit IR to implement the given atomicrmw operation on values in registers, /// returning the new value. static Value *performAtomicOp(AtomicRMWInst::BinOp Op, IRBuilder<> &Builder, @@ -296,10 +287,28 @@ static Value *performAtomicOp(AtomicRMWInst::BinOp Op, IRBuilder<> &Builder, } } -bool AtomicExpand::expandAtomicRMWToLLSC(AtomicRMWInst *AI) { - AtomicOrdering MemOpOrder = AI->getOrdering(); - Value *Addr = AI->getPointerOperand(); - BasicBlock *BB = AI->getParent(); +bool AtomicExpand::tryExpandAtomicRMW(AtomicRMWInst *AI) { + switch (TLI->shouldExpandAtomicRMWInIR(AI)) { + case TargetLoweringBase::AtomicExpansionKind::None: + return false; + case TargetLoweringBase::AtomicExpansionKind::LLSC: + return expandAtomicOpToLLSC(AI, AI->getPointerOperand(), AI->getOrdering(), + [&](IRBuilder<> &Builder, Value *Loaded) { + return performAtomicOp(AI->getOperation(), + Builder, Loaded, + AI->getValOperand()); + }); + case TargetLoweringBase::AtomicExpansionKind::CmpXChg: + return expandAtomicRMWToCmpXchg(AI, createCmpXchgInstFun); + default: + llvm_unreachable("Unhandled case in tryExpandAtomicRMW"); + } +} + +bool AtomicExpand::expandAtomicOpToLLSC( + Instruction *I, Value *Addr, AtomicOrdering MemOpOrder, + std::function &, Value *)> PerformOp) { + BasicBlock *BB = I->getParent(); Function *F = BB->getParent(); LLVMContext &Ctx = F->getContext(); @@ -317,11 +326,11 @@ bool AtomicExpand::expandAtomicRMWToLLSC(AtomicRMWInst *AI) { // atomicrmw.end: // fence? // [...] - BasicBlock *ExitBB = BB->splitBasicBlock(AI->getIterator(), "atomicrmw.end"); + BasicBlock *ExitBB = BB->splitBasicBlock(I->getIterator(), "atomicrmw.end"); BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB); - // This grabs the DebugLoc from AI. - IRBuilder<> Builder(AI); + // This grabs the DebugLoc from I. + IRBuilder<> Builder(I); // The split call above "helpfully" added a branch at the end of BB (to the // wrong place), but we might want a fence too. 
It's easiest to just remove @@ -334,8 +343,7 @@ bool AtomicExpand::expandAtomicRMWToLLSC(AtomicRMWInst *AI) { Builder.SetInsertPoint(LoopBB); Value *Loaded = TLI->emitLoadLinked(Builder, Addr, MemOpOrder); - Value *NewVal = - performAtomicOp(AI->getOperation(), Builder, Loaded, AI->getValOperand()); + Value *NewVal = PerformOp(Builder, Loaded); Value *StoreSuccess = TLI->emitStoreConditional(Builder, NewVal, Addr, MemOpOrder); @@ -345,8 +353,8 @@ bool AtomicExpand::expandAtomicRMWToLLSC(AtomicRMWInst *AI) { Builder.SetInsertPoint(ExitBB, ExitBB->begin()); - AI->replaceAllUsesWith(Loaded); - AI->eraseFromParent(); + I->replaceAllUsesWith(Loaded); + I->eraseFromParent(); return true; } diff --git a/lib/CodeGen/BranchFolding.cpp b/lib/CodeGen/BranchFolding.cpp index 0b2495cc996e..54d92ad67a97 100644 --- a/lib/CodeGen/BranchFolding.cpp +++ b/lib/CodeGen/BranchFolding.cpp @@ -1099,13 +1099,19 @@ void BranchFolder::setCommonTailEdgeWeights(MachineBasicBlock &TailMBB) { if (TailMBB.succ_size() <= 1) return; - auto MaxEdgeFreq = *std::max_element(EdgeFreqLs.begin(), EdgeFreqLs.end()); - uint64_t Scale = MaxEdgeFreq.getFrequency() / UINT32_MAX + 1; + auto SumEdgeFreq = + std::accumulate(EdgeFreqLs.begin(), EdgeFreqLs.end(), BlockFrequency(0)) + .getFrequency(); auto EdgeFreq = EdgeFreqLs.begin(); - for (auto SuccI = TailMBB.succ_begin(), SuccE = TailMBB.succ_end(); - SuccI != SuccE; ++SuccI, ++EdgeFreq) - TailMBB.setSuccWeight(SuccI, EdgeFreq->getFrequency() / Scale); + if (SumEdgeFreq > 0) { + for (auto SuccI = TailMBB.succ_begin(), SuccE = TailMBB.succ_end(); + SuccI != SuccE; ++SuccI, ++EdgeFreq) { + auto Prob = BranchProbability::getBranchProbability( + EdgeFreq->getFrequency(), SumEdgeFreq); + TailMBB.setSuccProbability(SuccI, Prob); + } + } } //===----------------------------------------------------------------------===// diff --git a/lib/CodeGen/DwarfEHPrepare.cpp b/lib/CodeGen/DwarfEHPrepare.cpp index 0f6e1463f10f..eae78a950d9a 100644 --- a/lib/CodeGen/DwarfEHPrepare.cpp +++ b/lib/CodeGen/DwarfEHPrepare.cpp @@ -16,7 +16,7 @@ #include "llvm/ADT/BitVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/CFG.h" -#include "llvm/Analysis/LibCallSemantics.h" +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" diff --git a/lib/CodeGen/IfConversion.cpp b/lib/CodeGen/IfConversion.cpp index 0b2f3ea165f8..e90cb02bd280 100644 --- a/lib/CodeGen/IfConversion.cpp +++ b/lib/CodeGen/IfConversion.cpp @@ -32,6 +32,7 @@ #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" +#include using namespace llvm; @@ -1151,28 +1152,6 @@ bool IfConverter::IfConvertSimple(BBInfo &BBI, IfcvtKind Kind) { return true; } -/// Scale down weights to fit into uint32_t. NewTrue is the new weight -/// for successor TrueBB, and NewFalse is the new weight for successor -/// FalseBB. -static void ScaleWeights(uint64_t NewTrue, uint64_t NewFalse, - MachineBasicBlock *MBB, - const MachineBasicBlock *TrueBB, - const MachineBasicBlock *FalseBB, - const MachineBranchProbabilityInfo *MBPI) { - uint64_t NewMax = (NewTrue > NewFalse) ? 
NewTrue : NewFalse; - uint32_t Scale = (NewMax / UINT32_MAX) + 1; - for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(), - SE = MBB->succ_end(); - SI != SE; ++SI) { - if (*SI == TrueBB) - MBB->setSuccWeight(SI, (uint32_t)(NewTrue / Scale)); - else if (*SI == FalseBB) - MBB->setSuccWeight(SI, (uint32_t)(NewFalse / Scale)); - else - MBB->setSuccWeight(SI, MBPI->getEdgeWeight(MBB, SI) / Scale); - } -} - /// IfConvertTriangle - If convert a triangle sub-CFG. /// bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) { @@ -1229,16 +1208,14 @@ bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) { DontKill.clear(); bool HasEarlyExit = CvtBBI->FalseBB != nullptr; - uint64_t CvtNext = 0, CvtFalse = 0, BBNext = 0, BBCvt = 0, SumWeight = 0; - uint32_t WeightScale = 0; + BranchProbability CvtNext, CvtFalse, BBNext, BBCvt; if (HasEarlyExit) { - // Get weights before modifying CvtBBI->BB and BBI.BB. - CvtNext = MBPI->getEdgeWeight(CvtBBI->BB, NextBBI->BB); - CvtFalse = MBPI->getEdgeWeight(CvtBBI->BB, CvtBBI->FalseBB); - BBNext = MBPI->getEdgeWeight(BBI.BB, NextBBI->BB); - BBCvt = MBPI->getEdgeWeight(BBI.BB, CvtBBI->BB); - SumWeight = MBPI->getSumForBlock(CvtBBI->BB, WeightScale); + // Get probabilities before modifying CvtBBI->BB and BBI.BB. + CvtNext = MBPI->getEdgeProbability(CvtBBI->BB, NextBBI->BB); + CvtFalse = MBPI->getEdgeProbability(CvtBBI->BB, CvtBBI->FalseBB); + BBNext = MBPI->getEdgeProbability(BBI.BB, NextBBI->BB); + BBCvt = MBPI->getEdgeProbability(BBI.BB, CvtBBI->BB); } if (CvtBBI->BB->pred_size() > 1) { @@ -1266,22 +1243,23 @@ bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) { CvtBBI->BrCond.end()); if (TII->ReverseBranchCondition(RevCond)) llvm_unreachable("Unable to reverse branch condition!"); + + // Update the edge probability for both CvtBBI->FalseBB and NextBBI. + // NewNext = New_Prob(BBI.BB, NextBBI->BB) = + // Prob(BBI.BB, NextBBI->BB) + + // Prob(BBI.BB, CvtBBI->BB) * Prob(CvtBBI->BB, NextBBI->BB) + // NewFalse = New_Prob(BBI.BB, CvtBBI->FalseBB) = + // Prob(BBI.BB, CvtBBI->BB) * Prob(CvtBBI->BB, CvtBBI->FalseBB) + auto NewTrueBB = getNextBlock(BBI.BB); + auto NewNext = BBNext + BBCvt * CvtNext; + auto NewTrueBBIter = + std::find(BBI.BB->succ_begin(), BBI.BB->succ_end(), NewTrueBB); + if (NewTrueBBIter != BBI.BB->succ_end()) + BBI.BB->setSuccProbability(NewTrueBBIter, NewNext); + + auto NewFalse = BBCvt * CvtFalse; TII->InsertBranch(*BBI.BB, CvtBBI->FalseBB, nullptr, RevCond, dl); - BBI.BB->addSuccessor(CvtBBI->FalseBB); - // Update the edge weight for both CvtBBI->FalseBB and NextBBI. - // New_Weight(BBI.BB, NextBBI->BB) = - // Weight(BBI.BB, NextBBI->BB) * getSumForBlock(CvtBBI->BB) + - // Weight(BBI.BB, CvtBBI->BB) * Weight(CvtBBI->BB, NextBBI->BB) - // New_Weight(BBI.BB, CvtBBI->FalseBB) = - // Weight(BBI.BB, CvtBBI->BB) * Weight(CvtBBI->BB, CvtBBI->FalseBB) - - uint64_t NewNext = BBNext * SumWeight + (BBCvt * CvtNext) / WeightScale; - uint64_t NewFalse = (BBCvt * CvtFalse) / WeightScale; - // We need to scale down all weights of BBI.BB to fit uint32_t. - // Here BBI.BB is connected to CvtBBI->FalseBB and will fall through to - // the next block. 
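The IfConvertTriangle hunk above switches from scaled integer weights to BranchProbability arithmetic. A numeric sketch of the update, with doubles standing in for BranchProbability:

    #include <cassert>
    #include <iostream>

    // Once CvtBBI is merged into BBI, the edge to NextBBI absorbs the path
    // through CvtBBI, and the early-exit edge keeps only the path that went
    // through CvtBBI:
    //   NewNext  = P(BB->Next) + P(BB->Cvt) * P(Cvt->Next)
    //   NewFalse = P(BB->Cvt) * P(Cvt->False)
    int main() {
      double BBNext = 0.4, BBCvt = 0.6;     // BB's original out-edges
      double CvtNext = 0.7, CvtFalse = 0.3; // Cvt's original out-edges

      double NewNext = BBNext + BBCvt * CvtNext;
      double NewFalse = BBCvt * CvtFalse;

      std::cout << "NewNext = " << NewNext << ", NewFalse = " << NewFalse << "\n";
      // The out-probabilities of the merged block still sum to one.
      assert(NewNext + NewFalse > 0.999 && NewNext + NewFalse < 1.001);
    }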
- ScaleWeights(NewNext, NewFalse, BBI.BB, getNextBlock(BBI.BB), - CvtBBI->FalseBB, MBPI); + BBI.BB->addSuccessor(CvtBBI->FalseBB, NewFalse); } // Merge in the 'false' block if the 'false' block has no other @@ -1524,7 +1502,7 @@ bool IfConverter::IfConvertDiamond(BBInfo &BBI, IfcvtKind Kind, MergeBlocks(BBI, TailBBI); TailBBI.IsDone = true; } else { - BBI.BB->addSuccessor(TailBB); + BBI.BB->addSuccessor(TailBB, BranchProbability::getOne()); InsertUncondBranch(BBI.BB, TailBB, TII); BBI.HasFallThrough = false; } @@ -1688,21 +1666,26 @@ void IfConverter::MergeBlocks(BBInfo &ToBBI, BBInfo &FromBBI, bool AddEdges) { FromBBI.BB->succ_end()); MachineBasicBlock *NBB = getNextBlock(FromBBI.BB); MachineBasicBlock *FallThrough = FromBBI.HasFallThrough ? NBB : nullptr; - - // The edge weight from ToBBI.BB to FromBBI.BB, which is only needed when + // The edge probability from ToBBI.BB to FromBBI.BB, which is only needed when // AddEdges is true and FromBBI.BB is a successor of ToBBI.BB. - uint32_t To2FromWeight = 0; - // WeightScale and SumWeight are for calculating successor probabilities of - // FromBBI.BB. - uint32_t WeightScale = 0; - uint32_t SumWeight = 0; + auto To2FromProb = BranchProbability::getZero(); if (AddEdges && ToBBI.BB->isSuccessor(FromBBI.BB)) { - To2FromWeight = MBPI->getEdgeWeight(ToBBI.BB, FromBBI.BB); - // Set the edge weight from ToBBI.BB to FromBBI.BB to zero to avoid the edge - // weight being merged to other edges when this edge is removed later. - ToBBI.BB->setSuccWeight( - std::find(ToBBI.BB->succ_begin(), ToBBI.BB->succ_end(), FromBBI.BB), 0); - SumWeight = MBPI->getSumForBlock(FromBBI.BB, WeightScale); + To2FromProb = MBPI->getEdgeProbability(ToBBI.BB, FromBBI.BB); + // Set the edge probability from ToBBI.BB to FromBBI.BB to zero to avoid the + // edge probability being merged to other edges when this edge is removed + // later. + ToBBI.BB->setSuccProbability( + std::find(ToBBI.BB->succ_begin(), ToBBI.BB->succ_end(), FromBBI.BB), + BranchProbability::getZero()); + } + + if (AddEdges && ToBBI.BB->isSuccessor(FromBBI.BB)) { + // Set the edge probability from ToBBI.BB to FromBBI.BB to zero to avoid the + // edge probability being merged to other edges when this edge is removed + // later. + ToBBI.BB->setSuccProbability( + std::find(ToBBI.BB->succ_begin(), ToBBI.BB->succ_end(), FromBBI.BB), + BranchProbability::getZero()); } for (unsigned i = 0, e = FromSuccs.size(); i != e; ++i) { @@ -1711,39 +1694,38 @@ void IfConverter::MergeBlocks(BBInfo &ToBBI, BBInfo &FromBBI, bool AddEdges) { if (Succ == FallThrough) continue; - uint32_t NewWeight = 0; + auto NewProb = BranchProbability::getZero(); if (AddEdges) { - // Calculate the edge weight for the edge from ToBBI.BB to Succ, which is - // a portion of the edge weight from FromBBI.BB to Succ. The portion ratio - // is the edge probability from ToBBI.BB to FromBBI.BB (if FromBBI is a - // successor of ToBBI.BB. See comment below for excepion). - NewWeight = MBPI->getEdgeWeight(FromBBI.BB, Succ); + // Calculate the edge probability for the edge from ToBBI.BB to Succ, + // which is a portion of the edge probability from FromBBI.BB to Succ. The + // portion ratio is the edge probability from ToBBI.BB to FromBBI.BB (if + // FromBBI is a successor of ToBBI.BB. See comment below for excepion). + NewProb = MBPI->getEdgeProbability(FromBBI.BB, Succ); - // To2FromWeight is 0 when FromBBI.BB is not a successor of ToBBI.BB. This + // To2FromProb is 0 when FromBBI.BB is not a successor of ToBBI.BB. 
This // only happens when if-converting a diamond CFG and FromBBI.BB is the // tail BB. In this case FromBBI.BB post-dominates ToBBI.BB and hence we - // could just use the weights on FromBBI.BB's out-edges when adding new - // successors. - if (To2FromWeight > 0) { - BranchProbability Prob(NewWeight / WeightScale, SumWeight); - NewWeight = Prob.scale(To2FromWeight); - } + // could just use the probabilities on FromBBI.BB's out-edges when adding + // new successors. + if (!To2FromProb.isZero()) + NewProb *= To2FromProb; } FromBBI.BB->removeSuccessor(Succ); if (AddEdges) { - // If the edge from ToBBI.BB to Succ already exists, update the weight of - // this edge by adding NewWeight to it. An example is shown below, in - // which A is ToBBI.BB and B is FromBBI.BB. In this case we don't have to - // set C as A's successor as it already is. We only need to update the - // edge weight on A->C. Note that B will not be immediately removed from - // A's successors. It is possible that B->D is not removed either if D is - // a fallthrough of B. Later the edge A->D (generated here) and B->D will - // be combined into one edge. To maintain correct edge weight of this - // combined edge, we need to set the edge weight of A->B to zero, which is - // already done above. The edge weight on A->D is calculated by scaling - // the original weight on A->B by the probability of B->D. + // If the edge from ToBBI.BB to Succ already exists, update the + // probability of this edge by adding NewWeight to it. An example is shown + // below, in which A is ToBBI.BB and B is FromBBI.BB. In this case we + // don't have to set C as A's successor as it already is. We only need to + // update the edge probability on A->C. Note that B will not be + // immediately removed from A's successors. It is possible that B->D is + // not removed either if D is a fallthrough of B. Later the edge A->D + // (generated here) and B->D will be combined into one edge. To maintain + // correct edge probability of this combined edge, we need to set the edge + // probability of A->B to zero, which is already done above. The edge + // probability on A->D is calculated by scaling the original probability + // on A->B by the probability of B->D. // // Before ifcvt: After ifcvt (assume B->D is kept): // @@ -1755,11 +1737,11 @@ void IfConverter::MergeBlocks(BBInfo &ToBBI, BBInfo &FromBBI, bool AddEdges) { // C D C D // if (ToBBI.BB->isSuccessor(Succ)) - ToBBI.BB->setSuccWeight( + ToBBI.BB->setSuccProbability( std::find(ToBBI.BB->succ_begin(), ToBBI.BB->succ_end(), Succ), - MBPI->getEdgeWeight(ToBBI.BB, Succ) + NewWeight); + MBPI->getEdgeProbability(ToBBI.BB, Succ) + NewProb); else - ToBBI.BB->addSuccessor(Succ, NewWeight); + ToBBI.BB->addSuccessor(Succ, NewProb); } } diff --git a/lib/CodeGen/IntrinsicLowering.cpp b/lib/CodeGen/IntrinsicLowering.cpp index 5b895fff5c43..47a9f64e9080 100644 --- a/lib/CodeGen/IntrinsicLowering.cpp +++ b/lib/CodeGen/IntrinsicLowering.cpp @@ -424,6 +424,13 @@ void IntrinsicLowering::LowerIntrinsicCall(CallInst *CI) { break; } + case Intrinsic::get_dynamic_area_offset: + errs() << "WARNING: this target does not support the custom llvm.get." + "dynamic.area.offset. It is being lowered to a constant 0\n"; + // Just lower it to a constant 0 because for most targets + // @llvm.get.dynamic.area.offset is lowered to zero. + CI->replaceAllUsesWith(ConstantInt::get(CI->getType(), 0)); + break; case Intrinsic::returnaddress: case Intrinsic::frameaddress: errs() << "WARNING: this target does not support the llvm." 
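The MergeBlocks hunk above re-attributes FromBBI.BB's out-edges to ToBBI.BB by scaling each probability with To2FromProb and summing with any edge that already exists. A numeric sketch with doubles standing in for BranchProbability:

    #include <cassert>

    // When FromBB is folded into ToBB, each edge FromBB->Succ is scaled by the
    // probability that ToBB reached FromBB (To2FromProb). If ToBB already had
    // an edge to Succ, the two contributions are added together.
    int main() {
      double To2FromProb = 0.5;    // P(ToBB -> FromBB) before the merge
      double FromToSucc = 0.6;     // P(FromBB -> Succ)
      double ExistingToSucc = 0.2; // P(ToBB -> Succ) that already exists

      double NewProb = FromToSucc * To2FromProb; // portion re-attributed to ToBB
      double Merged = ExistingToSucc + NewProb;  // combined edge ToBB -> Succ

      assert(Merged > 0.499 && Merged < 0.501);
    }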
diff --git a/lib/CodeGen/LLVMTargetMachine.cpp b/lib/CodeGen/LLVMTargetMachine.cpp index 5b8c8258b285..da24cb17918b 100644 --- a/lib/CodeGen/LLVMTargetMachine.cpp +++ b/lib/CodeGen/LLVMTargetMachine.cpp @@ -125,9 +125,10 @@ addPassesToGenerateCode(LLVMTargetMachine *TM, PassManagerBase &PM, PM.add(new MachineFunctionAnalysis(*TM, MFInitializer)); // Enable FastISel with -fast, but allow that to be overridden. + TM->setO0WantsFastISel(EnableFastISelOption != cl::BOU_FALSE); if (EnableFastISelOption == cl::BOU_TRUE || (TM->getOptLevel() == CodeGenOpt::None && - EnableFastISelOption != cl::BOU_FALSE)) + TM->getO0WantsFastISel())) TM->setFastISel(true); // Ask the target for an isel. diff --git a/lib/CodeGen/MIRParser/MIParser.cpp b/lib/CodeGen/MIRParser/MIParser.cpp index 5a8e96df7603..c9c2d62cec30 100644 --- a/lib/CodeGen/MIRParser/MIParser.cpp +++ b/lib/CodeGen/MIRParser/MIParser.cpp @@ -459,8 +459,9 @@ bool MIParser::parseBasicBlockSuccessors(MachineBasicBlock &MBB) { if (expectAndConsume(MIToken::rparen)) return true; } - MBB.addSuccessor(SuccMBB, Weight); + MBB.addSuccessor(SuccMBB, BranchProbability::getRaw(Weight)); } while (consumeIfPresent(MIToken::comma)); + MBB.normalizeSuccProbs(); return false; } diff --git a/lib/CodeGen/MIRPrinter.cpp b/lib/CodeGen/MIRPrinter.cpp index 0be7807064fb..175cb0d51437 100644 --- a/lib/CodeGen/MIRPrinter.cpp +++ b/lib/CodeGen/MIRPrinter.cpp @@ -461,8 +461,8 @@ void MIPrinter::print(const MachineBasicBlock &MBB) { if (I != MBB.succ_begin()) OS << ", "; printMBBReference(**I); - if (MBB.hasSuccessorWeights()) - OS << '(' << MBB.getSuccWeight(I) << ')'; + if (MBB.hasSuccessorProbabilities()) + OS << '(' << MBB.getSuccProbability(I) << ')'; } OS << "\n"; HasLineAttributes = true; diff --git a/lib/CodeGen/MachineBasicBlock.cpp b/lib/CodeGen/MachineBasicBlock.cpp index 602b75182fca..de91f0db75a8 100644 --- a/lib/CodeGen/MachineBasicBlock.cpp +++ b/lib/CodeGen/MachineBasicBlock.cpp @@ -319,8 +319,8 @@ void MachineBasicBlock::print(raw_ostream &OS, ModuleSlotTracker &MST, OS << " Successors according to CFG:"; for (const_succ_iterator SI = succ_begin(), E = succ_end(); SI != E; ++SI) { OS << " BB#" << (*SI)->getNumber(); - if (!Weights.empty()) - OS << '(' << *getWeightIterator(SI) << ')'; + if (!Probs.empty()) + OS << '(' << *getProbabilityIterator(SI) << ')'; } OS << '\n'; } @@ -506,35 +506,12 @@ void MachineBasicBlock::updateTerminator() { } } -void MachineBasicBlock::addSuccessor(MachineBasicBlock *Succ, uint32_t Weight) { - // Weight list is either empty (if successor list isn't empty, this means - // disabled optimization) or has the same size as successor list. - if (!(Weights.empty() && !Successors.empty())) - Weights.push_back(Weight); - Successors.push_back(Succ); - Succ->addPredecessor(this); -} - -void MachineBasicBlock::addSuccessorWithoutWeight(MachineBasicBlock *Succ) { - // We need to make sure weight list is either empty or has the same size of - // successor list. When this function is called, we can safely delete all - // weight in the list. - Weights.clear(); - Successors.push_back(Succ); - Succ->addPredecessor(this); -} - void MachineBasicBlock::addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob) { // Probability list is either empty (if successor list isn't empty, this means // disabled optimization) or has the same size as successor list. 
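The MIRParser hunk above feeds raw successor weights in as BranchProbability::getRaw() values and then calls normalizeSuccProbs() so the stored probabilities sum to one. A minimal sketch of what that normalization amounts to, with doubles standing in for the fixed-point representation:

    #include <cstdint>
    #include <iostream>
    #include <numeric>
    #include <vector>

    // Rescale raw weights so the resulting probabilities sum to one; with no
    // usable weights, fall back to an even split.
    std::vector<double> normalize(const std::vector<uint64_t> &Weights) {
      uint64_t Sum = std::accumulate(Weights.begin(), Weights.end(), uint64_t(0));
      std::vector<double> Probs;
      for (uint64_t W : Weights)
        Probs.push_back(Sum ? double(W) / double(Sum) : 1.0 / Weights.size());
      return Probs;
    }

    int main() {
      for (double P : normalize({16, 16, 32}))
        std::cout << P << " "; // 0.25 0.25 0.5
      std::cout << "\n";
    }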
- if (!(Probs.empty() && !Successors.empty())) { + if (!(Probs.empty() && !Successors.empty())) Probs.push_back(Prob); - // FIXME: Temporarily use the numerator of the probability to represent edge - // weight. This will be removed once all weight-version interfaces in MBB - // are replaced with probability-version interfaces. - Weights.push_back(Prob.getNumerator()); - } Successors.push_back(Succ); Succ->addPredecessor(this); } @@ -544,7 +521,6 @@ void MachineBasicBlock::addSuccessorWithoutProb(MachineBasicBlock *Succ) { // of successor list. When this function is called, we can safely delete all // probability in the list. Probs.clear(); - Weights.clear(); Successors.push_back(Succ); Succ->addPredecessor(this); } @@ -558,23 +534,12 @@ MachineBasicBlock::succ_iterator MachineBasicBlock::removeSuccessor(succ_iterator I) { assert(I != Successors.end() && "Not a current successor!"); - // If Weight list is empty it means we don't use it (disabled optimization). - if (!Weights.empty()) { - weight_iterator WI = getWeightIterator(I); - Weights.erase(WI); - } - - // FIXME: Temporarily comment the following code as probabilities are now only - // used during instruction lowering, but this interface is called in later - // passes. Uncomment it once all edge weights are replaced with probabilities. -#if 0 // If probability list is empty it means we don't use it (disabled // optimization). if (!Probs.empty()) { probability_iterator WI = getProbabilityIterator(I); Probs.erase(WI); } -#endif (*I)->removePredecessor(this); return Successors.erase(I); @@ -611,17 +576,12 @@ void MachineBasicBlock::replaceSuccessor(MachineBasicBlock *Old, } // New is already a successor. - // Update its weight instead of adding a duplicate edge. - if (!Weights.empty()) - *getWeightIterator(NewI) += *getWeightIterator(OldI); - // FIXME: Temporarily comment the following code as probabilities are now only - // used during instruction lowering, but this interface is called in later - // passes. Uncomment it once all edge weights are replaced with probabilities. -#if 0 // Update its probability instead of adding a duplicate edge. - if (!Probs.empty()) - *getProbabilityIterator(NewI) += *getProbabilityIterator(OldI); -#endif + if (!Probs.empty()) { + auto ProbIter = getProbabilityIterator(NewI); + if (!ProbIter->isUnknown()) + *ProbIter += *getProbabilityIterator(OldI); + } removeSuccessor(OldI); } @@ -641,13 +601,14 @@ void MachineBasicBlock::transferSuccessors(MachineBasicBlock *FromMBB) { while (!FromMBB->succ_empty()) { MachineBasicBlock *Succ = *FromMBB->succ_begin(); - uint32_t Weight = 0; - // If Weight list is empty it means we don't use it (disabled optimization). - if (!FromMBB->Weights.empty()) - Weight = *FromMBB->Weights.begin(); + // If probability list is empty it means we don't use it (disabled optimization). 
+ if (!FromMBB->Probs.empty()) { + auto Prob = *FromMBB->Probs.begin(); + addSuccessor(Succ, Prob); + } else + addSuccessorWithoutProb(Succ); - addSuccessor(Succ, Weight); FromMBB->removeSuccessor(Succ); } } @@ -659,10 +620,11 @@ MachineBasicBlock::transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB) { while (!FromMBB->succ_empty()) { MachineBasicBlock *Succ = *FromMBB->succ_begin(); - uint32_t Weight = 0; - if (!FromMBB->Weights.empty()) - Weight = *FromMBB->Weights.begin(); - addSuccessor(Succ, Weight); + if (!FromMBB->Probs.empty()) { + auto Prob = *FromMBB->Probs.begin(); + addSuccessor(Succ, Prob); + } else + addSuccessorWithoutProb(Succ); FromMBB->removeSuccessor(Succ); // Fix up any PHI nodes in the successor. @@ -1146,80 +1108,51 @@ MachineBasicBlock::findDebugLoc(instr_iterator MBBI) { return DL; } -/// Return weight of the edge from this block to MBB. -uint32_t MachineBasicBlock::getSuccWeight(const_succ_iterator Succ) const { - if (Weights.empty()) - return 0; - - return *getWeightIterator(Succ); -} - -/// Return probability of the edge from this block to MBB. If probability list -/// is empty, return a default probability which is 1/N, where N is the number -/// of successors. If the probability of the given successor is unknown, then -/// sum up all known probabilities and return the complement of the sum divided -/// by the number of unknown probabilities. +/// Return probability of the edge from this block to MBB. BranchProbability MachineBasicBlock::getSuccProbability(const_succ_iterator Succ) const { if (Probs.empty()) return BranchProbability(1, succ_size()); - auto Prob = *getProbabilityIterator(Succ); - assert(!Prob.isUnknown()); - return Prob; -} - -/// Set successor weight of a given iterator. -void MachineBasicBlock::setSuccWeight(succ_iterator I, uint32_t Weight) { - if (Weights.empty()) - return; - *getWeightIterator(I) = Weight; + const auto &Prob = *getProbabilityIterator(Succ); + if (Prob.isUnknown()) { + // For unknown probabilities, collect the sum of all known ones, and evenly + // ditribute the complemental of the sum to each unknown probability. + unsigned KnownProbNum = 0; + auto Sum = BranchProbability::getZero(); + for (auto &P : Probs) { + if (!P.isUnknown()) { + Sum += P; + KnownProbNum++; + } + } + return Sum.getCompl() / (Probs.size() - KnownProbNum); + } else + return Prob; } /// Set successor probability of a given iterator. void MachineBasicBlock::setSuccProbability(succ_iterator I, BranchProbability Prob) { assert(!Prob.isUnknown()); - if (Probs.empty() || Weights.empty()) + if (Probs.empty()) return; *getProbabilityIterator(I) = Prob; - // FIXME: Temporarily use the numerator of the probability to represent edge - // weight. This will be removed once all weight-version interfaces in MBB - // are replaces with probability-version interfaces. - *getWeightIterator(I) = Prob.getNumerator(); } -/// Return wight iterator corresonding to the I successor iterator. -MachineBasicBlock::weight_iterator MachineBasicBlock:: -getWeightIterator(MachineBasicBlock::succ_iterator I) { - assert(Weights.size() == Successors.size() && "Async weight list!"); - size_t index = std::distance(Successors.begin(), I); - assert(index < Weights.size() && "Not a current successor!"); - return Weights.begin() + index; -} - -/// Return wight iterator corresonding to the I successor iterator. 
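The new getSuccProbability() above handles edges whose probability is unknown by summing the known ones and splitting the remainder evenly across the unknown edges. A numeric sketch (here -1.0 marks "unknown"; the real code uses BranchProbability::isUnknown()):

    #include <cassert>
    #include <cmath>
    #include <vector>

    double getSuccProbability(const std::vector<double> &Probs, unsigned Idx) {
      if (Probs[Idx] >= 0.0)
        return Probs[Idx]; // known: return it directly
      double Sum = 0.0;
      unsigned Known = 0;
      for (double P : Probs)
        if (P >= 0.0) {
          Sum += P;
          ++Known;
        }
      // Distribute the complement of the known sum evenly over the unknowns.
      return (1.0 - Sum) / (Probs.size() - Known);
    }

    int main() {
      std::vector<double> Probs = {0.5, -1.0, -1.0}; // one known, two unknown
      assert(std::fabs(getSuccProbability(Probs, 1) - 0.25) < 1e-9);
    }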
-MachineBasicBlock::const_weight_iterator MachineBasicBlock:: -getWeightIterator(MachineBasicBlock::const_succ_iterator I) const { - assert(Weights.size() == Successors.size() && "Async weight list!"); - const size_t index = std::distance(Successors.begin(), I); - assert(index < Weights.size() && "Not a current successor!"); - return Weights.begin() + index; -} - -/// Return probability iterator corresonding to the I successor iterator. -MachineBasicBlock::probability_iterator -MachineBasicBlock::getProbabilityIterator(MachineBasicBlock::succ_iterator I) { +/// Return probability iterator corresonding to the I successor iterator +MachineBasicBlock::const_probability_iterator +MachineBasicBlock::getProbabilityIterator( + MachineBasicBlock::const_succ_iterator I) const { assert(Probs.size() == Successors.size() && "Async probability list!"); const size_t index = std::distance(Successors.begin(), I); assert(index < Probs.size() && "Not a current successor!"); return Probs.begin() + index; } -/// Return probability iterator corresonding to the I successor iterator -MachineBasicBlock::const_probability_iterator -MachineBasicBlock::getProbabilityIterator( - MachineBasicBlock::const_succ_iterator I) const { +/// Return probability iterator corresonding to the I successor iterator. +MachineBasicBlock::probability_iterator +MachineBasicBlock::getProbabilityIterator(MachineBasicBlock::succ_iterator I) { assert(Probs.size() == Successors.size() && "Async probability list!"); const size_t index = std::distance(Successors.begin(), I); assert(index < Probs.size() && "Not a current successor!"); diff --git a/lib/CodeGen/MachineBlockPlacement.cpp b/lib/CodeGen/MachineBlockPlacement.cpp index fba33eb93d5f..fcddf346cf68 100644 --- a/lib/CodeGen/MachineBlockPlacement.cpp +++ b/lib/CodeGen/MachineBlockPlacement.cpp @@ -380,19 +380,11 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB, const BranchProbability HotProb(4, 5); // 80% MachineBasicBlock *BestSucc = nullptr; - // FIXME: Due to the performance of the probability and weight routines in - // the MBPI analysis, we manually compute probabilities using the edge - // weights. This is suboptimal as it means that the somewhat subtle - // definition of edge weight semantics is encoded here as well. We should - // improve the MBPI interface to efficiently support query patterns such as - // this. - uint32_t BestWeight = 0; - uint32_t WeightScale = 0; - uint32_t SumWeight = MBPI->getSumForBlock(BB, WeightScale); - - // Adjust sum of weights by excluding weights on edges pointing to blocks that - // is either not in BlockFilter or is already in the current chain. Consider - // the following CFG: + auto BestProb = BranchProbability::getZero(); + + // Adjust edge probabilities by excluding edges pointing to blocks that is + // either not in BlockFilter or is already in the current chain. Consider the + // following CFG: // // --->A // | / \ @@ -406,7 +398,7 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB, // HotProb). If we exclude E that is not in BlockFilter when calculating the // probability of C->D, D will be selected and we will get A C D B as the // layout of this loop. 
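The selectBestSuccessor() comment above (and the code in the following hunk) excludes filtered-out successors from the probability mass before comparing candidates. A numeric sketch of that adjustment, with doubles standing in for BranchProbability:

    #include <cassert>
    #include <cmath>

    // Successors that are not in the block filter, or are already placed in
    // the current chain, are removed from the probability mass; the remaining
    // successors are then compared against the adjusted sum.
    int main() {
      // P(A->B), P(A->C), P(A->E); assume E is filtered out.
      double ProbB = 0.3, ProbC = 0.3, ProbE = 0.4;

      double AdjustedSum = 1.0 - ProbE; // mass that is still eligible
      double AdjProbB = ProbB / AdjustedSum;
      double AdjProbC = ProbC / AdjustedSum;

      // After the adjustment, B and C split the eligible mass evenly, and the
      // "hot successor" threshold is applied to these renormalized values.
      assert(std::fabs(AdjProbB - 0.5) < 1e-9 && std::fabs(AdjProbC - 0.5) < 1e-9);
    }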
- uint32_t AdjustedSumWeight = SumWeight; + auto AdjustedSumProb = BranchProbability::getOne(); SmallVector Successors; for (MachineBasicBlock *Succ : BB->successors()) { bool SkipSucc = false; @@ -424,15 +416,20 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB, } } if (SkipSucc) - AdjustedSumWeight -= MBPI->getEdgeWeight(BB, Succ) / WeightScale; + AdjustedSumProb -= MBPI->getEdgeProbability(BB, Succ); else Successors.push_back(Succ); } DEBUG(dbgs() << "Attempting merge from: " << getBlockName(BB) << "\n"); for (MachineBasicBlock *Succ : Successors) { - uint32_t SuccWeight = MBPI->getEdgeWeight(BB, Succ); - BranchProbability SuccProb(SuccWeight / WeightScale, AdjustedSumWeight); + BranchProbability SuccProb; + uint32_t SuccProbN = MBPI->getEdgeProbability(BB, Succ).getNumerator(); + uint32_t SuccProbD = AdjustedSumProb.getNumerator(); + if (SuccProbN >= SuccProbD) + SuccProb = BranchProbability::getOne(); + else + SuccProb = BranchProbability(SuccProbN, SuccProbD); // If we outline optional branches, look whether Succ is unavoidable, i.e. // dominates all terminators of the MachineFunction. If it does, other @@ -470,7 +467,7 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB, // Make sure that a hot successor doesn't have a globally more // important predecessor. - BranchProbability RealSuccProb(SuccWeight / WeightScale, SumWeight); + auto RealSuccProb = MBPI->getEdgeProbability(BB, Succ); BlockFrequency CandidateEdgeFreq = MBFI->getBlockFreq(BB) * RealSuccProb * HotProb.getCompl(); bool BadCFGConflict = false; @@ -496,10 +493,10 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB, << " (prob)" << (SuccChain.LoopPredecessors != 0 ? " (CFG break)" : "") << "\n"); - if (BestSucc && BestWeight >= SuccWeight) + if (BestSucc && BestProb >= SuccProb) continue; BestSucc = Succ; - BestWeight = SuccWeight; + BestProb = SuccProb; } return BestSucc; } @@ -728,11 +725,6 @@ MachineBlockPlacement::findBestLoopExit(MachineFunction &F, MachineLoop &L, MachineBasicBlock *OldExitingBB = ExitingBB; BlockFrequency OldBestExitEdgeFreq = BestExitEdgeFreq; bool HasLoopingSucc = false; - // FIXME: Due to the performance of the probability and weight routines in - // the MBPI analysis, we use the internal weights and manually compute the - // probabilities to avoid quadratic behavior. - uint32_t WeightScale = 0; - uint32_t SumWeight = MBPI->getSumForBlock(MBB, WeightScale); for (MachineBasicBlock *Succ : MBB->successors()) { if (Succ->isEHPad()) continue; @@ -746,10 +738,10 @@ MachineBlockPlacement::findBestLoopExit(MachineFunction &F, MachineLoop &L, continue; } - uint32_t SuccWeight = MBPI->getEdgeWeight(MBB, Succ); + auto SuccProb = MBPI->getEdgeProbability(MBB, Succ); if (LoopBlockSet.count(Succ)) { DEBUG(dbgs() << " looping: " << getBlockName(MBB) << " -> " - << getBlockName(Succ) << " (" << SuccWeight << ")\n"); + << getBlockName(Succ) << " (" << SuccProb << ")\n"); HasLoopingSucc = true; continue; } @@ -761,7 +753,6 @@ MachineBlockPlacement::findBestLoopExit(MachineFunction &F, MachineLoop &L, BlocksExitingToOuterLoop.insert(MBB); } - BranchProbability SuccProb(SuccWeight / WeightScale, SumWeight); BlockFrequency ExitEdgeFreq = MBFI->getBlockFreq(MBB) * SuccProb; DEBUG(dbgs() << " exiting: " << getBlockName(MBB) << " -> " << getBlockName(Succ) << " [L:" << SuccLoopDepth << "] ("; @@ -904,21 +895,17 @@ void MachineBlockPlacement::rotateLoopWithProfile( // edge from the tail of the loop chain. 
SmallVector, 4> ExitsWithFreq; for (auto BB : LoopChain) { - uint32_t LargestExitEdgeWeight = 0; + auto LargestExitEdgeProb = BranchProbability::getZero(); for (auto *Succ : BB->successors()) { BlockChain *SuccChain = BlockToChain[Succ]; if (!LoopBlockSet.count(Succ) && (!SuccChain || Succ == *SuccChain->begin())) { - uint32_t SuccWeight = MBPI->getEdgeWeight(BB, Succ); - LargestExitEdgeWeight = std::max(LargestExitEdgeWeight, SuccWeight); + auto SuccProb = MBPI->getEdgeProbability(BB, Succ); + LargestExitEdgeProb = std::max(LargestExitEdgeProb, SuccProb); } } - if (LargestExitEdgeWeight > 0) { - uint32_t WeightScale = 0; - uint32_t SumWeight = MBPI->getSumForBlock(BB, WeightScale); - auto ExitFreq = - MBFI->getBlockFreq(BB) * - BranchProbability(LargestExitEdgeWeight / WeightScale, SumWeight); + if (LargestExitEdgeProb > BranchProbability::getZero()) { + auto ExitFreq = MBFI->getBlockFreq(BB) * LargestExitEdgeProb; ExitsWithFreq.emplace_back(BB, ExitFreq); } } @@ -1290,14 +1277,16 @@ void MachineBlockPlacement::buildCFGChains(MachineFunction &F) { } // If PrevBB has a two-way branch, try to re-order the branches - // such that we branch to the successor with higher weight first. + // such that we branch to the successor with higher probability first. if (TBB && !Cond.empty() && FBB && - MBPI->getEdgeWeight(PrevBB, FBB) > MBPI->getEdgeWeight(PrevBB, TBB) && + MBPI->getEdgeProbability(PrevBB, FBB) > + MBPI->getEdgeProbability(PrevBB, TBB) && !TII->ReverseBranchCondition(Cond)) { DEBUG(dbgs() << "Reverse order of the two branches: " << getBlockName(PrevBB) << "\n"); - DEBUG(dbgs() << " Edge weight: " << MBPI->getEdgeWeight(PrevBB, FBB) - << " vs " << MBPI->getEdgeWeight(PrevBB, TBB) << "\n"); + DEBUG(dbgs() << " Edge probability: " + << MBPI->getEdgeProbability(PrevBB, FBB) << " vs " + << MBPI->getEdgeProbability(PrevBB, TBB) << "\n"); DebugLoc dl; // FIXME: this is nowhere TII->RemoveBranch(*PrevBB); TII->InsertBranch(*PrevBB, FBB, TBB, Cond, dl); diff --git a/lib/CodeGen/MachineBranchProbabilityInfo.cpp b/lib/CodeGen/MachineBranchProbabilityInfo.cpp index 6fbc2be70486..5478dcba261a 100644 --- a/lib/CodeGen/MachineBranchProbabilityInfo.cpp +++ b/lib/CodeGen/MachineBranchProbabilityInfo.cpp @@ -28,91 +28,61 @@ char MachineBranchProbabilityInfo::ID = 0; void MachineBranchProbabilityInfo::anchor() { } -uint32_t MachineBranchProbabilityInfo:: -getSumForBlock(const MachineBasicBlock *MBB, uint32_t &Scale) const { - // First we compute the sum with 64-bits of precision, ensuring that cannot - // overflow by bounding the number of weights considered. Hopefully no one - // actually needs 2^32 successors. - assert(MBB->succ_size() < UINT32_MAX); - uint64_t Sum = 0; - Scale = 1; - for (MachineBasicBlock::const_succ_iterator I = MBB->succ_begin(), - E = MBB->succ_end(); I != E; ++I) { - uint32_t Weight = getEdgeWeight(MBB, I); - Sum += Weight; - } - - // If the computed sum fits in 32-bits, we're done. - if (Sum <= UINT32_MAX) - return Sum; +uint32_t MachineBranchProbabilityInfo::getEdgeWeight( + const MachineBasicBlock *Src, + MachineBasicBlock::const_succ_iterator Dst) const { + return Src->getSuccProbability(Dst).getNumerator(); +} - // Otherwise, compute the scale necessary to cause the weights to fit, and - // re-sum with that scale applied. 
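In the rotateLoopWithProfile() hunk above, the frequency of a loop-exit edge is now the exiting block's frequency scaled directly by the edge's branch probability, instead of being rebuilt from raw edge weights and a per-block sum. A tiny numeric sketch, with an integer and a double standing in for BlockFrequency and BranchProbability:

    #include <cassert>
    #include <cstdint>

    int main() {
      uint64_t BlockFreq = 1000;          // frequency of the exiting block
      double LargestExitEdgeProb = 0.125; // most likely exit edge of that block

      uint64_t ExitFreq = uint64_t(BlockFreq * LargestExitEdgeProb);
      assert(ExitFreq == 125);
    }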
- assert((Sum / UINT32_MAX) < UINT32_MAX); - Scale = (Sum / UINT32_MAX) + 1; - Sum = 0; - for (MachineBasicBlock::const_succ_iterator I = MBB->succ_begin(), - E = MBB->succ_end(); I != E; ++I) { - uint32_t Weight = getEdgeWeight(MBB, I); - Sum += Weight / Scale; - } - assert(Sum <= UINT32_MAX); - return Sum; +uint32_t MachineBranchProbabilityInfo::getEdgeWeight( + const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const { + // This is a linear search. Try to use the const_succ_iterator version when + // possible. + return getEdgeWeight(Src, std::find(Src->succ_begin(), Src->succ_end(), Dst)); } -uint32_t MachineBranchProbabilityInfo:: -getEdgeWeight(const MachineBasicBlock *Src, - MachineBasicBlock::const_succ_iterator Dst) const { - uint32_t Weight = Src->getSuccWeight(Dst); - if (!Weight) - return DEFAULT_WEIGHT; - return Weight; +BranchProbability MachineBranchProbabilityInfo::getEdgeProbability( + const MachineBasicBlock *Src, + MachineBasicBlock::const_succ_iterator Dst) const { + return Src->getSuccProbability(Dst); } -uint32_t MachineBranchProbabilityInfo:: -getEdgeWeight(const MachineBasicBlock *Src, - const MachineBasicBlock *Dst) const { +BranchProbability MachineBranchProbabilityInfo::getEdgeProbability( + const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const { // This is a linear search. Try to use the const_succ_iterator version when // possible. - return getEdgeWeight(Src, std::find(Src->succ_begin(), Src->succ_end(), Dst)); + return getEdgeProbability(Src, + std::find(Src->succ_begin(), Src->succ_end(), Dst)); } bool MachineBranchProbabilityInfo::isEdgeHot(const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const { // Hot probability is at least 4/5 = 80% - // FIXME: Compare against a static "hot" BranchProbability. 
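The isEdgeHot() change above compares against a static 4/5 threshold instead of constructing the probability on every call. A minimal fixed-point model of that comparison (the struct is a stand-in for BranchProbability, using cross-multiplication to avoid overflow and rounding):

    #include <cassert>
    #include <cstdint>

    struct Prob {
      uint32_t N, D; // value is N/D
      friend bool operator>(Prob L, Prob R) {
        return uint64_t(L.N) * R.D > uint64_t(R.N) * L.D;
      }
    };

    bool isEdgeHot(Prob EdgeProb) {
      static const Prob HotProb{4, 5}; // 80%
      return EdgeProb > HotProb;       // "hot" means strictly greater than 4/5
    }

    int main() {
      assert(isEdgeHot({9, 10}));  // 90% is hot
      assert(!isEdgeHot({4, 5}));  // exactly 80% is not (strict comparison)
      assert(!isEdgeHot({1, 2}));  // 50% is not
    }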
- return getEdgeProbability(Src, Dst) > BranchProbability(4, 5); + static BranchProbability HotProb(4, 5); + return getEdgeProbability(Src, Dst) > HotProb; } MachineBasicBlock * MachineBranchProbabilityInfo::getHotSucc(MachineBasicBlock *MBB) const { - uint32_t MaxWeight = 0; + auto MaxProb = BranchProbability::getZero(); MachineBasicBlock *MaxSucc = nullptr; for (MachineBasicBlock::const_succ_iterator I = MBB->succ_begin(), E = MBB->succ_end(); I != E; ++I) { - uint32_t Weight = getEdgeWeight(MBB, I); - if (Weight > MaxWeight) { - MaxWeight = Weight; + auto Prob = getEdgeProbability(MBB, I); + if (Prob > MaxProb) { + MaxProb = Prob; MaxSucc = *I; } } - if (getEdgeProbability(MBB, MaxSucc) >= BranchProbability(4, 5)) + static BranchProbability HotProb(4, 5); + if (getEdgeProbability(MBB, MaxSucc) >= HotProb) return MaxSucc; return nullptr; } -BranchProbability MachineBranchProbabilityInfo::getEdgeProbability( - const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const { - uint32_t Scale = 1; - uint32_t D = getSumForBlock(Src, Scale); - uint32_t N = getEdgeWeight(Src, Dst) / Scale; - - return BranchProbability(N, D); -} - raw_ostream &MachineBranchProbabilityInfo::printEdgeProbability( raw_ostream &OS, const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const { diff --git a/lib/CodeGen/MachineFunction.cpp b/lib/CodeGen/MachineFunction.cpp index 7e6af1c9c41f..80d30a5b131a 100644 --- a/lib/CodeGen/MachineFunction.cpp +++ b/lib/CodeGen/MachineFunction.cpp @@ -17,7 +17,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallString.h" #include "llvm/Analysis/ConstantFolding.h" -#include "llvm/Analysis/LibCallSemantics.h" +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunctionInitializer.h" diff --git a/lib/CodeGen/MachineModuleInfo.cpp b/lib/CodeGen/MachineModuleInfo.cpp index 18efcf39c453..1956a701d8e6 100644 --- a/lib/CodeGen/MachineModuleInfo.cpp +++ b/lib/CodeGen/MachineModuleInfo.cpp @@ -10,7 +10,7 @@ #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/ADT/PointerUnion.h" #include "llvm/ADT/TinyPtrVector.h" -#include "llvm/Analysis/LibCallSemantics.h" +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" diff --git a/lib/CodeGen/MachineRegisterInfo.cpp b/lib/CodeGen/MachineRegisterInfo.cpp index abf9b4d67696..03c82f46da63 100644 --- a/lib/CodeGen/MachineRegisterInfo.cpp +++ b/lib/CodeGen/MachineRegisterInfo.cpp @@ -27,12 +27,11 @@ void MachineRegisterInfo::Delegate::anchor() {} MachineRegisterInfo::MachineRegisterInfo(const MachineFunction *MF) : MF(MF), TheDelegate(nullptr), IsSSA(true), TracksLiveness(true), TracksSubRegLiveness(false) { + unsigned NumRegs = getTargetRegisterInfo()->getNumRegs(); VRegInfo.reserve(256); RegAllocHints.reserve(256); - UsedPhysRegMask.resize(getTargetRegisterInfo()->getNumRegs()); - - // Create the physreg use/def lists. - PhysRegUseDefLists.resize(getTargetRegisterInfo()->getNumRegs(), nullptr); + UsedPhysRegMask.resize(NumRegs); + PhysRegUseDefLists.reset(new MachineOperand*[NumRegs]()); } /// setRegClass - Set the register class of the specified virtual register. 
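The MachineRegisterInfo hunk above swaps a never-resized std::vector for a heap array owned by std::unique_ptr<T[]>, value-initialized by the trailing "()" in the new-expression. A small sketch of that allocation idiom:

    #include <cassert>
    #include <memory>

    int main() {
      const unsigned NumRegs = 8;
      // new T[n]() value-initializes every element, so each list head starts
      // out null, matching the old vector's (n, nullptr) construction.
      std::unique_ptr<int *[]> UseDefLists(new int *[NumRegs]());
      for (unsigned I = 0; I != NumRegs; ++I)
        assert(UseDefLists[I] == nullptr);
    }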
diff --git a/lib/CodeGen/MachineVerifier.cpp b/lib/CodeGen/MachineVerifier.cpp index e1020772629c..cdcd8eb4fbdf 100644 --- a/lib/CodeGen/MachineVerifier.cpp +++ b/lib/CodeGen/MachineVerifier.cpp @@ -28,7 +28,7 @@ #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/SetOperations.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/Analysis/LibCallSemantics.h" +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "llvm/CodeGen/LiveStackAnalysis.h" #include "llvm/CodeGen/LiveVariables.h" diff --git a/lib/CodeGen/RegisterPressure.cpp b/lib/CodeGen/RegisterPressure.cpp index 6ff16e551be2..6e7feb5178ee 100644 --- a/lib/CodeGen/RegisterPressure.cpp +++ b/lib/CodeGen/RegisterPressure.cpp @@ -59,12 +59,12 @@ void RegisterPressure::dump(const TargetRegisterInfo *TRI) const { dbgs() << "Max Pressure: "; dumpRegSetPressure(MaxSetPressure, TRI); dbgs() << "Live In: "; - for (unsigned i = 0, e = LiveInRegs.size(); i < e; ++i) - dbgs() << PrintVRegOrUnit(LiveInRegs[i], TRI) << " "; + for (unsigned Reg : LiveInRegs) + dbgs() << PrintVRegOrUnit(Reg, TRI) << " "; dbgs() << '\n'; dbgs() << "Live Out: "; - for (unsigned i = 0, e = LiveOutRegs.size(); i < e; ++i) - dbgs() << PrintVRegOrUnit(LiveOutRegs[i], TRI) << " "; + for (unsigned Reg : LiveOutRegs) + dbgs() << PrintVRegOrUnit(Reg, TRI) << " "; dbgs() << '\n'; } @@ -92,8 +92,8 @@ void PressureDiff::dump(const TargetRegisterInfo &TRI) const { /// Increase the current pressure as impacted by these registers and bump /// the high water mark if needed. void RegPressureTracker::increaseRegPressure(ArrayRef RegUnits) { - for (unsigned i = 0, e = RegUnits.size(); i != e; ++i) { - PSetIterator PSetI = MRI->getPressureSets(RegUnits[i]); + for (unsigned RegUnit : RegUnits) { + PSetIterator PSetI = MRI->getPressureSets(RegUnit); unsigned Weight = PSetI.getWeight(); for (; PSetI.isValid(); ++PSetI) { CurrSetPressure[*PSetI] += Weight; @@ -106,8 +106,8 @@ void RegPressureTracker::increaseRegPressure(ArrayRef RegUnits) { /// Simply decrease the current pressure as impacted by these registers. void RegPressureTracker::decreaseRegPressure(ArrayRef RegUnits) { - for (unsigned I = 0, E = RegUnits.size(); I != E; ++I) - decreaseSetPressure(CurrSetPressure, MRI->getPressureSets(RegUnits[I])); + for (unsigned RegUnit : RegUnits) + decreaseSetPressure(CurrSetPressure, MRI->getPressureSets(RegUnit)); } /// Clear the result so it can be used for another round of pressure tracking. @@ -298,8 +298,7 @@ void RegPressureTracker::closeRegion() { void RegPressureTracker::initLiveThru(const RegPressureTracker &RPTracker) { LiveThruPressure.assign(TRI->getNumRegPressureSets(), 0); assert(isBottomClosed() && "need bottom-up tracking to intialize."); - for (unsigned i = 0, e = P.LiveOutRegs.size(); i < e; ++i) { - unsigned Reg = P.LiveOutRegs[i]; + for (unsigned Reg : P.LiveOutRegs) { if (TargetRegisterInfo::isVirtualRegister(Reg) && !RPTracker.hasUntiedDef(Reg)) { increaseSetPressure(LiveThruPressure, MRI->getPressureSets(Reg)); @@ -314,71 +313,88 @@ static bool containsReg(ArrayRef RegUnits, unsigned RegUnit) { } namespace { -/// Collect this instruction's unique uses and defs into SmallVectors for -/// processing defs and uses in order. -/// -/// FIXME: always ignore tied opers -class RegisterOperands { - const TargetRegisterInfo *TRI; - const MachineRegisterInfo *MRI; - bool IgnoreDead; +/// List of register defined and used by a machine instruction. 
+class RegisterOperands { public: SmallVector Uses; SmallVector Defs; SmallVector DeadDefs; - RegisterOperands(const TargetRegisterInfo *tri, - const MachineRegisterInfo *mri, bool ID = false): - TRI(tri), MRI(mri), IgnoreDead(ID) {} + void collect(const MachineInstr &MI, const TargetRegisterInfo &TRI, + const MachineRegisterInfo &MRI, bool IgnoreDead = false); +}; + +/// Collect this instruction's unique uses and defs into SmallVectors for +/// processing defs and uses in order. +/// +/// FIXME: always ignore tied opers +class RegisterOperandsCollector { + RegisterOperands &RegOpers; + const TargetRegisterInfo &TRI; + const MachineRegisterInfo &MRI; + bool IgnoreDead; - /// Push this operand's register onto the correct vector. - void collect(const MachineOperand &MO) { + RegisterOperandsCollector(RegisterOperands &RegOpers, + const TargetRegisterInfo &TRI, + const MachineRegisterInfo &MRI, + bool IgnoreDead) + : RegOpers(RegOpers), TRI(TRI), MRI(MRI), IgnoreDead(IgnoreDead) {} + + void collectInstr(const MachineInstr &MI) const { + for (ConstMIBundleOperands OperI(&MI); OperI.isValid(); ++OperI) + collectOperand(*OperI); + + // Remove redundant physreg dead defs. + SmallVectorImpl::iterator I = + std::remove_if(RegOpers.DeadDefs.begin(), RegOpers.DeadDefs.end(), + std::bind1st(std::ptr_fun(containsReg), RegOpers.Defs)); + RegOpers.DeadDefs.erase(I, RegOpers.DeadDefs.end()); + } + + /// Push this operand's register onto the correct vectors. + void collectOperand(const MachineOperand &MO) const { if (!MO.isReg() || !MO.getReg()) return; + unsigned Reg = MO.getReg(); if (MO.readsReg()) - pushRegUnits(MO.getReg(), Uses); + pushRegUnits(Reg, RegOpers.Uses); if (MO.isDef()) { if (MO.isDead()) { if (!IgnoreDead) - pushRegUnits(MO.getReg(), DeadDefs); - } - else - pushRegUnits(MO.getReg(), Defs); + pushRegUnits(Reg, RegOpers.DeadDefs); + } else + pushRegUnits(Reg, RegOpers.Defs); } } -protected: - void pushRegUnits(unsigned Reg, SmallVectorImpl &RegUnits) { + void pushRegUnits(unsigned Reg, SmallVectorImpl &RegUnits) const { if (TargetRegisterInfo::isVirtualRegister(Reg)) { if (containsReg(RegUnits, Reg)) return; RegUnits.push_back(Reg); - } - else if (MRI->isAllocatable(Reg)) { - for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units) { + } else if (MRI.isAllocatable(Reg)) { + for (MCRegUnitIterator Units(Reg, &TRI); Units.isValid(); ++Units) { if (containsReg(RegUnits, *Units)) continue; RegUnits.push_back(*Units); } } } + + friend class RegisterOperands; }; -} // namespace -/// Collect physical and virtual register operands. -static void collectOperands(const MachineInstr *MI, - RegisterOperands &RegOpers) { - for (ConstMIBundleOperands OperI(MI); OperI.isValid(); ++OperI) - RegOpers.collect(*OperI); - - // Remove redundant physreg dead defs. - SmallVectorImpl::iterator I = - std::remove_if(RegOpers.DeadDefs.begin(), RegOpers.DeadDefs.end(), - std::bind1st(std::ptr_fun(containsReg), RegOpers.Defs)); - RegOpers.DeadDefs.erase(I, RegOpers.DeadDefs.end()); +void RegisterOperands::collect(const MachineInstr &MI, + const TargetRegisterInfo &TRI, + const MachineRegisterInfo &MRI, + bool IgnoreDead) { + RegisterOperandsCollector Collector(*this, TRI, MRI, IgnoreDead); + Collector.collectInstr(MI); } +} // namespace + /// Initialize an array of N PressureDiffs. 
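The RegisterPressure hunk above splits the old RegisterOperands class into a plain result object and a private collector that fills it in. A structural sketch of that split with stand-in operand/instruction types (the real collector also carries TRI/MRI and the IgnoreDead flag):

    #include <cassert>
    #include <vector>

    struct Operand {
      unsigned Reg;
      bool IsDef;
    };

    // Result object: only holds the collected registers.
    struct RegisterOperands {
      std::vector<unsigned> Uses;
      std::vector<unsigned> Defs;
      void collect(const std::vector<Operand> &MI);
    };

    namespace {
    // Collector: walks the instruction's operands and pushes them onto the
    // right vectors of the result object.
    class RegisterOperandsCollector {
      RegisterOperands &RegOpers;

    public:
      explicit RegisterOperandsCollector(RegisterOperands &RO) : RegOpers(RO) {}
      void collectInstr(const std::vector<Operand> &MI) const {
        for (const Operand &MO : MI)
          (MO.IsDef ? RegOpers.Defs : RegOpers.Uses).push_back(MO.Reg);
      }
    };
    } // namespace

    void RegisterOperands::collect(const std::vector<Operand> &MI) {
      RegisterOperandsCollector(*this).collectInstr(MI);
    }

    int main() {
      RegisterOperands RO;
      RO.collect({{1, false}, {2, true}});
      assert(RO.Uses.size() == 1 && RO.Defs.size() == 1);
    }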
void PressureDiffs::init(unsigned N) { Size = N; @@ -432,18 +448,18 @@ static void collectPDiff(PressureDiff &PDiff, RegisterOperands &RegOpers, const MachineRegisterInfo *MRI) { assert(!PDiff.begin()->isValid() && "stale PDiff"); - for (unsigned i = 0, e = RegOpers.Defs.size(); i != e; ++i) - PDiff.addPressureChange(RegOpers.Defs[i], true, MRI); + for (unsigned Reg : RegOpers.Defs) + PDiff.addPressureChange(Reg, true, MRI); - for (unsigned i = 0, e = RegOpers.Uses.size(); i != e; ++i) - PDiff.addPressureChange(RegOpers.Uses[i], false, MRI); + for (unsigned Reg : RegOpers.Uses) + PDiff.addPressureChange(Reg, false, MRI); } /// Force liveness of registers. void RegPressureTracker::addLiveRegs(ArrayRef Regs) { - for (unsigned i = 0, e = Regs.size(); i != e; ++i) { - if (LiveRegs.insert(Regs[i])) - increaseRegPressure(Regs[i]); + for (unsigned Reg : Regs) { + if (LiveRegs.insert(Reg)) + increaseRegPressure(Reg); } } @@ -474,13 +490,9 @@ void RegPressureTracker::discoverLiveOut(unsigned Reg) { /// registers that are both defined and used by the instruction. If a pressure /// difference pointer is provided record the changes is pressure caused by this /// instruction independent of liveness. -bool RegPressureTracker::recede(SmallVectorImpl *LiveUses, +void RegPressureTracker::recede(SmallVectorImpl *LiveUses, PressureDiff *PDiff) { - // Check for the top of the analyzable region. - if (CurrPos == MBB->begin()) { - closeRegion(); - return false; - } + assert(CurrPos != MBB->begin()); if (!isBottomClosed()) closeBottom(); @@ -492,11 +504,8 @@ bool RegPressureTracker::recede(SmallVectorImpl *LiveUses, do --CurrPos; while (CurrPos != MBB->begin() && CurrPos->isDebugValue()); + assert(!CurrPos->isDebugValue()); - if (CurrPos->isDebugValue()) { - closeRegion(); - return false; - } SlotIndex SlotIdx; if (RequireIntervals) SlotIdx = LIS->getInstructionIndex(CurrPos).getRegSlot(); @@ -505,8 +514,8 @@ bool RegPressureTracker::recede(SmallVectorImpl *LiveUses, if (RequireIntervals && isTopClosed()) static_cast(P).openTop(SlotIdx); - RegisterOperands RegOpers(TRI, MRI); - collectOperands(CurrPos, RegOpers); + RegisterOperands RegOpers; + RegOpers.collect(*CurrPos, *TRI, *MRI); if (PDiff) collectPDiff(*PDiff, RegOpers, MRI); @@ -517,8 +526,7 @@ bool RegPressureTracker::recede(SmallVectorImpl *LiveUses, // Kill liveness at live defs. // TODO: consider earlyclobbers? - for (unsigned i = 0, e = RegOpers.Defs.size(); i < e; ++i) { - unsigned Reg = RegOpers.Defs[i]; + for (unsigned Reg : RegOpers.Defs) { bool DeadDef = false; if (RequireIntervals) { const LiveRange *LR = getLiveRange(Reg); @@ -542,8 +550,7 @@ bool RegPressureTracker::recede(SmallVectorImpl *LiveUses, } // Generate liveness for uses. - for (unsigned i = 0, e = RegOpers.Uses.size(); i < e; ++i) { - unsigned Reg = RegOpers.Uses[i]; + for (unsigned Reg : RegOpers.Uses) { if (!LiveRegs.contains(Reg)) { // Adjust liveouts if LiveIntervals are available. if (RequireIntervals) { @@ -561,24 +568,18 @@ bool RegPressureTracker::recede(SmallVectorImpl *LiveUses, } } if (TrackUntiedDefs) { - for (unsigned i = 0, e = RegOpers.Defs.size(); i < e; ++i) { - unsigned Reg = RegOpers.Defs[i]; + for (unsigned Reg : RegOpers.Defs) { if (TargetRegisterInfo::isVirtualRegister(Reg) && !LiveRegs.contains(Reg)) UntiedDefs.insert(Reg); } } - return true; } /// Advance across the current instruction. -bool RegPressureTracker::advance() { +void RegPressureTracker::advance() { assert(!TrackUntiedDefs && "unsupported mode"); - // Check for the bottom of the analyzable region. 
- if (CurrPos == MBB->end()) { - closeRegion(); - return false; - } + assert(CurrPos != MBB->end()); if (!isTopClosed()) closeTop(); @@ -594,11 +595,10 @@ bool RegPressureTracker::advance() { static_cast(P).openBottom(CurrPos); } - RegisterOperands RegOpers(TRI, MRI); - collectOperands(CurrPos, RegOpers); + RegisterOperands RegOpers; + RegOpers.collect(*CurrPos, *TRI, *MRI); - for (unsigned i = 0, e = RegOpers.Uses.size(); i < e; ++i) { - unsigned Reg = RegOpers.Uses[i]; + for (unsigned Reg : RegOpers.Uses) { // Discover live-ins. bool isLive = LiveRegs.contains(Reg); if (!isLive) @@ -608,22 +608,19 @@ bool RegPressureTracker::advance() { if (RequireIntervals) { const LiveRange *LR = getLiveRange(Reg); lastUse = LR && LR->Query(SlotIdx).isKill(); - } - else { + } else { // Allocatable physregs are always single-use before register rewriting. lastUse = !TargetRegisterInfo::isVirtualRegister(Reg); } if (lastUse && isLive) { LiveRegs.erase(Reg); decreaseRegPressure(Reg); - } - else if (!lastUse && !isLive) + } else if (!lastUse && !isLive) increaseRegPressure(Reg); } // Generate liveness for defs. - for (unsigned i = 0, e = RegOpers.Defs.size(); i < e; ++i) { - unsigned Reg = RegOpers.Defs[i]; + for (unsigned Reg : RegOpers.Defs) { if (LiveRegs.insert(Reg)) increaseRegPressure(Reg); } @@ -636,7 +633,6 @@ bool RegPressureTracker::advance() { do ++CurrPos; while (CurrPos != MBB->end() && CurrPos->isDebugValue()); - return true; } /// Find the max change in excess pressure across all sets. @@ -662,8 +658,7 @@ static void computeExcessPressureDelta(ArrayRef OldPressureVec, PDiff = 0; // Under the limit else PDiff = PNew - Limit; // Just exceeded limit. - } - else if (Limit > PNew) + } else if (Limit > PNew) PDiff = Limit - POld; // Just obeyed limit. if (PDiff) { @@ -728,17 +723,12 @@ void RegPressureTracker::bumpUpwardPressure(const MachineInstr *MI) { assert(!MI->isDebugValue() && "Expect a nondebug instruction."); // Account for register pressure similar to RegPressureTracker::recede(). - RegisterOperands RegOpers(TRI, MRI, /*IgnoreDead=*/true); - collectOperands(MI, RegOpers); - - // Boost max pressure for all dead defs together. - // Since CurrSetPressure and MaxSetPressure - increaseRegPressure(RegOpers.DeadDefs); - decreaseRegPressure(RegOpers.DeadDefs); + RegisterOperands RegOpers; + RegOpers.collect(*MI, *TRI, *MRI, /*IgnoreDead=*/true); + assert(RegOpers.DeadDefs.size() == 0); // Kill liveness at live defs. - for (unsigned i = 0, e = RegOpers.Defs.size(); i < e; ++i) { - unsigned Reg = RegOpers.Defs[i]; + for (unsigned Reg : RegOpers.Defs) { bool DeadDef = false; if (RequireIntervals) { const LiveRange *LR = getLiveRange(Reg); @@ -754,8 +744,7 @@ void RegPressureTracker::bumpUpwardPressure(const MachineInstr *MI) { } } // Generate liveness for uses. - for (unsigned i = 0, e = RegOpers.Uses.size(); i < e; ++i) { - unsigned Reg = RegOpers.Uses[i]; + for (unsigned Reg : RegOpers.Uses) { if (!LiveRegs.contains(Reg)) increaseRegPressure(Reg); } @@ -923,8 +912,8 @@ void RegPressureTracker::bumpDownwardPressure(const MachineInstr *MI) { assert(!MI->isDebugValue() && "Expect a nondebug instruction."); // Account for register pressure similar to RegPressureTracker::recede(). - RegisterOperands RegOpers(TRI, MRI); - collectOperands(MI, RegOpers); + RegisterOperands RegOpers; + RegOpers.collect(*MI, *TRI, *MRI); // Kill liveness at last uses. Assume allocatable physregs are single-use // rather than checking LiveIntervals. 
@@ -932,8 +921,7 @@ void RegPressureTracker::bumpDownwardPressure(const MachineInstr *MI) { if (RequireIntervals) SlotIdx = LIS->getInstructionIndex(MI).getRegSlot(); - for (unsigned i = 0, e = RegOpers.Uses.size(); i < e; ++i) { - unsigned Reg = RegOpers.Uses[i]; + for (unsigned Reg : RegOpers.Uses) { if (RequireIntervals) { // FIXME: allow the caller to pass in the list of vreg uses that remain // to be bottom-scheduled to avoid searching uses at each query. @@ -944,8 +932,7 @@ void RegPressureTracker::bumpDownwardPressure(const MachineInstr *MI) { if (LRQ.isKill() && !findUseBetween(Reg, CurrIdx, SlotIdx, *MRI, LIS)) decreaseRegPressure(Reg); } - } - else if (!TargetRegisterInfo::isVirtualRegister(Reg)) { + } else if (!TargetRegisterInfo::isVirtualRegister(Reg)) { // Allocatable physregs are always single-use before register rewriting. decreaseRegPressure(Reg); } diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index c5810525f3c7..8238cdeb59ca 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -1213,6 +1213,10 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) { case ISD::STACKSAVE: Action = TLI.getOperationAction(Node->getOpcode(), MVT::Other); break; + case ISD::GET_DYNAMIC_AREA_OFFSET: + Action = TLI.getOperationAction(Node->getOpcode(), + Node->getValueType(0)); + break; case ISD::VAARG: Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0)); @@ -3295,6 +3299,10 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { Results.push_back(Node->getOperand(0)); } break; + case ISD::GET_DYNAMIC_AREA_OFFSET: + Results.push_back(DAG.getConstant(0, dl, Node->getValueType(0))); + Results.push_back(Results[0].getValue(0)); + break; case ISD::FCOPYSIGN: Results.push_back(ExpandFCOPYSIGN(Node)); break; diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp index e9bd52034ffd..78985e01ef9a 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp @@ -141,8 +141,8 @@ class ScheduleDAGRRList : public ScheduleDAGSDNodes { /// that are "live". These nodes must be scheduled before any other nodes that /// modifies the registers can be scheduled. unsigned NumLiveRegs; - std::vector LiveRegDefs; - std::vector LiveRegGens; + std::unique_ptr LiveRegDefs; + std::unique_ptr LiveRegGens; // Collect interferences between physical register use/defs. // Each interference is an SUnit and set of physical registers. @@ -328,8 +328,8 @@ void ScheduleDAGRRList::Schedule() { NumLiveRegs = 0; // Allocate slots for each physical register, plus one for a special register // to track the virtual resource of a calling sequence. - LiveRegDefs.resize(TRI->getNumRegs() + 1, nullptr); - LiveRegGens.resize(TRI->getNumRegs() + 1, nullptr); + LiveRegDefs.reset(new SUnit*[TRI->getNumRegs() + 1]()); + LiveRegGens.reset(new SUnit*[TRI->getNumRegs() + 1]()); CallSeqEndForStart.clear(); assert(Interferences.empty() && LRegsMap.empty() && "stale Interferences"); @@ -1218,7 +1218,7 @@ static MVT getPhysicalRegisterVT(SDNode *N, unsigned Reg, /// CheckForLiveRegDef - Return true and update live register vector if the /// specified register def of the specified SUnit clobbers any "live" registers. 
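The ScheduleDAGRRList change above replaces std::vector with a std::unique_ptr-owned array; the trailing "()" in new SUnit*[N]() value-initializes every slot to null, matching the old resize(N, nullptr). A standalone sketch of the allocation pattern, with SUnit replaced by a placeholder type:

    #include <cassert>
    #include <memory>

    struct SUnitStub {};  // placeholder for llvm::SUnit

    int main() {
      unsigned NumRegs = 128;

      std::unique_ptr<SUnitStub *[]> LiveRegDefs;

      // The trailing "()" value-initializes the array, so every element starts
      // out as nullptr, the same state the old resize(NumRegs + 1, nullptr)
      // produced, without vector's capacity bookkeeping.
      LiveRegDefs.reset(new SUnitStub *[NumRegs + 1]());

      for (unsigned I = 0; I != NumRegs + 1; ++I)
        assert(LiveRegDefs[I] == nullptr);

      // Consumers that want the raw buffer use get(); bounded views can be
      // layered on top of it (the patch builds an ArrayRef that way).
      SUnitStub **Raw = LiveRegDefs.get();
      (void)Raw;
      return 0;
    }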
static void CheckForLiveRegDef(SUnit *SU, unsigned Reg, - std::vector &LiveRegDefs, + SUnit **LiveRegDefs, SmallSet &RegAdded, SmallVectorImpl &LRegs, const TargetRegisterInfo *TRI) { @@ -1240,7 +1240,7 @@ static void CheckForLiveRegDef(SUnit *SU, unsigned Reg, /// CheckForLiveRegDefMasked - Check for any live physregs that are clobbered /// by RegMask, and add them to LRegs. static void CheckForLiveRegDefMasked(SUnit *SU, const uint32_t *RegMask, - std::vector &LiveRegDefs, + ArrayRef LiveRegDefs, SmallSet &RegAdded, SmallVectorImpl &LRegs) { // Look at all live registers. Skip Reg0 and the special CallResource. @@ -1278,7 +1278,7 @@ DelayForLiveRegsBottomUp(SUnit *SU, SmallVectorImpl &LRegs) { for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end(); I != E; ++I) { if (I->isAssignedRegDep() && LiveRegDefs[I->getReg()] != SU) - CheckForLiveRegDef(I->getSUnit(), I->getReg(), LiveRegDefs, + CheckForLiveRegDef(I->getSUnit(), I->getReg(), LiveRegDefs.get(), RegAdded, LRegs, TRI); } @@ -1302,7 +1302,7 @@ DelayForLiveRegsBottomUp(SUnit *SU, SmallVectorImpl &LRegs) { for (; NumVals; --NumVals, ++i) { unsigned Reg = cast(Node->getOperand(i))->getReg(); if (TargetRegisterInfo::isPhysicalRegister(Reg)) - CheckForLiveRegDef(SU, Reg, LiveRegDefs, RegAdded, LRegs, TRI); + CheckForLiveRegDef(SU, Reg, LiveRegDefs.get(), RegAdded, LRegs, TRI); } } else i += NumVals; @@ -1328,13 +1328,15 @@ DelayForLiveRegsBottomUp(SUnit *SU, SmallVectorImpl &LRegs) { } } if (const uint32_t *RegMask = getNodeRegMask(Node)) - CheckForLiveRegDefMasked(SU, RegMask, LiveRegDefs, RegAdded, LRegs); + CheckForLiveRegDefMasked(SU, RegMask, + makeArrayRef(LiveRegDefs.get(), TRI->getNumRegs()), + RegAdded, LRegs); const MCInstrDesc &MCID = TII->get(Node->getMachineOpcode()); if (!MCID.ImplicitDefs) continue; for (const uint16_t *Reg = MCID.getImplicitDefs(); *Reg; ++Reg) - CheckForLiveRegDef(SU, *Reg, LiveRegDefs, RegAdded, LRegs, TRI); + CheckForLiveRegDef(SU, *Reg, LiveRegDefs.get(), RegAdded, LRegs, TRI); } return !LRegs.empty(); diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index f754e24e3231..85e7e3c1bc8c 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -3301,18 +3301,18 @@ void SelectionDAGBuilder::visitMaskedStore(const CallInst &I) { // extract the spalt value and use it as a uniform base. // In all other cases the function returns 'false'. 
// -static bool getUniformBase(Value *& Ptr, SDValue& Base, SDValue& Index, +static bool getUniformBase(const Value *& Ptr, SDValue& Base, SDValue& Index, SelectionDAGBuilder* SDB) { SelectionDAG& DAG = SDB->DAG; LLVMContext &Context = *DAG.getContext(); assert(Ptr->getType()->isVectorTy() && "Uexpected pointer type"); - GetElementPtrInst *GEP = dyn_cast(Ptr); + const GetElementPtrInst *GEP = dyn_cast(Ptr); if (!GEP || GEP->getNumOperands() > 2) return false; - Value *GEPPtr = GEP->getPointerOperand(); + const Value *GEPPtr = GEP->getPointerOperand(); if (!GEPPtr->getType()->isVectorTy()) Ptr = GEPPtr; else if (!(Ptr = getSplatValue(GEPPtr))) @@ -3348,7 +3348,7 @@ void SelectionDAGBuilder::visitMaskedScatter(const CallInst &I) { SDLoc sdl = getCurSDLoc(); // llvm.masked.scatter.*(Src0, Ptrs, alignemt, Mask) - Value *Ptr = I.getArgOperand(1); + const Value *Ptr = I.getArgOperand(1); SDValue Src0 = getValue(I.getArgOperand(0)); SDValue Mask = getValue(I.getArgOperand(3)); EVT VT = Src0.getValueType(); @@ -3362,10 +3362,10 @@ void SelectionDAGBuilder::visitMaskedScatter(const CallInst &I) { SDValue Base; SDValue Index; - Value *BasePtr = Ptr; + const Value *BasePtr = Ptr; bool UniformBase = getUniformBase(BasePtr, Base, Index, this); - Value *MemOpBasePtr = UniformBase ? BasePtr : nullptr; + const Value *MemOpBasePtr = UniformBase ? BasePtr : nullptr; MachineMemOperand *MMO = DAG.getMachineFunction(). getMachineMemOperand(MachinePointerInfo(MemOpBasePtr), MachineMemOperand::MOStore, VT.getStoreSize(), @@ -3425,7 +3425,7 @@ void SelectionDAGBuilder::visitMaskedGather(const CallInst &I) { SDLoc sdl = getCurSDLoc(); // @llvm.masked.gather.*(Ptrs, alignment, Mask, Src0) - Value *Ptr = I.getArgOperand(0); + const Value *Ptr = I.getArgOperand(0); SDValue Src0 = getValue(I.getArgOperand(3)); SDValue Mask = getValue(I.getArgOperand(2)); @@ -3442,7 +3442,7 @@ void SelectionDAGBuilder::visitMaskedGather(const CallInst &I) { SDValue Root = DAG.getRoot(); SDValue Base; SDValue Index; - Value *BasePtr = Ptr; + const Value *BasePtr = Ptr; bool UniformBase = getUniformBase(BasePtr, Base, Index, this); bool ConstantMemory = false; if (UniformBase && @@ -4463,22 +4463,17 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { Address = BCI->getOperand(0); // Parameters are handled specially. bool isParameter = Variable->isParameter() || isa(Address); - - const AllocaInst *AI = dyn_cast(Address); - - if (isParameter && !AI) { - FrameIndexSDNode *FINode = dyn_cast(N.getNode()); - if (FINode) - // Byval parameter. We have a frame index at this point. - SDV = DAG.getFrameIndexDbgValue( - Variable, Expression, FINode->getIndex(), 0, dl, SDNodeOrder); - else { - // Address is an argument, so try to emit its dbg value using - // virtual register info from the FuncInfo.ValueMap. - EmitFuncArgumentDbgValue(Address, Variable, Expression, dl, 0, false, - N); - return nullptr; - } + auto FINode = dyn_cast(N.getNode()); + if (isParameter && FINode) { + // Byval parameter. We have a frame index at this point. + SDV = DAG.getFrameIndexDbgValue(Variable, Expression, + FINode->getIndex(), 0, dl, SDNodeOrder); + } else if (isa(Address)) { + // Address is an argument, so try to emit its dbg value using + // virtual register info from the FuncInfo.ValueMap. 
+ EmitFuncArgumentDbgValue(Address, Variable, Expression, dl, 0, false, + N); + return nullptr; } else { SDV = DAG.getDbgValue(Variable, Expression, N.getNode(), N.getResNo(), true, 0, dl, SDNodeOrder); @@ -4933,6 +4928,21 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { DAG.setRoot(DAG.getNode(ISD::STACKRESTORE, sdl, MVT::Other, getRoot(), Res)); return nullptr; } + case Intrinsic::get_dynamic_area_offset: { + SDValue Op = getRoot(); + EVT PtrTy = TLI.getPointerTy(DAG.getDataLayout()); + EVT ResTy = TLI.getValueType(DAG.getDataLayout(), I.getType()); + // Result type for @llvm.get.dynamic.area.offset should match PtrTy for the + // target. + if (PtrTy != ResTy) + report_fatal_error("Wrong result type for @llvm.get.dynamic.area.offset" + " intrinsic!"); + Res = DAG.getNode(ISD::GET_DYNAMIC_AREA_OFFSET, sdl, DAG.getVTList(ResTy), + Op); + DAG.setRoot(Op); + setValue(&I, Res); + return nullptr; + } case Intrinsic::stackprotector: { // Emit code into the DAG to store the stack guard onto the stack. MachineFunction &MF = DAG.getMachineFunction(); diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 195b48498605..a6f9699bb29c 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -310,6 +310,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::LIFETIME_END: return "lifetime.end"; case ISD::GC_TRANSITION_START: return "gc_transition.start"; case ISD::GC_TRANSITION_END: return "gc_transition.end"; + case ISD::GET_DYNAMIC_AREA_OFFSET: return "get.dynamic.area.offset"; // Bit manipulation case ISD::BITREVERSE: return "bitreverse"; diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index ebf071cb9946..f6c5d90f47ae 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -19,7 +19,7 @@ #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/CFG.h" -#include "llvm/Analysis/LibCallSemantics.h" +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/FastISel.h" @@ -264,13 +264,17 @@ namespace llvm { return; IS.OptLevel = NewOptLevel; IS.TM.setOptLevel(NewOptLevel); - SavedFastISel = IS.TM.Options.EnableFastISel; - if (NewOptLevel == CodeGenOpt::None) - IS.TM.setFastISel(true); DEBUG(dbgs() << "\nChanging optimization level for Function " << IS.MF->getFunction()->getName() << "\n"); DEBUG(dbgs() << "\tBefore: -O" << SavedOptLevel << " ; After: -O" << NewOptLevel << "\n"); + SavedFastISel = IS.TM.Options.EnableFastISel; + if (NewOptLevel == CodeGenOpt::None) { + IS.TM.setFastISel(IS.TM.getO0WantsFastISel()); + DEBUG(dbgs() << "\tFastISel is " + << (IS.TM.Options.EnableFastISel ?
"enabled" : "disabled") << "\n"); + } } ~OptLevelChanger() { diff --git a/lib/CodeGen/TailDuplication.cpp b/lib/CodeGen/TailDuplication.cpp index ff86dabfac59..1f5b54866ac6 100644 --- a/lib/CodeGen/TailDuplication.cpp +++ b/lib/CodeGen/TailDuplication.cpp @@ -745,12 +745,12 @@ TailDuplicatePass::duplicateSimpleBB(MachineBasicBlock *TailBB, if (PredTBB) TII->InsertBranch(*PredBB, PredTBB, PredFBB, PredCond, DebugLoc()); - uint32_t Weight = MBPI->getEdgeWeight(PredBB, TailBB); + auto Prob = MBPI->getEdgeProbability(PredBB, TailBB); PredBB->removeSuccessor(TailBB); unsigned NumSuccessors = PredBB->succ_size(); assert(NumSuccessors <= 1); if (NumSuccessors == 0 || *PredBB->succ_begin() != NewTarget) - PredBB->addSuccessor(NewTarget, Weight); + PredBB->addSuccessor(NewTarget, Prob); TDBBs.push_back(PredBB); } @@ -858,7 +858,7 @@ TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB, "TailDuplicate called on block with multiple successors!"); for (MachineBasicBlock::succ_iterator I = TailBB->succ_begin(), E = TailBB->succ_end(); I != E; ++I) - PredBB->addSuccessor(*I, MBPI->getEdgeWeight(TailBB, I)); + PredBB->addSuccessor(*I, MBPI->getEdgeProbability(TailBB, I)); Changed = true; ++NumTailDups; diff --git a/lib/CodeGen/TargetLoweringBase.cpp b/lib/CodeGen/TargetLoweringBase.cpp index e348095aa8fc..69c130809bb8 100644 --- a/lib/CodeGen/TargetLoweringBase.cpp +++ b/lib/CodeGen/TargetLoweringBase.cpp @@ -840,6 +840,9 @@ void TargetLoweringBase::initActions() { setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Expand); setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Expand); } + + // For most targets @llvm.get.dynamic.area.offset just returns 0. + setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, VT, Expand); } // Most targets ignore the @llvm.prefetch intrinsic. diff --git a/lib/CodeGen/TwoAddressInstructionPass.cpp b/lib/CodeGen/TwoAddressInstructionPass.cpp index c96c813b0c9b..c6bae2434586 100644 --- a/lib/CodeGen/TwoAddressInstructionPass.cpp +++ b/lib/CodeGen/TwoAddressInstructionPass.cpp @@ -83,21 +83,20 @@ class TwoAddressInstructionPass : public MachineFunctionPass { // The current basic block being processed. MachineBasicBlock *MBB; - // DistanceMap - Keep track the distance of a MI from the start of the - // current basic block. + // Keep track of the distance of a MI from the start of the current basic block. DenseMap DistanceMap; // Set of already processed instructions in the current block. SmallPtrSet Processed; - // SrcRegMap - A map from virtual registers to physical registers which are - // likely targets to be coalesced to due to copies from physical registers to - // virtual registers. e.g. v1024 = move r0. + // A map from virtual registers to physical registers which are likely targets + // to be coalesced to due to copies from physical registers to virtual + // registers. e.g. v1024 = move r0. DenseMap SrcRegMap; - // DstRegMap - A map from virtual registers to physical registers which are - // likely targets to be coalesced to due to copies to physical registers from - // virtual registers. e.g. r1 = move v1024. + // A map from virtual registers to physical registers which are likely targets + // to be coalesced to due to copies to physical registers from virtual + // registers. e.g. r1 = move v1024. DenseMap DstRegMap; bool sink3AddrInstruction(MachineInstr *MI, unsigned Reg, @@ -165,7 +164,7 @@ class TwoAddressInstructionPass : public MachineFunctionPass { MachineFunctionPass::getAnalysisUsage(AU); } - /// runOnMachineFunction - Pass entry point.
+ /// Pass entry point. bool runOnMachineFunction(MachineFunction&) override; }; } // end anonymous namespace @@ -181,10 +180,9 @@ char &llvm::TwoAddressInstructionPassID = TwoAddressInstructionPass::ID; static bool isPlainlyKilled(MachineInstr *MI, unsigned Reg, LiveIntervals *LIS); -/// sink3AddrInstruction - A two-address instruction has been converted to a -/// three-address instruction to avoid clobbering a register. Try to sink it -/// past the instruction that would kill the above mentioned register to reduce -/// register pressure. +/// A two-address instruction has been converted to a three-address instruction +/// to avoid clobbering a register. Try to sink it past the instruction that +/// would kill the above mentioned register to reduce register pressure. bool TwoAddressInstructionPass:: sink3AddrInstruction(MachineInstr *MI, unsigned SavedReg, MachineBasicBlock::iterator OldPos) { @@ -313,8 +311,7 @@ sink3AddrInstruction(MachineInstr *MI, unsigned SavedReg, return true; } -/// getSingleDef -- return the MachineInstr* if it is the single def of the Reg -/// in current BB. +/// Return the MachineInstr* if it is the single def of the Reg in current BB. static MachineInstr *getSingleDef(unsigned Reg, MachineBasicBlock *BB, const MachineRegisterInfo *MRI) { MachineInstr *Ret = nullptr; @@ -352,10 +349,10 @@ bool TwoAddressInstructionPass::isRevCopyChain(unsigned FromReg, unsigned ToReg, return false; } -/// noUseAfterLastDef - Return true if there are no intervening uses between the -/// last instruction in the MBB that defines the specified register and the -/// two-address instruction which is being processed. It also returns the last -/// def location by reference +/// Return true if there are no intervening uses between the last instruction +/// in the MBB that defines the specified register and the two-address +/// instruction which is being processed. It also returns the last def location +/// by reference. bool TwoAddressInstructionPass::noUseAfterLastDef(unsigned Reg, unsigned Dist, unsigned &LastDef) { LastDef = 0; @@ -376,9 +373,9 @@ bool TwoAddressInstructionPass::noUseAfterLastDef(unsigned Reg, unsigned Dist, return !(LastUse > LastDef && LastUse < Dist); } -/// isCopyToReg - Return true if the specified MI is a copy instruction or -/// a extract_subreg instruction. It also returns the source and destination -/// registers and whether they are physical registers by reference. +/// Return true if the specified MI is a copy instruction or an extract_subreg +/// instruction. It also returns the source and destination registers and +/// whether they are physical registers by reference. static bool isCopyToReg(MachineInstr &MI, const TargetInstrInfo *TII, unsigned &SrcReg, unsigned &DstReg, bool &IsSrcPhys, bool &IsDstPhys) { @@ -398,8 +395,8 @@ static bool isCopyToReg(MachineInstr &MI, const TargetInstrInfo *TII, return true; } -/// isPLainlyKilled - Test if the given register value, which is used by the -// given instruction, is killed by the given instruction. +/// Test if the given register value, which is used by the +/// given instruction, is killed by the given instruction. 
static bool isPlainlyKilled(MachineInstr *MI, unsigned Reg, LiveIntervals *LIS) { if (LIS && TargetRegisterInfo::isVirtualRegister(Reg) && @@ -425,7 +422,7 @@ static bool isPlainlyKilled(MachineInstr *MI, unsigned Reg, return MI->killsRegister(Reg); } -/// isKilled - Test if the given register value, which is used by the given +/// Test if the given register value, which is used by the given /// instruction, is killed by the given instruction. This looks through /// coalescable copies to see if the original value is potentially not killed. /// @@ -473,8 +470,8 @@ static bool isKilled(MachineInstr &MI, unsigned Reg, } } -/// isTwoAddrUse - Return true if the specified MI uses the specified register -/// as a two-address use. If so, return the destination register by reference. +/// Return true if the specified MI uses the specified register as a two-address +/// use. If so, return the destination register by reference. static bool isTwoAddrUse(MachineInstr &MI, unsigned Reg, unsigned &DstReg) { for (unsigned i = 0, NumOps = MI.getNumOperands(); i != NumOps; ++i) { const MachineOperand &MO = MI.getOperand(i); @@ -489,8 +486,8 @@ static bool isTwoAddrUse(MachineInstr &MI, unsigned Reg, unsigned &DstReg) { return false; } -/// findOnlyInterestingUse - Given a register, if has a single in-basic block -/// use, return the use instruction if it's a copy or a two-address use. +/// Given a register, if it has a single in-basic block use, return the use +/// instruction if it's a copy or a two-address use. static MachineInstr *findOnlyInterestingUse(unsigned Reg, MachineBasicBlock *MBB, MachineRegisterInfo *MRI, @@ -517,8 +514,8 @@ MachineInstr *findOnlyInterestingUse(unsigned Reg, MachineBasicBlock *MBB, return nullptr; } -/// getMappedReg - Return the physical register the specified virtual register -/// might be mapped to. +/// Return the physical register the specified virtual register might be mapped +/// to. static unsigned getMappedReg(unsigned Reg, DenseMap &RegMap) { while (TargetRegisterInfo::isVirtualRegister(Reg)) { @@ -532,8 +529,7 @@ getMappedReg(unsigned Reg, DenseMap &RegMap) { return 0; } -/// regsAreCompatible - Return true if the two registers are equal or aliased. -/// +/// Return true if the two registers are equal or aliased. static bool regsAreCompatible(unsigned RegA, unsigned RegB, const TargetRegisterInfo *TRI) { if (RegA == RegB) @@ -544,8 +540,8 @@ regsAreCompatible(unsigned RegA, unsigned RegB, const TargetRegisterInfo *TRI) { } -/// isProfitableToCommute - Return true if it's potentially profitable to commute -/// the two-address instruction that's being processed. +/// Return true if it's potentially profitable to commute the two-address +/// instruction that's being processed. bool TwoAddressInstructionPass:: isProfitableToCommute(unsigned regA, unsigned regB, unsigned regC, @@ -643,9 +639,8 @@ isProfitableToCommute(unsigned regA, unsigned regB, unsigned regC, return LastDefB && LastDefC && LastDefC > LastDefB; } -/// commuteInstruction - Commute a two-address instruction and update the basic -/// block, distance map, and live variables if needed. Return true if it is -/// successful. +/// Commute a two-address instruction and update the basic block, distance map, +/// and live variables if needed. Return true if it is successful.
bool TwoAddressInstructionPass::commuteInstruction(MachineInstr *MI, unsigned RegBIdx, unsigned RegCIdx, @@ -674,8 +669,8 @@ bool TwoAddressInstructionPass::commuteInstruction(MachineInstr *MI, return true; } -/// isProfitableToConv3Addr - Return true if it is profitable to convert the -/// given 2-address instruction to a 3-address one. +/// Return true if it is profitable to convert the given 2-address instruction +/// to a 3-address one. bool TwoAddressInstructionPass::isProfitableToConv3Addr(unsigned RegA,unsigned RegB){ // Look for situations like this: @@ -691,8 +686,8 @@ TwoAddressInstructionPass::isProfitableToConv3Addr(unsigned RegA,unsigned RegB){ return (ToRegA && !regsAreCompatible(FromRegB, ToRegA, TRI)); } -/// convertInstTo3Addr - Convert the specified two-address instruction into a -/// three address one. Return true if this transformation was successful. +/// Convert the specified two-address instruction into a three address one. +/// Return true if this transformation was successful. bool TwoAddressInstructionPass::convertInstTo3Addr(MachineBasicBlock::iterator &mi, MachineBasicBlock::iterator &nmi, @@ -733,8 +728,8 @@ TwoAddressInstructionPass::convertInstTo3Addr(MachineBasicBlock::iterator &mi, return true; } -/// scanUses - Scan forward recursively for only uses, update maps if the use -/// is a copy or a two-address instruction. +/// Scan forward recursively for only uses, update maps if the use is a copy or +/// a two-address instruction. void TwoAddressInstructionPass::scanUses(unsigned DstReg) { SmallVector VirtRegPairs; @@ -780,8 +775,8 @@ TwoAddressInstructionPass::scanUses(unsigned DstReg) { } } -/// processCopy - If the specified instruction is not yet processed, process it -/// if it's a copy. For a copy instruction, we find the physical registers the +/// If the specified instruction is not yet processed, process it if it's a +/// copy. For a copy instruction, we find the physical registers the /// source and destination registers might be mapped to. These are kept in /// point-to maps used to determine future optimizations. e.g. /// v1024 = mov r0 @@ -816,9 +811,9 @@ void TwoAddressInstructionPass::processCopy(MachineInstr *MI) { return; } -/// rescheduleMIBelowKill - If there is one more local instruction that reads -/// 'Reg' and it kills 'Reg, consider moving the instruction below the kill -/// instruction in order to eliminate the need for the copy. +/// If there is one more local instruction that reads 'Reg' and it kills 'Reg', +/// consider moving the instruction below the kill instruction in order to +/// eliminate the need for the copy. bool TwoAddressInstructionPass:: rescheduleMIBelowKill(MachineBasicBlock::iterator &mi, MachineBasicBlock::iterator &nmi, @@ -874,8 +869,8 @@ rescheduleMIBelowKill(MachineBasicBlock::iterator &mi, SmallSet Uses; SmallSet Kills; SmallSet Defs; - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); + for (const MachineOperand &MO : MI->operands()) { if (!MO.isReg()) continue; unsigned MOReg = MO.getReg(); @@ -917,8 +911,7 @@ rescheduleMIBelowKill(MachineBasicBlock::iterator &mi, OtherMI->isBranch() || OtherMI->isTerminator()) // Don't move pass calls, etc.
return false; - for (unsigned i = 0, e = OtherMI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = OtherMI->getOperand(i); + for (const MachineOperand &MO : OtherMI->operands()) { if (!MO.isReg()) continue; unsigned MOReg = MO.getReg(); @@ -987,8 +980,8 @@ rescheduleMIBelowKill(MachineBasicBlock::iterator &mi, return true; } -/// isDefTooClose - Return true if the re-scheduling will put the given -/// instruction too close to the defs of its register dependencies. +/// Return true if the re-scheduling will put the given instruction too close +/// to the defs of its register dependencies. bool TwoAddressInstructionPass::isDefTooClose(unsigned Reg, unsigned Dist, MachineInstr *MI) { for (MachineInstr &DefMI : MRI->def_instructions(Reg)) { @@ -1007,10 +1000,9 @@ bool TwoAddressInstructionPass::isDefTooClose(unsigned Reg, unsigned Dist, return false; } -/// rescheduleKillAboveMI - If there is one more local instruction that reads -/// 'Reg' and it kills 'Reg, consider moving the kill instruction above the -/// current two-address instruction in order to eliminate the need for the -/// copy. +/// If there is one more local instruction that reads 'Reg' and it kills 'Reg', +/// consider moving the kill instruction above the current two-address +/// instruction in order to eliminate the need for the copy. bool TwoAddressInstructionPass:: rescheduleKillAboveMI(MachineBasicBlock::iterator &mi, MachineBasicBlock::iterator &nmi, @@ -1058,8 +1050,7 @@ rescheduleKillAboveMI(MachineBasicBlock::iterator &mi, SmallSet Kills; SmallSet Defs; SmallSet LiveDefs; - for (unsigned i = 0, e = KillMI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = KillMI->getOperand(i); + for (const MachineOperand &MO : KillMI->operands()) { if (!MO.isReg()) continue; unsigned MOReg = MO.getReg(); @@ -1097,8 +1088,7 @@ rescheduleKillAboveMI(MachineBasicBlock::iterator &mi, // Don't move pass calls, etc. return false; SmallVector OtherDefs; - for (unsigned i = 0, e = OtherMI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = OtherMI->getOperand(i); + for (const MachineOperand &MO : OtherMI->operands()) { if (!MO.isReg()) continue; unsigned MOReg = MO.getReg(); @@ -1181,7 +1171,7 @@ bool TwoAddressInstructionPass::tryInstructionCommute(MachineInstr *MI, unsigned OtherOpIdx = MI->getDesc().getNumDefs(); for (; OtherOpIdx < OpsNum; OtherOpIdx++) { // The call of findCommutedOpIndices below only checks if BaseOpIdx - // and OtherOpIdx are commutable, it does not really searches for + // and OtherOpIdx are commutable, it does not really search for // other commutable operands and does not change the values of passed // variables. if (OtherOpIdx == BaseOpIdx || @@ -1213,13 +1203,13 @@ bool TwoAddressInstructionPass::tryInstructionCommute(MachineInstr *MI, return false; } -/// tryInstructionTransform - For the case where an instruction has a single -/// pair of tied register operands, attempt some transformations that may -/// either eliminate the tied operands or improve the opportunities for -/// coalescing away the register copy. Returns true if no copy needs to be -/// inserted to untie mi's operands (either because they were untied, or -/// because mi was rescheduled, and will be visited again later). If the -/// shouldOnlyCommute flag is true, only instruction commutation is attempted.
+/// For the case where an instruction has a single pair of tied register +/// operands, attempt some transformations that may either eliminate the tied +/// operands or improve the opportunities for coalescing away the register copy. +/// Returns true if no copy needs to be inserted to untie mi's operands +/// (either because they were untied, or because mi was rescheduled, and will +/// be visited again later). If the shouldOnlyCommute flag is true, only +/// instruction commutation is attempted. bool TwoAddressInstructionPass:: tryInstructionTransform(MachineBasicBlock::iterator &mi, MachineBasicBlock::iterator &nmi, @@ -1570,8 +1560,7 @@ TwoAddressInstructionPass::processTiedPairs(MachineInstr *MI, if (AllUsesCopied) { if (!IsEarlyClobber) { // Replace other (un-tied) uses of regB with LastCopiedReg. - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI->getOperand(i); + for (MachineOperand &MO : MI->operands()) { if (MO.isReg() && MO.getReg() == RegB && MO.getSubReg() == SubRegB && MO.isUse()) { if (MO.isKill()) { @@ -1608,8 +1597,7 @@ TwoAddressInstructionPass::processTiedPairs(MachineInstr *MI, // regB is still used in this instruction, but a kill flag was // removed from a different tied use of regB, so now we need to add // a kill flag to one of the remaining uses of regB. - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI->getOperand(i); + for (MachineOperand &MO : MI->operands()) { if (MO.isReg() && MO.getReg() == RegB && MO.isUse()) { MO.setIsKill(true); break; @@ -1618,8 +1606,7 @@ TwoAddressInstructionPass::processTiedPairs(MachineInstr *MI, } } -/// runOnMachineFunction - Reduce two-address instructions to two operands. -/// +/// Reduce two-address instructions to two operands. 
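Several hunks in this file rewrite getNumOperands()/getOperand(i) index loops as range-based for loops over MI->operands(). A standalone sketch of that rewrite against toy stand-in types (Operand and Instr below are made up; the real MachineInstr and MachineOperand live under include/llvm/CodeGen/):

    #include <vector>

    // Toy stand-ins, just to show the loop shape.
    struct Operand { bool IsReg; unsigned Reg; };

    struct Instr {
      std::vector<Operand> Ops;
      // Range accessor analogous to MachineInstr::operands().
      const std::vector<Operand> &operands() const { return Ops; }
      unsigned getNumOperands() const { return Ops.size(); }
      const Operand &getOperand(unsigned i) const { return Ops[i]; }
    };

    unsigned countRegOperands(const Instr &MI) {
      unsigned N = 0;
      // Before: for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
      //           const Operand &MO = MI.getOperand(i); ... }
      for (const Operand &MO : MI.operands())  // after: iterate operands directly
        if (MO.IsReg)
          ++N;
      return N;
    }

    int main() {
      Instr MI{{{true, 5}, {false, 0}, {true, 7}}};
      return countRegOperands(MI) == 2 ? 0 : 1;
    }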
bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &Func) { MF = &Func; const TargetMachine &TM = MF->getTarget(); diff --git a/lib/CodeGen/WinEHPrepare.cpp b/lib/CodeGen/WinEHPrepare.cpp index f3f4e3be389e..dee4b870434e 100644 --- a/lib/CodeGen/WinEHPrepare.cpp +++ b/lib/CodeGen/WinEHPrepare.cpp @@ -19,7 +19,7 @@ #include "llvm/CodeGen/Passes.h" #include "llvm/ADT/SetVector.h" #include "llvm/Analysis/CFG.h" -#include "llvm/Analysis/LibCallSemantics.h" +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/CodeGen/WinEHFuncInfo.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" diff --git a/lib/DebugInfo/DWARF/DWARFContext.cpp b/lib/DebugInfo/DWARF/DWARFContext.cpp index 2165d353ba09..a4195b75c47d 100644 --- a/lib/DebugInfo/DWARF/DWARFContext.cpp +++ b/lib/DebugInfo/DWARF/DWARFContext.cpp @@ -163,20 +163,12 @@ void DWARFContext::dump(raw_ostream &OS, DIDumpType DumpType) { if (DumpType == DIDT_All || DumpType == DIDT_CUIndex) { OS << "\n.debug_cu_index contents:\n"; - DataExtractor CUIndexData(getCUIndexSection(), isLittleEndian(), - savedAddressByteSize); - DWARFUnitIndex CUIndex; - if (CUIndex.parse(CUIndexData)) - CUIndex.dump(OS); + getCUIndex().dump(OS); } if (DumpType == DIDT_All || DumpType == DIDT_TUIndex) { OS << "\n.debug_tu_index contents:\n"; - DataExtractor TUIndexData(getTUIndexSection(), isLittleEndian(), - savedAddressByteSize); - DWARFUnitIndex TUIndex; - if (TUIndex.parse(TUIndexData)) - TUIndex.dump(OS); + getTUIndex().dump(OS); } if (DumpType == DIDT_All || DumpType == DIDT_LineDwo) { @@ -280,7 +272,7 @@ const DWARFUnitIndex &DWARFContext::getCUIndex() { DataExtractor CUIndexData(getCUIndexSection(), isLittleEndian(), 0); - CUIndex = llvm::make_unique(); + CUIndex = llvm::make_unique(DW_SECT_INFO); CUIndex->parse(CUIndexData); return *CUIndex; } @@ -291,7 +283,7 @@ const DWARFUnitIndex &DWARFContext::getTUIndex() { DataExtractor TUIndexData(getTUIndexSection(), isLittleEndian(), 0); - TUIndex = llvm::make_unique(); + TUIndex = llvm::make_unique(DW_SECT_TYPES); TUIndex->parse(TUIndexData); return *TUIndex; } diff --git a/lib/DebugInfo/DWARF/DWARFUnitIndex.cpp b/lib/DebugInfo/DWARF/DWARFUnitIndex.cpp index baefed30f368..1f1921649b57 100644 --- a/lib/DebugInfo/DWARF/DWARFUnitIndex.cpp +++ b/lib/DebugInfo/DWARF/DWARFUnitIndex.cpp @@ -30,6 +30,18 @@ void DWARFUnitIndex::Header::dump(raw_ostream &OS) const { } bool DWARFUnitIndex::parse(DataExtractor IndexData) { + bool b = parseImpl(IndexData); + if (!b) { + // Make sure we don't try to dump anything + Header.NumBuckets = 0; + // Release any partially initialized data. 
+ ColumnKinds.reset(); + Rows.reset(); + } + return b; +} + +bool DWARFUnitIndex::parseImpl(DataExtractor IndexData) { uint32_t Offset = 0; if (!Header.parse(IndexData, &Offset)) return false; @@ -62,7 +74,7 @@ bool DWARFUnitIndex::parse(DataExtractor IndexData) { // Read the Column Headers for (unsigned i = 0; i != Header.NumColumns; ++i) { ColumnKinds[i] = static_cast(IndexData.getU32(&Offset)); - if (ColumnKinds[i] == DW_SECT_INFO || ColumnKinds[i] == DW_SECT_TYPES) { + if (ColumnKinds[i] == InfoColumnKind) { if (InfoColumn != -1) return false; InfoColumn = i; @@ -107,6 +119,9 @@ StringRef DWARFUnitIndex::getColumnHeader(DWARFSectionKind DS) { } void DWARFUnitIndex::dump(raw_ostream &OS) const { + if (!Header.NumBuckets) + return; + Header.dump(OS); OS << "Index Signature "; for (unsigned i = 0; i != Header.NumColumns; ++i) diff --git a/lib/Fuzzer/FuzzerLoop.cpp b/lib/Fuzzer/FuzzerLoop.cpp index 7aea169b7ae9..9c52a4dbe774 100644 --- a/lib/Fuzzer/FuzzerLoop.cpp +++ b/lib/Fuzzer/FuzzerLoop.cpp @@ -238,7 +238,11 @@ void Fuzzer::RunOneAndUpdateCorpus(Unit &U) { } void Fuzzer::ExecuteCallback(const Unit &U) { - int Res = USF.TargetFunction(U.data(), U.size()); + const uint8_t *Data = U.data(); + uint8_t EmptyData; + if (!Data) + Data = &EmptyData; + int Res = USF.TargetFunction(Data, U.size()); (void)Res; assert(Res == 0); } diff --git a/lib/Fuzzer/test/CMakeLists.txt b/lib/Fuzzer/test/CMakeLists.txt index 1e02af149ad8..85e8706f11cc 100644 --- a/lib/Fuzzer/test/CMakeLists.txt +++ b/lib/Fuzzer/test/CMakeLists.txt @@ -38,6 +38,9 @@ set(UninstrumentedTests UninstrumentedTest ) +set(TraceBBTests + SimpleTest + ) set(TestBinaries) @@ -99,6 +102,11 @@ foreach(Test ${UninstrumentedTests}) set(TestBinaries ${TestBinaries} LLVMFuzzer-${Test}-Uninstrumented) endforeach() +add_subdirectory(trace-bb) + +foreach(Test ${TraceBBTests}) + set(TestBinaries ${TestBinaries} LLVMFuzzer-${Test}-TraceBB) +endforeach() set_target_properties(${TestBinaries} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} diff --git a/lib/Fuzzer/test/SimpleTest.cpp b/lib/Fuzzer/test/SimpleTest.cpp index 6811d115d960..04225a889f5d 100644 --- a/lib/Fuzzer/test/SimpleTest.cpp +++ b/lib/Fuzzer/test/SimpleTest.cpp @@ -1,4 +1,5 @@ // Simple test for a fuzzer. The fuzzer must find the string "Hi!". +#include #include #include #include @@ -7,6 +8,7 @@ static volatile int Sink; extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) { + assert(Data); if (Size > 0 && Data[0] == 'H') { Sink = 1; if (Size > 1 && Data[1] == 'i') { diff --git a/lib/Fuzzer/test/trace-bb/CMakeLists.txt b/lib/Fuzzer/test/trace-bb/CMakeLists.txt new file mode 100644 index 000000000000..99af019565b5 --- /dev/null +++ b/lib/Fuzzer/test/trace-bb/CMakeLists.txt @@ -0,0 +1,14 @@ +# These tests are not instrumented with coverage. + +set(CMAKE_CXX_FLAGS_RELEASE + "${LIBFUZZER_FLAGS_BASE} -fsanitize-coverage=edge,trace-bb") + +foreach(Test ${TraceBBTests}) + add_executable(LLVMFuzzer-${Test}-TraceBB + ../${Test}.cpp + ) + target_link_libraries(LLVMFuzzer-${Test}-TraceBB + LLVMFuzzer + ) +endforeach() + diff --git a/lib/IR/Attributes.cpp b/lib/IR/Attributes.cpp index bdefe5917fef..e9626ba7e3bc 100644 --- a/lib/IR/Attributes.cpp +++ b/lib/IR/Attributes.cpp @@ -770,6 +770,36 @@ AttributeSet AttributeSet::addAttribute(LLVMContext &C, unsigned Index, return addAttributes(C, Index, AttributeSet::get(C, Index, B)); } +AttributeSet AttributeSet::addAttribute(LLVMContext &C, + ArrayRef Indices, + Attribute A) const { + unsigned I = 0, E = pImpl ? 
pImpl->getNumAttributes() : 0; + auto IdxI = Indices.begin(), IdxE = Indices.end(); + SmallVector AttrSet; + + while (I != E && IdxI != IdxE) { + if (getSlotIndex(I) < *IdxI) + AttrSet.emplace_back(getSlotAttributes(I++)); + else if (getSlotIndex(I) > *IdxI) + AttrSet.emplace_back(AttributeSet::get(C, std::make_pair(*IdxI++, A))); + else { + AttrBuilder B(getSlotAttributes(I), *IdxI); + B.addAttribute(A); + AttrSet.emplace_back(AttributeSet::get(C, *IdxI, B)); + ++I; + ++IdxI; + } + } + + while (I != E) + AttrSet.emplace_back(getSlotAttributes(I++)); + + while (IdxI != IdxE) + AttrSet.emplace_back(AttributeSet::get(C, std::make_pair(*IdxI++, A))); + + return get(C, AttrSet); +} + AttributeSet AttributeSet::addAttributes(LLVMContext &C, unsigned Index, AttributeSet Attrs) const { if (!pImpl) return Attrs; diff --git a/lib/IR/Constants.cpp b/lib/IR/Constants.cpp index d79fb3e7b0ba..b4a07a1b6b4a 100644 --- a/lib/IR/Constants.cpp +++ b/lib/IR/Constants.cpp @@ -857,6 +857,57 @@ static bool rangeOnlyContains(ItTy Start, ItTy End, EltTy Elt) { return true; } +template +static Constant *getIntSequenceIfElementsMatch(ArrayRef V) { + assert(!V.empty() && "Cannot get empty int sequence."); + + SmallVector Elts; + for (Constant *C : V) + if (auto *CI = dyn_cast(C)) + Elts.push_back(CI->getZExtValue()); + else + return nullptr; + return SequentialTy::get(V[0]->getContext(), Elts); +} + +template +static Constant *getFPSequenceIfElementsMatch(ArrayRef V) { + assert(!V.empty() && "Cannot get empty FP sequence."); + + SmallVector Elts; + for (Constant *C : V) + if (auto *CFP = dyn_cast(C)) + Elts.push_back(CFP->getValueAPF().bitcastToAPInt().getLimitedValue()); + else + return nullptr; + return SequentialTy::getFP(V[0]->getContext(), Elts); +} + +template +static Constant *getSequenceIfElementsMatch(Constant *C, + ArrayRef V) { + // We speculatively build the elements here even if it turns out that there is + // a constantexpr or something else weird, since it is so uncommon for that to + // happen. + if (ConstantInt *CI = dyn_cast(C)) { + if (CI->getType()->isIntegerTy(8)) + return getIntSequenceIfElementsMatch(V); + else if (CI->getType()->isIntegerTy(16)) + return getIntSequenceIfElementsMatch(V); + else if (CI->getType()->isIntegerTy(32)) + return getIntSequenceIfElementsMatch(V); + else if (CI->getType()->isIntegerTy(64)) + return getIntSequenceIfElementsMatch(V); + } else if (ConstantFP *CFP = dyn_cast(C)) { + if (CFP->getType()->isFloatTy()) + return getFPSequenceIfElementsMatch(V); + else if (CFP->getType()->isDoubleTy()) + return getFPSequenceIfElementsMatch(V); + } + + return nullptr; +} + ConstantArray::ConstantArray(ArrayType *T, ArrayRef V) : Constant(T, ConstantArrayVal, OperandTraits::op_end(this) - V.size(), @@ -874,6 +925,7 @@ Constant *ConstantArray::get(ArrayType *Ty, ArrayRef V) { return C; return Ty->getContext().pImpl->ArrayConstants.getOrCreate(Ty, V); } + Constant *ConstantArray::getImpl(ArrayType *Ty, ArrayRef V) { // Empty arrays are canonicalized to ConstantAggregateZero. if (V.empty()) @@ -896,74 +948,8 @@ Constant *ConstantArray::getImpl(ArrayType *Ty, ArrayRef V) { // Check to see if all of the elements are ConstantFP or ConstantInt and if // the element type is compatible with ConstantDataVector. If so, use it. - if (ConstantDataSequential::isElementTypeCompatible(C->getType())) { - // We speculatively build the elements here even if it turns out that there - // is a constantexpr or something else weird in the array, since it is so - // uncommon for that to happen. 
- if (ConstantInt *CI = dyn_cast(C)) { - if (CI->getType()->isIntegerTy(8)) { - SmallVector Elts; - for (unsigned i = 0, e = V.size(); i != e; ++i) - if (ConstantInt *CI = dyn_cast(V[i])) - Elts.push_back(CI->getZExtValue()); - else - break; - if (Elts.size() == V.size()) - return ConstantDataArray::get(C->getContext(), Elts); - } else if (CI->getType()->isIntegerTy(16)) { - SmallVector Elts; - for (unsigned i = 0, e = V.size(); i != e; ++i) - if (ConstantInt *CI = dyn_cast(V[i])) - Elts.push_back(CI->getZExtValue()); - else - break; - if (Elts.size() == V.size()) - return ConstantDataArray::get(C->getContext(), Elts); - } else if (CI->getType()->isIntegerTy(32)) { - SmallVector Elts; - for (unsigned i = 0, e = V.size(); i != e; ++i) - if (ConstantInt *CI = dyn_cast(V[i])) - Elts.push_back(CI->getZExtValue()); - else - break; - if (Elts.size() == V.size()) - return ConstantDataArray::get(C->getContext(), Elts); - } else if (CI->getType()->isIntegerTy(64)) { - SmallVector Elts; - for (unsigned i = 0, e = V.size(); i != e; ++i) - if (ConstantInt *CI = dyn_cast(V[i])) - Elts.push_back(CI->getZExtValue()); - else - break; - if (Elts.size() == V.size()) - return ConstantDataArray::get(C->getContext(), Elts); - } - } - - if (ConstantFP *CFP = dyn_cast(C)) { - if (CFP->getType()->isFloatTy()) { - SmallVector Elts; - for (unsigned i = 0, e = V.size(); i != e; ++i) - if (ConstantFP *CFP = dyn_cast(V[i])) - Elts.push_back( - CFP->getValueAPF().bitcastToAPInt().getLimitedValue()); - else - break; - if (Elts.size() == V.size()) - return ConstantDataArray::getFP(C->getContext(), Elts); - } else if (CFP->getType()->isDoubleTy()) { - SmallVector Elts; - for (unsigned i = 0, e = V.size(); i != e; ++i) - if (ConstantFP *CFP = dyn_cast(V[i])) - Elts.push_back( - CFP->getValueAPF().bitcastToAPInt().getLimitedValue()); - else - break; - if (Elts.size() == V.size()) - return ConstantDataArray::getFP(C->getContext(), Elts); - } - } - } + if (ConstantDataSequential::isElementTypeCompatible(C->getType())) + return getSequenceIfElementsMatch(C, V); // Otherwise, we really do want to create a ConstantArray. return nullptr; @@ -1059,6 +1045,7 @@ Constant *ConstantVector::get(ArrayRef V) { VectorType *Ty = VectorType::get(V.front()->getType(), V.size()); return Ty->getContext().pImpl->VectorConstants.getOrCreate(Ty, V); } + Constant *ConstantVector::getImpl(ArrayRef V) { assert(!V.empty() && "Vectors can't be empty"); VectorType *T = VectorType::get(V.front()->getType(), V.size()); @@ -1084,74 +1071,8 @@ Constant *ConstantVector::getImpl(ArrayRef V) { // Check to see if all of the elements are ConstantFP or ConstantInt and if // the element type is compatible with ConstantDataVector. If so, use it. - if (ConstantDataSequential::isElementTypeCompatible(C->getType())) { - // We speculatively build the elements here even if it turns out that there - // is a constantexpr or something else weird in the array, since it is so - // uncommon for that to happen. 
- if (ConstantInt *CI = dyn_cast(C)) { - if (CI->getType()->isIntegerTy(8)) { - SmallVector Elts; - for (unsigned i = 0, e = V.size(); i != e; ++i) - if (ConstantInt *CI = dyn_cast(V[i])) - Elts.push_back(CI->getZExtValue()); - else - break; - if (Elts.size() == V.size()) - return ConstantDataVector::get(C->getContext(), Elts); - } else if (CI->getType()->isIntegerTy(16)) { - SmallVector Elts; - for (unsigned i = 0, e = V.size(); i != e; ++i) - if (ConstantInt *CI = dyn_cast(V[i])) - Elts.push_back(CI->getZExtValue()); - else - break; - if (Elts.size() == V.size()) - return ConstantDataVector::get(C->getContext(), Elts); - } else if (CI->getType()->isIntegerTy(32)) { - SmallVector Elts; - for (unsigned i = 0, e = V.size(); i != e; ++i) - if (ConstantInt *CI = dyn_cast(V[i])) - Elts.push_back(CI->getZExtValue()); - else - break; - if (Elts.size() == V.size()) - return ConstantDataVector::get(C->getContext(), Elts); - } else if (CI->getType()->isIntegerTy(64)) { - SmallVector Elts; - for (unsigned i = 0, e = V.size(); i != e; ++i) - if (ConstantInt *CI = dyn_cast(V[i])) - Elts.push_back(CI->getZExtValue()); - else - break; - if (Elts.size() == V.size()) - return ConstantDataVector::get(C->getContext(), Elts); - } - } - - if (ConstantFP *CFP = dyn_cast(C)) { - if (CFP->getType()->isFloatTy()) { - SmallVector Elts; - for (unsigned i = 0, e = V.size(); i != e; ++i) - if (ConstantFP *CFP = dyn_cast(V[i])) - Elts.push_back( - CFP->getValueAPF().bitcastToAPInt().getLimitedValue()); - else - break; - if (Elts.size() == V.size()) - return ConstantDataVector::getFP(C->getContext(), Elts); - } else if (CFP->getType()->isDoubleTy()) { - SmallVector Elts; - for (unsigned i = 0, e = V.size(); i != e; ++i) - if (ConstantFP *CFP = dyn_cast(V[i])) - Elts.push_back( - CFP->getValueAPF().bitcastToAPInt().getLimitedValue()); - else - break; - if (Elts.size() == V.size()) - return ConstantDataVector::getFP(C->getContext(), Elts); - } - } - } + if (ConstantDataSequential::isElementTypeCompatible(C->getType())) + return getSequenceIfElementsMatch(C, V); // Otherwise, the element type isn't compatible with ConstantDataVector, or // the operand list constants a ConstantExpr or something else strange. diff --git a/lib/IR/Function.cpp b/lib/IR/Function.cpp index 97e2a84a5cef..5e4d2d2054eb 100644 --- a/lib/IR/Function.cpp +++ b/lib/IR/Function.cpp @@ -411,12 +411,14 @@ void Function::clearGC() { } } -/// copyAttributesFrom - copy all additional attributes (those not needed to -/// create a Function) from the Function Src to this one. +/// Copy all additional attributes (those not needed to create a Function) from +/// the Function Src to this one. 
void Function::copyAttributesFrom(const GlobalValue *Src) { - assert(isa(Src) && "Expected a Function!"); GlobalObject::copyAttributesFrom(Src); - const Function *SrcF = cast(Src); + const Function *SrcF = dyn_cast(Src); + if (!SrcF) + return; + setCallingConv(SrcF->getCallingConv()); setAttributes(SrcF->getAttributes()); if (SrcF->hasGC()) diff --git a/lib/IR/Globals.cpp b/lib/IR/Globals.cpp index 8fde4b8e9d77..c538c7baa1fe 100644 --- a/lib/IR/Globals.cpp +++ b/lib/IR/Globals.cpp @@ -97,10 +97,11 @@ void GlobalObject::setGlobalObjectSubClassData(unsigned Val) { } void GlobalObject::copyAttributesFrom(const GlobalValue *Src) { - const auto *GV = cast(Src); - GlobalValue::copyAttributesFrom(GV); - setAlignment(GV->getAlignment()); - setSection(GV->getSection()); + GlobalValue::copyAttributesFrom(Src); + if (const auto *GV = dyn_cast(Src)) { + setAlignment(GV->getAlignment()); + setSection(GV->getSection()); + } } const char *GlobalValue::getSection() const { @@ -216,14 +217,14 @@ void GlobalVariable::setInitializer(Constant *InitVal) { } } -/// copyAttributesFrom - copy all additional attributes (those not needed to -/// create a GlobalVariable) from the GlobalVariable Src to this one. +/// Copy all additional attributes (those not needed to create a GlobalVariable) +/// from the GlobalVariable Src to this one. void GlobalVariable::copyAttributesFrom(const GlobalValue *Src) { - assert(isa(Src) && "Expected a GlobalVariable!"); GlobalObject::copyAttributesFrom(Src); - const GlobalVariable *SrcVar = cast(Src); - setThreadLocalMode(SrcVar->getThreadLocalMode()); - setExternallyInitialized(SrcVar->isExternallyInitialized()); + if (const GlobalVariable *SrcVar = dyn_cast(Src)) { + setThreadLocalMode(SrcVar->getThreadLocalMode()); + setExternallyInitialized(SrcVar->isExternallyInitialized()); + } } diff --git a/lib/IR/Verifier.cpp b/lib/IR/Verifier.cpp index 617d965f4cfc..5cbb597ca269 100644 --- a/lib/IR/Verifier.cpp +++ b/lib/IR/Verifier.cpp @@ -95,6 +95,12 @@ struct VerifierSupport { Write(&*I); } + void Write(const Module *M) { + if (!M) + return; + OS << "; ModuleID = '" << M->getModuleIdentifier() << "'\n"; + } + void Write(const Value *V) { if (!V) return; @@ -1721,7 +1727,8 @@ void Verifier::visitFunction(const Function &F) { auto *Per = dyn_cast(F.getPersonalityFn()->stripPointerCasts()); if (Per) Assert(Per->getParent() == F.getParent(), - "Referencing personality function in another module!", &F, Per); + "Referencing personality function in another module!", + &F, F.getParent(), Per, Per->getParent()); } if (F.isMaterializable()) { @@ -3165,7 +3172,7 @@ void Verifier::visitInstruction(Instruction &I) { " donothing or patchpoint", &I); Assert(F->getParent() == M, "Referencing function in another module!", - &I); + &I, M, F, F->getParent()); } else if (BasicBlock *OpBB = dyn_cast(I.getOperand(i))) { Assert(OpBB->getParent() == BB->getParent(), "Referring to a basic block in another function!", &I); @@ -3173,7 +3180,7 @@ void Verifier::visitInstruction(Instruction &I) { Assert(OpArg->getParent() == BB->getParent(), "Referring to an argument in another function!", &I); } else if (GlobalValue *GV = dyn_cast(I.getOperand(i))) { - Assert(GV->getParent() == M, "Referencing global in another module!", &I); + Assert(GV->getParent() == M, "Referencing global in another module!", &I, M, GV, GV->getParent()); } else if (isa(I.getOperand(i))) { verifyDominatesUse(I, i); } else if (isa(I.getOperand(i))) { diff --git a/lib/LTO/LTOCodeGenerator.cpp b/lib/LTO/LTOCodeGenerator.cpp index 
931bcf0d23fc..468ec24e3a06 100644 --- a/lib/LTO/LTOCodeGenerator.cpp +++ b/lib/LTO/LTOCodeGenerator.cpp @@ -67,14 +67,14 @@ const char* LTOCodeGenerator::getVersionString() { LTOCodeGenerator::LTOCodeGenerator() : Context(getGlobalContext()), MergedModule(new Module("ld-temp.o", Context)), - IRLinker(MergedModule.get()) { + IRLinker(new Linker(*MergedModule)) { initializeLTOPasses(); } LTOCodeGenerator::LTOCodeGenerator(std::unique_ptr Context) : OwnedContext(std::move(Context)), Context(*OwnedContext), MergedModule(new Module("ld-temp.o", *OwnedContext)), - IRLinker(MergedModule.get()) { + IRLinker(new Linker(*MergedModule)) { initializeLTOPasses(); } @@ -114,7 +114,7 @@ bool LTOCodeGenerator::addModule(LTOModule *Mod) { assert(&Mod->getModule().getContext() == &Context && "Expected module in same context"); - bool ret = IRLinker.linkInModule(&Mod->getModule()); + bool ret = IRLinker->linkInModule(Mod->getModule()); const std::vector &undefs = Mod->getAsmUndefinedRefs(); for (int i = 0, e = undefs.size(); i != e; ++i) @@ -130,7 +130,7 @@ void LTOCodeGenerator::setModule(std::unique_ptr Mod) { AsmUndefinedRefs.clear(); MergedModule = Mod->takeModule(); - IRLinker.setModule(MergedModule.get()); + IRLinker = make_unique(*MergedModule); const std::vector &Undefs = Mod->getAsmUndefinedRefs(); for (int I = 0, E = Undefs.size(); I != E; ++I) diff --git a/lib/Linker/LinkModules.cpp b/lib/Linker/LinkModules.cpp index 6b60379803e1..67613967f490 100644 --- a/lib/Linker/LinkModules.cpp +++ b/lib/Linker/LinkModules.cpp @@ -13,28 +13,18 @@ #include "llvm/Linker/Linker.h" #include "llvm-c/Linker.h" -#include "llvm/ADT/Hashing.h" -#include "llvm/ADT/Optional.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallString.h" -#include "llvm/ADT/Statistic.h" #include "llvm/ADT/Triple.h" #include "llvm/IR/Constants.h" -#include "llvm/IR/DebugInfo.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/DiagnosticPrinter.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/IR/TypeFinder.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/Cloning.h" -#include -#include using namespace llvm; - //===----------------------------------------------------------------------===// // TypeMap implementation. //===----------------------------------------------------------------------===// @@ -42,22 +32,22 @@ using namespace llvm; namespace { class TypeMapTy : public ValueMapTypeRemapper { /// This is a mapping from a source type to a destination type to use. - DenseMap MappedTypes; + DenseMap MappedTypes; /// When checking to see if two subgraphs are isomorphic, we speculatively /// add types to MappedTypes, but keep track of them here in case we need to /// roll back. - SmallVector SpeculativeTypes; + SmallVector SpeculativeTypes; - SmallVector SpeculativeDstOpaqueTypes; + SmallVector SpeculativeDstOpaqueTypes; /// This is a list of non-opaque structs in the source module that are mapped /// to an opaque struct in the destination module. - SmallVector SrcDefinitionsToResolve; + SmallVector SrcDefinitionsToResolve; /// This is the set of opaque types in the destination modules who are /// getting a body from the source module. - SmallPtrSet DstResolvedOpaqueTypes; + SmallPtrSet DstResolvedOpaqueTypes; public: TypeMapTy(Linker::IdentifiedStructTypeSet &DstStructTypesSet) @@ -179,7 +169,7 @@ bool TypeMapTy::areTypesIsomorphic(Type *DstTy, Type *SrcTy) { // Fail if any of the extra properties (e.g. 
array size) of the type disagree. if (isa(DstTy)) - return false; // bitwidth disagrees. + return false; // bitwidth disagrees. if (PointerType *PT = dyn_cast(DstTy)) { if (PT->getAddressSpace() != cast(SrcTy)->getAddressSpace()) return false; @@ -215,7 +205,7 @@ bool TypeMapTy::areTypesIsomorphic(Type *DstTy, Type *SrcTy) { } void TypeMapTy::linkDefinedTypeBodies() { - SmallVector Elements; + SmallVector Elements; for (StructType *SrcSTy : SrcDefinitionsToResolve) { StructType *DstSTy = cast(MappedTypes[SrcSTy]); assert(DstSTy->isOpaque()); @@ -390,7 +380,8 @@ void LinkDiagnosticInfo::print(DiagnosticPrinter &DP) const { DP << Msg; } /// This is an implementation class for the LinkModules function, which is the /// entrypoint for this file. class ModuleLinker { - Module *DstM, *SrcM; + Module &DstM; + Module &SrcM; TypeMapTy TypeMap; ValueMaterializerTy ValMaterializer; @@ -401,16 +392,7 @@ class ModuleLinker { /// but this allows us to reuse the ValueMapper code. ValueToValueMapTy ValueMap; - struct AppendingVarInfo { - GlobalVariable *NewGV; // New aggregate global in dest module. - const Constant *DstInit; // Old initializer from dest module. - const Constant *SrcInit; // Old initializer from src module. - }; - - std::vector AppendingVars; - - // Set of items not to link in from source. - SmallPtrSet DoNotLinkFromSource; + SetVector ValuesToLink; DiagnosticHandlerFunction DiagnosticHandler; @@ -423,28 +405,29 @@ class ModuleLinker { /// Function to import from source module, all other functions are /// imported as declarations instead of definitions. - Function *ImportFunction; + DenseSet *ImportFunction; /// Set to true if the given FunctionInfoIndex contains any functions /// from this source module, in which case we must conservatively assume /// that any of its functions may be imported into another module /// as part of a different backend compilation process. - bool HasExportedFunctions; + bool HasExportedFunctions = false; /// Set to true when all global value body linking is complete (including /// lazy linking). Used to prevent metadata linking from creating new /// references. - bool DoneLinkingBodies; + bool DoneLinkingBodies = false; + + bool HasError = false; public: - ModuleLinker(Module *dstM, Linker::IdentifiedStructTypeSet &Set, Module *srcM, + ModuleLinker(Module &DstM, Linker::IdentifiedStructTypeSet &Set, Module &SrcM, DiagnosticHandlerFunction DiagnosticHandler, unsigned Flags, const FunctionInfoIndex *Index = nullptr, - Function *FuncToImport = nullptr) - : DstM(dstM), SrcM(srcM), TypeMap(Set), ValMaterializer(this), + DenseSet *FunctionsToImport = nullptr) + : DstM(DstM), SrcM(SrcM), TypeMap(Set), ValMaterializer(this), DiagnosticHandler(DiagnosticHandler), Flags(Flags), ImportIndex(Index), - ImportFunction(FuncToImport), HasExportedFunctions(false), - DoneLinkingBodies(false) { + ImportFunction(FunctionsToImport) { assert((ImportIndex || !ImportFunction) && "Expect a FunctionInfoIndex when importing"); // If we have a FunctionInfoIndex but no function to import, @@ -469,20 +452,18 @@ class ModuleLinker { /// Handles cloning of a global values from the source module into /// the destination module, including setting the attributes and visibility. GlobalValue *copyGlobalValueProto(TypeMapTy &TypeMap, const GlobalValue *SGV, - const GlobalValue *DGV = nullptr); + const GlobalValue *DGV, bool ForDefinition); /// Check if we should promote the given local value to global scope. 
bool doPromoteLocalToGlobal(const GlobalValue *SGV); - /// Check if all global value body linking is complete. - bool doneLinkingBodies() { return DoneLinkingBodies; } - bool shouldLinkFromSource(bool &LinkFromSrc, const GlobalValue &Dest, const GlobalValue &Src); /// Helper method for setting a message and returning an error code. bool emitError(const Twine &Message) { DiagnosticHandler(LinkDiagnosticInfo(DS_Error, Message)); + HasError = true; return true; } @@ -490,7 +471,7 @@ class ModuleLinker { DiagnosticHandler(LinkDiagnosticInfo(DS_Warning, Message)); } - bool getComdatLeader(Module *M, StringRef ComdatName, + bool getComdatLeader(Module &M, StringRef ComdatName, const GlobalVariable *&GVar); bool computeResultingSelectionKind(StringRef ComdatName, Comdat::SelectionKind Src, @@ -513,7 +494,7 @@ class ModuleLinker { return nullptr; // Otherwise see if we have a match in the destination module's symtab. - GlobalValue *DGV = DstM->getNamedValue(getName(SrcGV)); + GlobalValue *DGV = DstM.getNamedValue(getName(SrcGV)); if (!DGV) return nullptr; @@ -531,18 +512,17 @@ class ModuleLinker { void upgradeMismatchedGlobalArray(StringRef Name); void upgradeMismatchedGlobals(); + bool linkIfNeeded(GlobalValue &GV); bool linkAppendingVarProto(GlobalVariable *DstGV, const GlobalVariable *SrcGV); bool linkGlobalValueProto(GlobalValue *GV); bool linkModuleFlagsMetadata(); - void linkAppendingVarInit(AppendingVarInfo &AVI); - void linkGlobalInit(GlobalVariable &Dst, GlobalVariable &Src); bool linkFunctionBody(Function &Dst, Function &Src); void linkAliasBody(GlobalAlias &Dst, GlobalAlias &Src); - bool linkGlobalValueBody(GlobalValue &Src); + bool linkGlobalValueBody(GlobalValue &Dst, GlobalValue &Src); /// Functions that take care of cloning a specific global value type /// into the destination module. @@ -601,10 +581,10 @@ static void forceRenaming(GlobalValue *GV, StringRef Name) { // If there is a conflict, rename the conflict. if (GlobalValue *ConflictGV = M->getNamedValue(Name)) { GV->takeName(ConflictGV); - ConflictGV->setName(Name); // This will cause ConflictGV to get renamed + ConflictGV->setName(Name); // This will cause ConflictGV to get renamed assert(ConflictGV->getName() != Name && "forceRenaming didn't work"); } else { - GV->setName(Name); // Force the name back + GV->setName(Name); // Force the name back } } @@ -612,18 +592,7 @@ static void forceRenaming(GlobalValue *GV, StringRef Name) { /// from the SrcGV to the DestGV. void ModuleLinker::copyGVAttributes(GlobalValue *NewGV, const GlobalValue *SrcGV) { - auto *GA = dyn_cast(SrcGV); - // Check for the special case of converting an alias (definition) to a - // non-alias (declaration). This can happen when we are importing and - // encounter a weak_any alias (weak_any defs may not be imported, see - // comments in ModuleLinker::getLinkage) or an alias whose base object is - // being imported as a declaration. In that case copy the attributes from the - // base object. - if (GA && !dyn_cast(NewGV)) { - assert(isPerformingImport() && !doImportAsDefinition(GA)); - NewGV->copyAttributesFrom(GA->getBaseObject()); - } else - NewGV->copyAttributesFrom(SrcGV); + NewGV->copyAttributesFrom(SrcGV); forceRenaming(NewGV, getName(SrcGV)); } @@ -651,7 +620,7 @@ bool ModuleLinker::doImportAsDefinition(const GlobalValue *SGV) { return true; // Only import the function requested for importing. auto *SF = dyn_cast(SGV); - if (SF && SF == ImportFunction) + if (SF && ImportFunction->count(SF)) return true; // Otherwise no. 
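A note on the interface change visible here: the importer is now driven by a set of requested globals (ImportFunction is a DenseSet pointer) and doImportAsDefinition simply tests membership. A caller-side sketch, purely illustrative (module and function names are made up; it assumes llvm/Linker/Linker.h and llvm/IR/Module.h, an already-loaded FunctionInfoIndex named Index, and the Linker::Flags spelling used by this revision):

    llvm::DenseSet<const llvm::GlobalValue *> FunctionsToImport;
    if (llvm::Function *F = SrcM.getFunction("foo"))
      FunctionsToImport.insert(F);
    if (llvm::Function *G = SrcM.getFunction("bar"))
      FunctionsToImport.insert(G);
    llvm::Linker L(DstM);
    // Returns true on error; several functions can now be imported in one pass.
    bool Failed = L.linkInModule(SrcM, llvm::Linker::Flags::None, &Index,
                                 &FunctionsToImport);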
return false; @@ -798,11 +767,12 @@ ModuleLinker::copyGlobalVariableProto(TypeMapTy &TypeMap, // No linking to be performed or linking from the source: simply create an // identical version of the symbol over in the dest module... the // initializer will be filled in later by LinkGlobalInits. - GlobalVariable *NewDGV = new GlobalVariable( - *DstM, TypeMap.get(SGVar->getType()->getElementType()), - SGVar->isConstant(), getLinkage(SGVar), /*init*/ nullptr, getName(SGVar), - /*insertbefore*/ nullptr, SGVar->getThreadLocalMode(), - SGVar->getType()->getAddressSpace()); + GlobalVariable *NewDGV = + new GlobalVariable(DstM, TypeMap.get(SGVar->getType()->getElementType()), + SGVar->isConstant(), GlobalValue::ExternalLinkage, + /*init*/ nullptr, getName(SGVar), + /*insertbefore*/ nullptr, SGVar->getThreadLocalMode(), + SGVar->getType()->getAddressSpace()); return NewDGV; } @@ -813,42 +783,18 @@ Function *ModuleLinker::copyFunctionProto(TypeMapTy &TypeMap, const Function *SF) { // If there is no linkage to be performed or we are linking from the source, // bring SF over. - return Function::Create(TypeMap.get(SF->getFunctionType()), getLinkage(SF), - getName(SF), DstM); + return Function::Create(TypeMap.get(SF->getFunctionType()), + GlobalValue::ExternalLinkage, getName(SF), &DstM); } /// Set up prototypes for any aliases that come over from the source module. GlobalValue *ModuleLinker::copyGlobalAliasProto(TypeMapTy &TypeMap, const GlobalAlias *SGA) { - // If we are importing and encounter a weak_any alias, or an alias to - // an object being imported as a declaration, we must import the alias - // as a declaration as well, which involves converting it to a non-alias. - // See comments in ModuleLinker::getLinkage for why we cannot import - // weak_any defintions. - if (isPerformingImport() && !doImportAsDefinition(SGA)) { - // Need to convert to declaration. All aliases must be definitions. - const GlobalValue *GVal = SGA->getBaseObject(); - GlobalValue *NewGV; - if (auto *GVar = dyn_cast(GVal)) - NewGV = copyGlobalVariableProto(TypeMap, GVar); - else { - auto *F = dyn_cast(GVal); - assert(F); - NewGV = copyFunctionProto(TypeMap, F); - } - // Set the linkage to External or ExternalWeak (see comments in - // ModuleLinker::getLinkage for why WeakAny is converted to ExternalWeak). - if (SGA->hasWeakAnyLinkage()) - NewGV->setLinkage(GlobalValue::ExternalWeakLinkage); - else - NewGV->setLinkage(GlobalValue::ExternalLinkage); - return NewGV; - } // If there is no linkage to be performed or we're linking from the source, // bring over SGA. 
auto *Ty = TypeMap.get(SGA->getValueType()); return GlobalAlias::create(Ty, SGA->getType()->getPointerAddressSpace(), - getLinkage(SGA), getName(SGA), DstM); + GlobalValue::ExternalLinkage, getName(SGA), &DstM); } static GlobalValue::VisibilityTypes @@ -876,14 +822,31 @@ void ModuleLinker::setVisibility(GlobalValue *NewGV, const GlobalValue *SGV, GlobalValue *ModuleLinker::copyGlobalValueProto(TypeMapTy &TypeMap, const GlobalValue *SGV, - const GlobalValue *DGV) { + const GlobalValue *DGV, + bool ForDefinition) { GlobalValue *NewGV; - if (auto *SGVar = dyn_cast(SGV)) + if (auto *SGVar = dyn_cast(SGV)) { NewGV = copyGlobalVariableProto(TypeMap, SGVar); - else if (auto *SF = dyn_cast(SGV)) + } else if (auto *SF = dyn_cast(SGV)) { NewGV = copyFunctionProto(TypeMap, SF); - else - NewGV = copyGlobalAliasProto(TypeMap, cast(SGV)); + } else { + if (ForDefinition) + NewGV = copyGlobalAliasProto(TypeMap, cast(SGV)); + else + NewGV = new GlobalVariable( + DstM, TypeMap.get(SGV->getType()->getElementType()), + /*isConstant*/ false, GlobalValue::ExternalLinkage, + /*init*/ nullptr, getName(SGV), + /*insertbefore*/ nullptr, SGV->getThreadLocalMode(), + SGV->getType()->getAddressSpace()); + } + + if (ForDefinition) + NewGV->setLinkage(getLinkage(SGV)); + else if (SGV->hasAvailableExternallyLinkage() || SGV->hasWeakLinkage() || + SGV->hasLinkOnceLinkage()) + NewGV->setLinkage(GlobalValue::ExternalWeakLinkage); + copyGVAttributes(NewGV, SGV); setVisibility(NewGV, SGV, DGV); return NewGV; @@ -898,22 +861,8 @@ Value *ModuleLinker::materializeDeclFor(Value *V) { if (!SGV) return nullptr; - // If we are done linking global value bodies (i.e. we are performing - // metadata linking), don't link in the global value due to this - // reference, simply map it to null. - if (doneLinkingBodies()) - return nullptr; - - GlobalValue *DGV = copyGlobalValueProto(TypeMap, SGV); - - if (Comdat *SC = SGV->getComdat()) { - if (auto *DGO = dyn_cast(DGV)) { - Comdat *DC = DstM->getOrInsertComdat(SC->getName()); - DGO->setComdat(DC); - } - } - - return DGV; + linkGlobalValueProto(SGV); + return ValueMap[SGV]; } void ValueMaterializerTy::materializeInitFor(GlobalValue *New, @@ -921,22 +870,39 @@ void ValueMaterializerTy::materializeInitFor(GlobalValue *New, return ModLinker->materializeInitFor(New, Old); } +static bool shouldLazyLink(const GlobalValue &GV) { + return GV.hasLocalLinkage() || GV.hasLinkOnceLinkage() || + GV.hasAvailableExternallyLinkage(); +} + void ModuleLinker::materializeInitFor(GlobalValue *New, GlobalValue *Old) { + if (auto *F = dyn_cast(New)) { + if (!F->isDeclaration()) + return; + } else if (auto *V = dyn_cast(New)) { + if (V->hasInitializer()) + return; + } else { + auto *A = cast(New); + if (A->getAliasee()) + return; + } + + if (Old->isDeclaration()) + return; + if (isPerformingImport() && !doImportAsDefinition(Old)) return; - // Skip declarations that ValueMaterializer may have created in - // case we link in only some of SrcM. 
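Worth spelling out the linkage rule introduced by the declaration-only path of copyGlobalValueProto above: a source symbol whose definition may legitimately never be materialized (available_externally, weak or linkonce) is declared extern_weak in the destination, so the reference is allowed to stay unresolved; everything else is declared external. A minimal sketch of that mapping (illustrative only, assumes llvm/IR/GlobalValue.h):

    llvm::GlobalValue::LinkageTypes declLinkageFor(const llvm::GlobalValue &SGV) {
      if (SGV.hasAvailableExternallyLinkage() || SGV.hasWeakLinkage() ||
          SGV.hasLinkOnceLinkage())
        return llvm::GlobalValue::ExternalWeakLinkage; // definition may never arrive
      return llvm::GlobalValue::ExternalLinkage;       // plain declaration otherwise
    }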
- if (shouldLinkOnlyNeeded() && Old->isDeclaration()) + if (!ValuesToLink.count(Old) && !shouldLazyLink(*Old)) return; - assert(!Old->isDeclaration() && "users should not pass down decls"); - linkGlobalValueBody(*Old); + linkGlobalValueBody(*New, *Old); } -bool ModuleLinker::getComdatLeader(Module *M, StringRef ComdatName, +bool ModuleLinker::getComdatLeader(Module &M, StringRef ComdatName, const GlobalVariable *&GVar) { - const GlobalValue *GVal = M->getNamedValue(ComdatName); + const GlobalValue *GVal = M.getNamedValue(ComdatName); if (const auto *GA = dyn_cast_or_null(GVal)) { GVal = GA->getBaseObject(); if (!GVal) @@ -995,8 +961,8 @@ bool ModuleLinker::computeResultingSelectionKind(StringRef ComdatName, getComdatLeader(SrcM, ComdatName, SrcGV)) return true; - const DataLayout &DstDL = DstM->getDataLayout(); - const DataLayout &SrcDL = SrcM->getDataLayout(); + const DataLayout &DstDL = DstM.getDataLayout(); + const DataLayout &SrcDL = SrcM.getDataLayout(); uint64_t DstSize = DstDL.getTypeAllocSize(DstGV->getType()->getPointerElementType()); uint64_t SrcSize = @@ -1028,7 +994,7 @@ bool ModuleLinker::getComdatResult(const Comdat *SrcC, bool &LinkFromSrc) { Comdat::SelectionKind SSK = SrcC->getSelectionKind(); StringRef ComdatName = SrcC->getName(); - Module::ComdatSymTabType &ComdatSymTab = DstM->getComdatSymbolTable(); + Module::ComdatSymTabType &ComdatSymTab = DstM.getComdatSymbolTable(); Module::ComdatSymTabType::iterator DstCI = ComdatSymTab.find(ComdatName); if (DstCI == ComdatSymTab.end()) { @@ -1069,7 +1035,7 @@ bool ModuleLinker::shouldLinkFromSource(bool &LinkFromSrc, if (isa(&Src)) { // For functions, LinkFromSrc iff this is the function requested // for importing. For variables, decide below normally. - LinkFromSrc = (&Src == ImportFunction); + LinkFromSrc = ImportFunction->count(&Src); return false; } @@ -1156,7 +1122,7 @@ bool ModuleLinker::shouldLinkFromSource(bool &LinkFromSrc, /// types 'Foo' but one got renamed when the module was loaded into the same /// LLVMContext. void ModuleLinker::computeTypeMapping() { - for (GlobalValue &SGV : SrcM->globals()) { + for (GlobalValue &SGV : SrcM.globals()) { GlobalValue *DGV = getLinkedToGlobal(&SGV); if (!DGV) continue; @@ -1172,12 +1138,12 @@ void ModuleLinker::computeTypeMapping() { TypeMap.addTypeMapping(DAT->getElementType(), SAT->getElementType()); } - for (GlobalValue &SGV : *SrcM) { + for (GlobalValue &SGV : SrcM) { if (GlobalValue *DGV = getLinkedToGlobal(&SGV)) TypeMap.addTypeMapping(DGV->getType(), SGV.getType()); } - for (GlobalValue &SGV : SrcM->aliases()) { + for (GlobalValue &SGV : SrcM.aliases()) { if (GlobalValue *DGV = getLinkedToGlobal(&SGV)) TypeMap.addTypeMapping(DGV->getType(), SGV.getType()); } @@ -1186,7 +1152,7 @@ void ModuleLinker::computeTypeMapping() { // At this point, the destination module may have a type "%foo = { i32 }" for // example. When the source module got loaded into the same LLVMContext, if // it had the same type, it would have been renamed to "%foo.42 = { i32 }". - std::vector Types = SrcM->getIdentifiedStructTypes(); + std::vector Types = SrcM.getIdentifiedStructTypes(); for (StructType *ST : Types) { if (!ST->hasName()) continue; @@ -1199,7 +1165,7 @@ void ModuleLinker::computeTypeMapping() { continue; // Check to see if the destination module has a struct with the prefix name. 
- StructType *DST = DstM->getTypeByName(ST->getName().substr(0, DotPos)); + StructType *DST = DstM.getTypeByName(ST->getName().substr(0, DotPos)); if (!DST) continue; @@ -1273,10 +1239,10 @@ static void upgradeGlobalArray(GlobalVariable *GV) { void ModuleLinker::upgradeMismatchedGlobalArray(StringRef Name) { // Look for the global arrays. - auto *DstGV = dyn_cast_or_null(DstM->getNamedValue(Name)); + auto *DstGV = dyn_cast_or_null(DstM.getNamedValue(Name)); if (!DstGV) return; - auto *SrcGV = dyn_cast_or_null(SrcM->getNamedValue(Name)); + auto *SrcGV = dyn_cast_or_null(SrcM.getNamedValue(Name)); if (!SrcGV) return; @@ -1306,6 +1272,14 @@ void ModuleLinker::upgradeMismatchedGlobals() { upgradeMismatchedGlobalArray("llvm.global_dtors"); } +static void getArrayElements(const Constant *C, + SmallVectorImpl &Dest) { + unsigned NumElements = cast(C->getType())->getNumElements(); + + for (unsigned i = 0; i != NumElements; ++i) + Dest.push_back(C->getAggregateElement(i)); +} + /// If there were any appending global variables, link them together now. /// Return true on error. bool ModuleLinker::linkAppendingVarProto(GlobalVariable *DstGV, @@ -1314,10 +1288,8 @@ bool ModuleLinker::linkAppendingVarProto(GlobalVariable *DstGV, cast(TypeMap.get(SrcGV->getType()->getElementType())); Type *EltTy = SrcTy->getElementType(); - uint64_t NewSize = SrcTy->getNumElements(); if (DstGV) { ArrayType *DstTy = cast(DstGV->getType()->getElementType()); - NewSize += DstTy->getNumElements(); if (!SrcGV->hasAppendingLinkage() || !DstGV->hasAppendingLinkage()) return emitError( @@ -1347,35 +1319,55 @@ bool ModuleLinker::linkAppendingVarProto(GlobalVariable *DstGV, "Appending variables with different section name need to be linked!"); } + SmallVector DstElements; + if (DstGV) + getArrayElements(DstGV->getInitializer(), DstElements); + + SmallVector SrcElements; + getArrayElements(SrcGV->getInitializer(), SrcElements); + + StringRef Name = SrcGV->getName(); + bool IsNewStructor = + (Name == "llvm.global_ctors" || Name == "llvm.global_dtors") && + cast(EltTy)->getNumElements() == 3; + if (IsNewStructor) + SrcElements.erase( + std::remove_if(SrcElements.begin(), SrcElements.end(), + [this](Constant *E) { + auto *Key = dyn_cast( + E->getAggregateElement(2)->stripPointerCasts()); + return Key && !ValuesToLink.count(Key) && + !shouldLazyLink(*Key); + }), + SrcElements.end()); + uint64_t NewSize = DstElements.size() + SrcElements.size(); ArrayType *NewType = ArrayType::get(EltTy, NewSize); // Create the new global variable. GlobalVariable *NG = new GlobalVariable( - *DstM, NewType, SrcGV->isConstant(), SrcGV->getLinkage(), + DstM, NewType, SrcGV->isConstant(), SrcGV->getLinkage(), /*init*/ nullptr, /*name*/ "", DstGV, SrcGV->getThreadLocalMode(), SrcGV->getType()->getAddressSpace()); // Propagate alignment, visibility and section info. copyGVAttributes(NG, SrcGV); - AppendingVarInfo AVI; - AVI.NewGV = NG; - AVI.DstInit = DstGV ? DstGV->getInitializer() : nullptr; - AVI.SrcInit = SrcGV->getInitializer(); - AppendingVars.push_back(AVI); - // Replace any uses of the two global variables with uses of the new // global. 
ValueMap[SrcGV] = ConstantExpr::getBitCast(NG, TypeMap.get(SrcGV->getType())); + for (auto *V : SrcElements) { + DstElements.push_back( + MapValue(V, ValueMap, RF_MoveDistinctMDs, &TypeMap, &ValMaterializer)); + } + + NG->setInitializer(ConstantArray::get(NewType, DstElements)); + if (DstGV) { DstGV->replaceAllUsesWith(ConstantExpr::getBitCast(NG, DstGV->getType())); DstGV->eraseFromParent(); } - // Track the source variable so we don't try to link it. - DoNotLinkFromSource.insert(SrcGV); - return false; } @@ -1384,14 +1376,6 @@ bool ModuleLinker::linkGlobalValueProto(GlobalValue *SGV) { // Handle the ultra special appending linkage case first. assert(!DGV || SGV->hasAppendingLinkage() == DGV->hasAppendingLinkage()); - if (SGV->hasAppendingLinkage() && isPerformingImport()) { - // Don't want to append to global_ctors list, for example, when we - // are importing for ThinLTO, otherwise the global ctors and dtors - // get executed multiple times for local variables (the latter causing - // double frees). - DoNotLinkFromSource.insert(SGV); - return false; - } if (SGV->hasAppendingLinkage()) return linkAppendingVarProto(cast_or_null(DGV), cast(SGV)); @@ -1400,66 +1384,47 @@ bool ModuleLinker::linkGlobalValueProto(GlobalValue *SGV) { Comdat *C = nullptr; bool HasUnnamedAddr = SGV->hasUnnamedAddr(); - if (const Comdat *SC = SGV->getComdat()) { + if (isPerformingImport() && !doImportAsDefinition(SGV)) { + LinkFromSrc = false; + } else if (const Comdat *SC = SGV->getComdat()) { Comdat::SelectionKind SK; std::tie(SK, LinkFromSrc) = ComdatsChosen[SC]; - C = DstM->getOrInsertComdat(SC->getName()); + C = DstM.getOrInsertComdat(SC->getName()); C->setSelectionKind(SK); - ComdatMembers[SC].push_back(SGV); + if (SGV->hasLocalLinkage()) + LinkFromSrc = true; } else if (DGV) { if (shouldLinkFromSource(LinkFromSrc, *DGV, *SGV)) return true; } - if (!LinkFromSrc) { - // Track the source global so that we don't attempt to copy it over when - // processing global initializers. - DoNotLinkFromSource.insert(SGV); - - if (DGV) - // Make sure to remember this mapping. - ValueMap[SGV] = - ConstantExpr::getBitCast(DGV, TypeMap.get(SGV->getType())); + if (!LinkFromSrc && DGV) { + // Make sure to remember this mapping. + ValueMap[SGV] = ConstantExpr::getBitCast(DGV, TypeMap.get(SGV->getType())); } if (DGV) HasUnnamedAddr = HasUnnamedAddr && DGV->hasUnnamedAddr(); - if (!LinkFromSrc && !DGV) - return false; - GlobalValue *NewGV; - if (!LinkFromSrc) { + if (!LinkFromSrc && DGV) { NewGV = DGV; // When linking from source we setVisibility from copyGlobalValueProto. setVisibility(NewGV, SGV, DGV); } else { - // If the GV is to be lazily linked, don't create it just yet. - // The ValueMaterializerTy will deal with creating it if it's used. - if (!DGV && !shouldOverrideFromSrc() && SGV != ImportFunction && - (SGV->hasLocalLinkage() || SGV->hasLinkOnceLinkage() || - SGV->hasAvailableExternallyLinkage())) { - DoNotLinkFromSource.insert(SGV); + // If we are done linking global value bodies (i.e. we are performing + // metadata linking), don't link in the global value due to this + // reference, simply map it to null. + if (DoneLinkingBodies) return false; - } - // When we only want to link in unresolved dependencies, blacklist - // the symbol unless unless DestM has a matching declaration (DGV). 
- if (shouldLinkOnlyNeeded() && !(DGV && DGV->isDeclaration())) { - DoNotLinkFromSource.insert(SGV); - return false; - } - - NewGV = copyGlobalValueProto(TypeMap, SGV, DGV); - - if (isPerformingImport() && !doImportAsDefinition(SGV)) - DoNotLinkFromSource.insert(SGV); + NewGV = copyGlobalValueProto(TypeMap, SGV, DGV, LinkFromSrc); } NewGV->setUnnamedAddr(HasUnnamedAddr); if (auto *NewGO = dyn_cast(NewGV)) { - if (C) + if (C && LinkFromSrc) NewGO->setComdat(C); if (DGV && DGV->hasCommonLinkage() && SGV->hasCommonLinkage()) @@ -1486,56 +1451,6 @@ bool ModuleLinker::linkGlobalValueProto(GlobalValue *SGV) { return false; } -static void getArrayElements(const Constant *C, - SmallVectorImpl &Dest) { - unsigned NumElements = cast(C->getType())->getNumElements(); - - for (unsigned i = 0; i != NumElements; ++i) - Dest.push_back(C->getAggregateElement(i)); -} - -void ModuleLinker::linkAppendingVarInit(AppendingVarInfo &AVI) { - // Merge the initializer. - SmallVector DstElements; - if (AVI.DstInit) - getArrayElements(AVI.DstInit, DstElements); - - SmallVector SrcElements; - getArrayElements(AVI.SrcInit, SrcElements); - - ArrayType *NewType = cast(AVI.NewGV->getType()->getElementType()); - - StringRef Name = AVI.NewGV->getName(); - bool IsNewStructor = - (Name == "llvm.global_ctors" || Name == "llvm.global_dtors") && - cast(NewType->getElementType())->getNumElements() == 3; - - for (auto *V : SrcElements) { - if (IsNewStructor) { - Constant *Key = V->getAggregateElement(2); - if (DoNotLinkFromSource.count(Key)) - continue; - } - DstElements.push_back( - MapValue(V, ValueMap, RF_MoveDistinctMDs, &TypeMap, &ValMaterializer)); - } - if (DstElements.size() != NewType->getNumElements()) { - NewType = ArrayType::get(NewType->getElementType(), DstElements.size()); - GlobalVariable *Old = AVI.NewGV; - GlobalVariable *NG = new GlobalVariable( - *DstM, NewType, Old->isConstant(), Old->getLinkage(), /*init*/ nullptr, - /*name*/ "", Old, Old->getThreadLocalMode(), - Old->getType()->getAddressSpace()); - copyGVAttributes(NG, Old); - AVI.NewGV->replaceAllUsesWith( - ConstantExpr::getBitCast(NG, AVI.NewGV->getType())); - AVI.NewGV->eraseFromParent(); - AVI.NewGV = NG; - } - - AVI.NewGV->setInitializer(ConstantArray::get(NewType, DstElements)); -} - /// Update the initializers in the Dest module now that all globals that may be /// referenced are in Dest. void ModuleLinker::linkGlobalInit(GlobalVariable &Dst, GlobalVariable &Src) { @@ -1574,7 +1489,7 @@ bool ModuleLinker::linkFunctionBody(Function &Dst, Function &Src) { // Go through and convert function arguments over, remembering the mapping. Function::arg_iterator DI = Dst.arg_begin(); for (Argument &Arg : Src.args()) { - DI->setName(Arg.getName()); // Copy the name over. + DI->setName(Arg.getName()); // Copy the name over. // Add a mapping to our mapping. 
ValueMap[&Arg] = &*DI; @@ -1616,9 +1531,7 @@ void ModuleLinker::linkAliasBody(GlobalAlias &Dst, GlobalAlias &Src) { Dst.setAliasee(Val); } -bool ModuleLinker::linkGlobalValueBody(GlobalValue &Src) { - Value *Dst = ValueMap[&Src]; - assert(Dst); +bool ModuleLinker::linkGlobalValueBody(GlobalValue &Dst, GlobalValue &Src) { if (const Comdat *SC = Src.getComdat()) { // To ensure that we don't generate an incomplete comdat group, // we must materialize and map in any other members that are not @@ -1633,26 +1546,26 @@ bool ModuleLinker::linkGlobalValueBody(GlobalValue &Src) { } } if (shouldInternalizeLinkedSymbols()) - if (auto *DGV = dyn_cast(Dst)) + if (auto *DGV = dyn_cast(&Dst)) DGV->setLinkage(GlobalValue::InternalLinkage); if (auto *F = dyn_cast(&Src)) - return linkFunctionBody(cast(*Dst), *F); + return linkFunctionBody(cast(Dst), *F); if (auto *GVar = dyn_cast(&Src)) { - linkGlobalInit(cast(*Dst), *GVar); + linkGlobalInit(cast(Dst), *GVar); return false; } - linkAliasBody(cast(*Dst), cast(Src)); + linkAliasBody(cast(Dst), cast(Src)); return false; } /// Insert all of the named MDNodes in Src into the Dest module. void ModuleLinker::linkNamedMDNodes() { - const NamedMDNode *SrcModFlags = SrcM->getModuleFlagsMetadata(); - for (const NamedMDNode &NMD : SrcM->named_metadata()) { + const NamedMDNode *SrcModFlags = SrcM.getModuleFlagsMetadata(); + for (const NamedMDNode &NMD : SrcM.named_metadata()) { // Don't link module flags here. Do them separately. if (&NMD == SrcModFlags) continue; - NamedMDNode *DestNMD = DstM->getOrInsertNamedMetadata(NMD.getName()); + NamedMDNode *DestNMD = DstM.getOrInsertNamedMetadata(NMD.getName()); // Add Src elements into Dest node. for (const MDNode *op : NMD.operands()) DestNMD->addOperand(MapMetadata( @@ -1664,12 +1577,13 @@ void ModuleLinker::linkNamedMDNodes() { /// Merge the linker flags in Src into the Dest module. bool ModuleLinker::linkModuleFlagsMetadata() { // If the source module has no module flags, we are done. - const NamedMDNode *SrcModFlags = SrcM->getModuleFlagsMetadata(); - if (!SrcModFlags) return false; + const NamedMDNode *SrcModFlags = SrcM.getModuleFlagsMetadata(); + if (!SrcModFlags) + return false; // If the destination module doesn't have module flags yet, then just copy // over the source module's flags. - NamedMDNode *DstModFlags = DstM->getOrInsertModuleFlagsMetadata(); + NamedMDNode *DstModFlags = DstM.getOrInsertModuleFlagsMetadata(); if (DstModFlags->getNumOperands() == 0) { for (unsigned I = 0, E = SrcModFlags->getNumOperands(); I != E; ++I) DstModFlags->addOperand(SrcModFlags->getOperand(I)); @@ -1679,7 +1593,7 @@ bool ModuleLinker::linkModuleFlagsMetadata() { // First build a map of the existing module flags and requirements. DenseMap> Flags; - SmallSetVector Requirements; + SmallSetVector Requirements; for (unsigned I = 0, E = DstModFlags->getNumOperands(); I != E; ++I) { MDNode *Op = DstModFlags->getOperand(I); ConstantInt *Behavior = mdconst::extract(Op->getOperand(0)); @@ -1752,7 +1666,7 @@ bool ModuleLinker::linkModuleFlagsMetadata() { auto replaceDstValue = [&](MDNode *New) { Metadata *FlagOps[] = {DstOp->getOperand(0), ID, New}; - MDNode *Flag = MDNode::get(DstM->getContext(), FlagOps); + MDNode *Flag = MDNode::get(DstM.getContext(), FlagOps); DstModFlags->setOperand(DstIndex, Flag); Flags[ID].first = Flag; }; @@ -1760,7 +1674,8 @@ bool ModuleLinker::linkModuleFlagsMetadata() { // Perform the merge for standard behavior types. 
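The switch that follows merges module flags according to their behavior; for the AppendUnique case a SmallSetVector gives set semantics while preserving insertion order. A tiny illustration (assumes llvm/ADT/SetVector.h and llvm/ADT/StringRef.h, with strings standing in for the Metadata operands):

    const char *DstOps[] = {"a", "b"};   // operands already on the destination flag
    const char *SrcOps[] = {"b", "c"};   // operands coming from the source flag
    llvm::SmallSetVector<llvm::StringRef, 4> Elts;
    for (const char *S : DstOps)
      Elts.insert(S);
    for (const char *S : SrcOps)
      Elts.insert(S);
    // Elts now holds "a", "b", "c": duplicates collapse, order is preserved,
    // matching the AppendUnique result built from DstValue then SrcValue.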
switch (SrcBehaviorValue) { case Module::Require: - case Module::Override: llvm_unreachable("not possible"); + case Module::Override: + llvm_unreachable("not possible"); case Module::Error: { // Emit an error if the values differ. if (SrcOp->getOperand(2) != DstOp->getOperand(2)) { @@ -1785,7 +1700,7 @@ bool ModuleLinker::linkModuleFlagsMetadata() { MDs.append(DstValue->op_begin(), DstValue->op_end()); MDs.append(SrcValue->op_begin(), SrcValue->op_end()); - replaceDstValue(MDNode::get(DstM->getContext(), MDs)); + replaceDstValue(MDNode::get(DstM.getContext(), MDs)); break; } case Module::AppendUnique: { @@ -1795,7 +1710,7 @@ bool ModuleLinker::linkModuleFlagsMetadata() { Elts.insert(DstValue->op_begin(), DstValue->op_end()); Elts.insert(SrcValue->op_begin(), SrcValue->op_end()); - replaceDstValue(MDNode::get(DstM->getContext(), + replaceDstValue(MDNode::get(DstM.getContext(), makeArrayRef(Elts.begin(), Elts.end()))); break; } @@ -1823,16 +1738,15 @@ bool ModuleLinker::linkModuleFlagsMetadata() { static bool triplesMatch(const Triple &T0, const Triple &T1) { // If vendor is apple, ignore the version number. if (T0.getVendor() == Triple::Apple) - return T0.getArch() == T1.getArch() && - T0.getSubArch() == T1.getSubArch() && - T0.getVendor() == T1.getVendor() && - T0.getOS() == T1.getOS(); + return T0.getArch() == T1.getArch() && T0.getSubArch() == T1.getSubArch() && + T0.getVendor() == T1.getVendor() && T0.getOS() == T1.getOS(); return T0 == T1; } // This function returns the merged triple. -static std::string mergeTriples(const Triple &SrcTriple, const Triple &DstTriple) { +static std::string mergeTriples(const Triple &SrcTriple, + const Triple &DstTriple) { // If vendor is apple, pick the triple with the larger version number. if (SrcTriple.getVendor() == Triple::Apple) if (DstTriple.isOSVersionLT(SrcTriple)) @@ -1841,52 +1755,112 @@ static std::string mergeTriples(const Triple &SrcTriple, const Triple &DstTriple return DstTriple.str(); } -bool ModuleLinker::run() { - assert(DstM && "Null destination module"); - assert(SrcM && "Null source module"); +bool ModuleLinker::linkIfNeeded(GlobalValue &GV) { + GlobalValue *DGV = getLinkedToGlobal(&GV); + + if (shouldLinkOnlyNeeded() && !(DGV && DGV->isDeclaration())) + return false; + + if (DGV && !GV.hasLocalLinkage() && !GV.hasAppendingLinkage()) { + auto *DGVar = dyn_cast(DGV); + auto *SGVar = dyn_cast(&GV); + if (DGVar && SGVar) { + if (DGVar->isDeclaration() && SGVar->isDeclaration() && + (!DGVar->isConstant() || !SGVar->isConstant())) { + DGVar->setConstant(false); + SGVar->setConstant(false); + } + if (DGVar->hasCommonLinkage() && SGVar->hasCommonLinkage()) { + unsigned Align = std::max(DGVar->getAlignment(), SGVar->getAlignment()); + SGVar->setAlignment(Align); + DGVar->setAlignment(Align); + } + } + + GlobalValue::VisibilityTypes Visibility = + getMinVisibility(DGV->getVisibility(), GV.getVisibility()); + DGV->setVisibility(Visibility); + GV.setVisibility(Visibility); + + bool HasUnnamedAddr = GV.hasUnnamedAddr() && DGV->hasUnnamedAddr(); + DGV->setUnnamedAddr(HasUnnamedAddr); + GV.setUnnamedAddr(HasUnnamedAddr); + } + + // Don't want to append to global_ctors list, for example, when we + // are importing for ThinLTO, otherwise the global ctors and dtors + // get executed multiple times for local variables (the latter causing + // double frees). 
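Stepping back to the mergeTriples() helper above: for Apple vendors the triple carrying the newer OS version wins regardless of which side it comes from, while for other vendors the destination triple is kept. A constructed example (illustrative, assumes llvm/ADT/Triple.h):

    llvm::Triple A("x86_64-apple-macosx10.9.0");
    llvm::Triple B("x86_64-apple-macosx10.11.0");
    // mergeTriples(A, B) and mergeTriples(B, A) both yield
    // "x86_64-apple-macosx10.11.0" (mergeTriples itself is file-local here).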
+ if (GV.hasAppendingLinkage() && isPerformingImport()) + return false; + + if (isPerformingImport() && !doImportAsDefinition(&GV)) + return false; + + if (!DGV && !shouldOverrideFromSrc() && + (GV.hasLocalLinkage() || GV.hasLinkOnceLinkage() || + GV.hasAvailableExternallyLinkage())) + return false; + + if (const Comdat *SC = GV.getComdat()) { + bool LinkFromSrc; + Comdat::SelectionKind SK; + std::tie(SK, LinkFromSrc) = ComdatsChosen[SC]; + if (LinkFromSrc) + ValuesToLink.insert(&GV); + return false; + } + + bool LinkFromSrc = true; + if (DGV && shouldLinkFromSource(LinkFromSrc, *DGV, GV)) + return true; + if (LinkFromSrc) + ValuesToLink.insert(&GV); + return false; +} +bool ModuleLinker::run() { // Inherit the target data from the source module if the destination module // doesn't have one already. - if (DstM->getDataLayout().isDefault()) - DstM->setDataLayout(SrcM->getDataLayout()); + if (DstM.getDataLayout().isDefault()) + DstM.setDataLayout(SrcM.getDataLayout()); - if (SrcM->getDataLayout() != DstM->getDataLayout()) { + if (SrcM.getDataLayout() != DstM.getDataLayout()) { emitWarning("Linking two modules of different data layouts: '" + - SrcM->getModuleIdentifier() + "' is '" + - SrcM->getDataLayoutStr() + "' whereas '" + - DstM->getModuleIdentifier() + "' is '" + - DstM->getDataLayoutStr() + "'\n"); + SrcM.getModuleIdentifier() + "' is '" + + SrcM.getDataLayoutStr() + "' whereas '" + + DstM.getModuleIdentifier() + "' is '" + + DstM.getDataLayoutStr() + "'\n"); } // Copy the target triple from the source to dest if the dest's is empty. - if (DstM->getTargetTriple().empty() && !SrcM->getTargetTriple().empty()) - DstM->setTargetTriple(SrcM->getTargetTriple()); + if (DstM.getTargetTriple().empty() && !SrcM.getTargetTriple().empty()) + DstM.setTargetTriple(SrcM.getTargetTriple()); - Triple SrcTriple(SrcM->getTargetTriple()), DstTriple(DstM->getTargetTriple()); + Triple SrcTriple(SrcM.getTargetTriple()), DstTriple(DstM.getTargetTriple()); - if (!SrcM->getTargetTriple().empty() && !triplesMatch(SrcTriple, DstTriple)) + if (!SrcM.getTargetTriple().empty() && !triplesMatch(SrcTriple, DstTriple)) emitWarning("Linking two modules of different target triples: " + - SrcM->getModuleIdentifier() + "' is '" + - SrcM->getTargetTriple() + "' whereas '" + - DstM->getModuleIdentifier() + "' is '" + - DstM->getTargetTriple() + "'\n"); + SrcM.getModuleIdentifier() + "' is '" + SrcM.getTargetTriple() + + "' whereas '" + DstM.getModuleIdentifier() + "' is '" + + DstM.getTargetTriple() + "'\n"); - DstM->setTargetTriple(mergeTriples(SrcTriple, DstTriple)); + DstM.setTargetTriple(mergeTriples(SrcTriple, DstTriple)); // Append the module inline asm string. - if (!SrcM->getModuleInlineAsm().empty()) { - if (DstM->getModuleInlineAsm().empty()) - DstM->setModuleInlineAsm(SrcM->getModuleInlineAsm()); + if (!SrcM.getModuleInlineAsm().empty()) { + if (DstM.getModuleInlineAsm().empty()) + DstM.setModuleInlineAsm(SrcM.getModuleInlineAsm()); else - DstM->setModuleInlineAsm(DstM->getModuleInlineAsm()+"\n"+ - SrcM->getModuleInlineAsm()); + DstM.setModuleInlineAsm(DstM.getModuleInlineAsm() + "\n" + + SrcM.getModuleInlineAsm()); } // Loop over all of the linked values to compute type mappings. computeTypeMapping(); ComdatsChosen.clear(); - for (const auto &SMEC : SrcM->getComdatSymbolTable()) { + for (const auto &SMEC : SrcM.getComdatSymbolTable()) { const Comdat &C = SMEC.getValue(); if (ComdatsChosen.count(&C)) continue; @@ -1900,69 +1874,38 @@ bool ModuleLinker::run() { // Upgrade mismatched global arrays. 
upgradeMismatchedGlobals(); + for (GlobalVariable &GV : SrcM.globals()) + if (const Comdat *SC = GV.getComdat()) + ComdatMembers[SC].push_back(&GV); + + for (Function &SF : SrcM) + if (const Comdat *SC = SF.getComdat()) + ComdatMembers[SC].push_back(&SF); + + for (GlobalAlias &GA : SrcM.aliases()) + if (const Comdat *SC = GA.getComdat()) + ComdatMembers[SC].push_back(&GA); + // Insert all of the globals in src into the DstM module... without linking // initializers (which could refer to functions not yet mapped over). - for (GlobalVariable &GV : SrcM->globals()) - if (linkGlobalValueProto(&GV)) + for (GlobalVariable &GV : SrcM.globals()) + if (linkIfNeeded(GV)) return true; - // Link the functions together between the two modules, without doing function - // bodies... this just adds external function prototypes to the DstM - // function... We do this so that when we begin processing function bodies, - // all of the global values that may be referenced are available in our - // ValueMap. - for (Function &F :*SrcM) - if (linkGlobalValueProto(&F)) + for (Function &SF : SrcM) + if (linkIfNeeded(SF)) return true; - // If there were any aliases, link them now. - for (GlobalAlias &GA : SrcM->aliases()) - if (linkGlobalValueProto(&GA)) + for (GlobalAlias &GA : SrcM.aliases()) + if (linkIfNeeded(GA)) return true; - for (AppendingVarInfo &AppendingVar : AppendingVars) - linkAppendingVarInit(AppendingVar); - - for (const auto &Entry : DstM->getComdatSymbolTable()) { - const Comdat &C = Entry.getValue(); - if (C.getSelectionKind() == Comdat::Any) - continue; - const GlobalValue *GV = SrcM->getNamedValue(C.getName()); - if (GV) - MapValue(GV, ValueMap, RF_MoveDistinctMDs, &TypeMap, &ValMaterializer); - } - - // Link in the function bodies that are defined in the source module into - // DstM. - for (Function &SF : *SrcM) { - // Skip if no body (function is external). - if (SF.isDeclaration()) - continue; - - // Skip if not linking from source. - if (DoNotLinkFromSource.count(&SF)) - continue; - - if (linkGlobalValueBody(SF)) + for (GlobalValue *GV : ValuesToLink) { + MapValue(GV, ValueMap, RF_MoveDistinctMDs, &TypeMap, &ValMaterializer); + if (HasError) return true; } - // Resolve all uses of aliases with aliasees. - for (GlobalAlias &Src : SrcM->aliases()) { - if (DoNotLinkFromSource.count(&Src)) - continue; - linkGlobalValueBody(Src); - } - - // Update the initializers in the DstM module now that all globals that may - // be referenced are in DstM. - for (GlobalVariable &Src : SrcM->globals()) { - // Only process initialized GV's or ones not already in dest. - if (!Src.hasInitializer() || DoNotLinkFromSource.count(&Src)) - continue; - linkGlobalValueBody(Src); - } - // Note that we are done linking global value bodies. This prevents // metadata linking from creating new references. 
DoneLinkingBodies = true; @@ -2069,12 +2012,10 @@ bool Linker::IdentifiedStructTypeSet::hasType(StructType *Ty) { return *I == Ty; } -void Linker::init(Module *M, DiagnosticHandlerFunction DiagnosticHandler) { - this->Composite = M; - this->DiagnosticHandler = DiagnosticHandler; - +Linker::Linker(Module &M, DiagnosticHandlerFunction DiagnosticHandler) + : Composite(M), DiagnosticHandler(DiagnosticHandler) { TypeFinder StructTypes; - StructTypes.run(*M, true); + StructTypes.run(M, true); for (StructType *Ty : StructTypes) { if (Ty->isOpaque()) IdentifiedStructTypes.addOpaque(Ty); @@ -2083,35 +2024,21 @@ void Linker::init(Module *M, DiagnosticHandlerFunction DiagnosticHandler) { } } -Linker::Linker(Module *M, DiagnosticHandlerFunction DiagnosticHandler) { - init(M, DiagnosticHandler); -} - -Linker::Linker(Module *M) { - init(M, [this](const DiagnosticInfo &DI) { - Composite->getContext().diagnose(DI); - }); -} - -void Linker::deleteModule() { - delete Composite; - Composite = nullptr; -} +Linker::Linker(Module &M) + : Linker(M, [this](const DiagnosticInfo &DI) { + Composite.getContext().diagnose(DI); + }) {} -bool Linker::linkInModule(Module *Src, unsigned Flags, +bool Linker::linkInModule(Module &Src, unsigned Flags, const FunctionInfoIndex *Index, - Function *FuncToImport) { + DenseSet *FunctionsToImport) { ModuleLinker TheLinker(Composite, IdentifiedStructTypes, Src, - DiagnosticHandler, Flags, Index, FuncToImport); + DiagnosticHandler, Flags, Index, FunctionsToImport); bool RetCode = TheLinker.run(); - Composite->dropTriviallyDeadConstantArrays(); + Composite.dropTriviallyDeadConstantArrays(); return RetCode; } -void Linker::setModule(Module *Dst) { - init(Dst, DiagnosticHandler); -} - //===----------------------------------------------------------------------===// // LinkModules entrypoint. //===----------------------------------------------------------------------===// @@ -2121,14 +2048,14 @@ void Linker::setModule(Module *Dst) { /// true is returned and ErrorMsg (if not null) is set to indicate the problem. /// Upon failure, the Dest module could be in a modified state, and shouldn't be /// relied on to be consistent. 
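Together with the LTOCodeGenerator changes at the top of this patch, the Linker now borrows the composite module by reference and the deleteModule()/setModule() escape hatches are gone. A minimal usage sketch (illustrative; assumes llvm/Linker/Linker.h, llvm/IR/Module.h and llvm/Support/ErrorHandling.h; the trailing linkInModule parameters are defaulted, as in the LTOCodeGenerator call above):

    llvm::LLVMContext Ctx;
    llvm::Module Dst("dst", Ctx);
    llvm::Module Src("src", Ctx);   // normally parsed from bitcode instead
    llvm::Linker L(Dst);            // diagnostics go through Dst's LLVMContext
    if (L.linkInModule(Src))        // returns true on error
      llvm::report_fatal_error("linking Src into Dst failed");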
-bool Linker::LinkModules(Module *Dest, Module *Src, +bool Linker::linkModules(Module &Dest, Module &Src, DiagnosticHandlerFunction DiagnosticHandler, unsigned Flags) { Linker L(Dest, DiagnosticHandler); return L.linkInModule(Src, Flags); } -bool Linker::LinkModules(Module *Dest, Module *Src, unsigned Flags) { +bool Linker::linkModules(Module &Dest, Module &Src, unsigned Flags) { Linker L(Dest); return L.linkInModule(Src, Flags); } @@ -2144,8 +2071,8 @@ LLVMBool LLVMLinkModules(LLVMModuleRef Dest, LLVMModuleRef Src, raw_string_ostream Stream(Message); DiagnosticPrinterRawOStream DP(Stream); - LLVMBool Result = Linker::LinkModules( - D, unwrap(Src), [&](const DiagnosticInfo &DI) { DI.print(DP); }); + LLVMBool Result = Linker::linkModules( + *D, *unwrap(Src), [&](const DiagnosticInfo &DI) { DI.print(DP); }); if (OutMessages && Result) { Stream.flush(); diff --git a/lib/MC/MCMachOStreamer.cpp b/lib/MC/MCMachOStreamer.cpp index 52ecf9fcfbf3..21f7571eec4a 100644 --- a/lib/MC/MCMachOStreamer.cpp +++ b/lib/MC/MCMachOStreamer.cpp @@ -60,6 +60,7 @@ class MCMachOStreamer : public MCObjectStreamer { /// state management void reset() override { + CreatedADWARFSection = false; HasSectionLabel.clear(); MCObjectStreamer::reset(); } diff --git a/lib/MC/MCObjectFileInfo.cpp b/lib/MC/MCObjectFileInfo.cpp index 8b75457a2460..41e28698b1cc 100644 --- a/lib/MC/MCObjectFileInfo.cpp +++ b/lib/MC/MCObjectFileInfo.cpp @@ -259,6 +259,9 @@ void MCObjectFileInfo::initMachOMCObjectFileInfo(Triple T) { DwarfDebugInlineSection = Ctx->getMachOSection("__DWARF", "__debug_inlined", MachO::S_ATTR_DEBUG, SectionKind::getMetadata()); + DwarfCUIndexSection = + Ctx->getMachOSection("__DWARF", "__debug_cu_index", MachO::S_ATTR_DEBUG, + SectionKind::getMetadata()); StackMapSection = Ctx->getMachOSection("__LLVM_STACKMAPS", "__llvm_stackmaps", 0, SectionKind::getMetadata()); @@ -531,6 +534,10 @@ void MCObjectFileInfo::initELFMCObjectFileInfo(Triple T) { DwarfAddrSection = Ctx->getELFSection(".debug_addr", ELF::SHT_PROGBITS, 0, "addr_sec"); + // DWP Sections + DwarfCUIndexSection = + Ctx->getELFSection(".debug_cu_index", ELF::SHT_PROGBITS, 0); + StackMapSection = Ctx->getELFSection(".llvm_stackmaps", ELF::SHT_PROGBITS, ELF::SHF_ALLOC); @@ -713,6 +720,11 @@ void MCObjectFileInfo::initCOFFMCObjectFileInfo(Triple T) { COFF::IMAGE_SCN_MEM_DISCARDABLE | COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | COFF::IMAGE_SCN_MEM_READ, SectionKind::getMetadata(), "addr_sec"); + DwarfCUIndexSection = Ctx->getCOFFSection( + ".debug_cu_index", + COFF::IMAGE_SCN_MEM_DISCARDABLE | COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | + COFF::IMAGE_SCN_MEM_READ, + SectionKind::getMetadata()); DwarfAccelNamesSection = Ctx->getCOFFSection( ".apple_names", COFF::IMAGE_SCN_MEM_DISCARDABLE | COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | diff --git a/lib/Object/FunctionIndexObjectFile.cpp b/lib/Object/FunctionIndexObjectFile.cpp index 717c56bc9018..fe111de1a9c8 100644 --- a/lib/Object/FunctionIndexObjectFile.cpp +++ b/lib/Object/FunctionIndexObjectFile.cpp @@ -86,7 +86,7 @@ bool FunctionIndexObjectFile::hasFunctionSummaryInMemBuffer( ErrorOr> FunctionIndexObjectFile::create(MemoryBufferRef Object, DiagnosticHandlerFunction DiagnosticHandler, - const Module *ExportingModule, bool IsLazy) { + bool IsLazy) { std::unique_ptr Index; ErrorOr BCOrErr = findBitcodeInMemBuffer(Object); @@ -94,7 +94,7 @@ FunctionIndexObjectFile::create(MemoryBufferRef Object, return BCOrErr.getError(); ErrorOr> IOrErr = getFunctionInfoIndex( - BCOrErr.get(), DiagnosticHandler, ExportingModule, IsLazy); + 
BCOrErr.get(), DiagnosticHandler, IsLazy); if (std::error_code EC = IOrErr.getError()) return EC; @@ -125,8 +125,7 @@ std::error_code FunctionIndexObjectFile::findFunctionSummaryInMemBuffer( // index object if found, or nullptr if not. ErrorOr> llvm::getFunctionIndexForFile(StringRef Path, - DiagnosticHandlerFunction DiagnosticHandler, - const Module *ExportingModule) { + DiagnosticHandlerFunction DiagnosticHandler) { ErrorOr> FileOrErr = MemoryBuffer::getFileOrSTDIN(Path); std::error_code EC = FileOrErr.getError(); @@ -134,8 +133,7 @@ llvm::getFunctionIndexForFile(StringRef Path, return EC; MemoryBufferRef BufferRef = (FileOrErr.get())->getMemBufferRef(); ErrorOr> ObjOrErr = - object::FunctionIndexObjectFile::create(BufferRef, DiagnosticHandler, - ExportingModule); + object::FunctionIndexObjectFile::create(BufferRef, DiagnosticHandler); EC = ObjOrErr.getError(); if (EC) return EC; diff --git a/lib/ProfileData/InstrProf.cpp b/lib/ProfileData/InstrProf.cpp index d08ec9d73176..530be8ac044a 100644 --- a/lib/ProfileData/InstrProf.cpp +++ b/lib/ProfileData/InstrProf.cpp @@ -207,7 +207,7 @@ ValueProfData::serializeFrom(const InstrProfRecord &Record) { InstrProfRecordClosure.Record = &Record; std::unique_ptr VPD( - serializeValueProfDataFrom(&InstrProfRecordClosure, 0)); + serializeValueProfDataFrom(&InstrProfRecordClosure, nullptr)); return VPD; } diff --git a/lib/ProfileData/InstrProfReader.cpp b/lib/ProfileData/InstrProfReader.cpp index e4348b19ac02..cfc968739806 100644 --- a/lib/ProfileData/InstrProfReader.cpp +++ b/lib/ProfileData/InstrProfReader.cpp @@ -441,11 +441,11 @@ data_type InstrProfLookupTrait::ReadData(StringRef K, const unsigned char *D, return DataBuffer; } -std::error_code -InstrProfReaderIndex::getRecords(StringRef FuncName, - ArrayRef &Data) { - auto Iter = Index->find(FuncName); - if (Iter == Index->end()) +template +std::error_code InstrProfReaderIndex::getRecords( + StringRef FuncName, ArrayRef &Data) { + auto Iter = HashTable->find(FuncName); + if (Iter == HashTable->end()) return instrprof_error::unknown_function; Data = (*Iter); @@ -455,9 +455,11 @@ InstrProfReaderIndex::getRecords(StringRef FuncName, return instrprof_error::success; } -std::error_code InstrProfReaderIndex::getRecords( +template +std::error_code InstrProfReaderIndex::getRecords( ArrayRef &Data) { - if (atEnd()) return instrprof_error::eof; + if (atEnd()) + return instrprof_error::eof; Data = *RecordIterator; @@ -466,25 +468,26 @@ std::error_code InstrProfReaderIndex::getRecords( return instrprof_error::success; } -void InstrProfReaderIndex::Init(const unsigned char *Buckets, - const unsigned char *const Payload, - const unsigned char *const Base, - IndexedInstrProf::HashT HashType, - uint64_t Version) { +template +InstrProfReaderIndex::InstrProfReaderIndex( + const unsigned char *Buckets, const unsigned char *const Payload, + const unsigned char *const Base, IndexedInstrProf::HashT HashType, + uint64_t Version) { FormatVersion = Version; - Index.reset(IndexType::Create(Buckets, Payload, Base, - InstrProfLookupTrait(HashType, Version))); + HashTable.reset(HashTableImpl::Create( + Buckets, Payload, Base, + typename HashTableImpl::InfoType(HashType, Version))); // Form the map of hash values to const char* keys in profiling data. 
std::vector> HashKeys; - for (auto Key : Index->keys()) { + for (auto Key : HashTable->keys()) { const char *KeyTableRef = StringTable.insertString(Key); HashKeys.push_back(std::make_pair(ComputeHash(HashType, Key), KeyTableRef)); } std::sort(HashKeys.begin(), HashKeys.end(), less_first()); HashKeys.erase(std::unique(HashKeys.begin(), HashKeys.end()), HashKeys.end()); // Set the hash key map for the InstrLookupTrait - Index->getInfoObj().setHashKeys(std::move(HashKeys)); - RecordIterator = Index->data_begin(); + HashTable->getInfoObj().setHashKeys(std::move(HashKeys)); + RecordIterator = HashTable->data_begin(); } bool IndexedInstrProfReader::hasFormat(const MemoryBuffer &DataBuffer) { @@ -532,8 +535,10 @@ std::error_code IndexedInstrProfReader::readHeader() { uint64_t HashOffset = endian::byte_swap(Header->HashOffset); // The rest of the file is an on disk hash table. - Index.Init(Start + HashOffset, Cur, Start, HashType, FormatVersion); - + InstrProfReaderIndexBase *IndexPtr = nullptr; + IndexPtr = new InstrProfReaderIndex( + Start + HashOffset, Cur, Start, HashType, FormatVersion); + Index.reset(IndexPtr); return success(); } @@ -541,7 +546,7 @@ ErrorOr IndexedInstrProfReader::getInstrProfRecord(StringRef FuncName, uint64_t FuncHash) { ArrayRef Data; - std::error_code EC = Index.getRecords(FuncName, Data); + std::error_code EC = Index->getRecords(FuncName, Data); if (EC != instrprof_error::success) return EC; // Found it. Look for counters with the right hash. @@ -571,13 +576,13 @@ std::error_code IndexedInstrProfReader::readNextRecord( ArrayRef Data; - std::error_code EC = Index.getRecords(Data); + std::error_code EC = Index->getRecords(Data); if (EC != instrprof_error::success) return error(EC); Record = Data[RecordIndex++]; if (RecordIndex >= Data.size()) { - Index.advanceToNextKey(); + Index->advanceToNextKey(); RecordIndex = 0; } return success(); diff --git a/lib/ProfileData/InstrProfWriter.cpp b/lib/ProfileData/InstrProfWriter.cpp index f9cc2afe3da0..78bec012eeb2 100644 --- a/lib/ProfileData/InstrProfWriter.cpp +++ b/lib/ProfileData/InstrProfWriter.cpp @@ -107,22 +107,23 @@ std::error_code InstrProfWriter::addRecord(InstrProfRecord &&I) { std::tie(Where, NewFunc) = ProfileDataMap.insert(std::make_pair(I.Hash, InstrProfRecord())); InstrProfRecord &Dest = Where->second; + + instrprof_error Result; if (NewFunc) { // We've never seen a function with this name and hash, add it. Dest = std::move(I); + Result = instrprof_error::success; } else { // We're updating a function we've seen before. - instrprof_error MergeResult = Dest.merge(I); - if (MergeResult != instrprof_error::success) { - return MergeResult; - } + Result = Dest.merge(I); } // We keep track of the max function count as we go for simplicity. + // Update this statistic no matter the result of the merge. if (Dest.Counts[0] > MaxFunctionCount) MaxFunctionCount = Dest.Counts[0]; - return instrprof_error::success; + return Result; } std::pair InstrProfWriter::writeImpl(raw_ostream &OS) { diff --git a/lib/Support/BranchProbability.cpp b/lib/Support/BranchProbability.cpp index 3b0f6e6f06e4..771d02c0aa3c 100644 --- a/lib/Support/BranchProbability.cpp +++ b/lib/Support/BranchProbability.cpp @@ -22,11 +22,14 @@ using namespace llvm; const uint32_t BranchProbability::D; raw_ostream &BranchProbability::print(raw_ostream &OS) const { + if (isUnknown()) + return OS << "?%"; + // Get a percentage rounded to two decimal digits. This avoids // implementation-defined rounding inside printf. 
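Looking back at the InstrProfWriter::addRecord() change in this hunk: the result of merging with an existing record is now handed to the caller instead of being swallowed, and the max-function-count statistic is refreshed either way. A hypothetical caller (record name, hash and counters are made up; assumes llvm/ProfileData/InstrProfWriter.h and llvm/Support/raw_ostream.h):

    llvm::InstrProfWriter Writer;
    llvm::InstrProfRecord R("main", 0x1234, {1, 2, 3});
    if (std::error_code EC = Writer.addRecord(std::move(R)))
      llvm::errs() << "profile merge problem: " << EC.message() << "\n";
    // A counter mismatch or overflow from the merge is reported here, but
    // MaxFunctionCount has still been updated from the surviving record.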
double Percent = rint(((double)N / D) * 100.0 * 100.0) / 100.0; - OS << format("0x%08" PRIx32 " / 0x%08" PRIx32 " = %.2f%%", N, D, Percent); - return OS; + return OS << format("0x%08" PRIx32 " / 0x%08" PRIx32 " = %.2f%%", N, D, + Percent); } void BranchProbability::dump() const { print(dbgs()) << '\n'; } @@ -43,6 +46,19 @@ BranchProbability::BranchProbability(uint32_t Numerator, uint32_t Denominator) { } } +BranchProbability +BranchProbability::getBranchProbability(uint64_t Numerator, + uint64_t Denominator) { + assert(Numerator <= Denominator && "Probability cannot be bigger than 1!"); + // Scale down Denominator to fit in a 32-bit integer. + int Scale = 0; + while (Denominator > UINT32_MAX) { + Denominator >>= 1; + Scale++; + } + return BranchProbability(Numerator >> Scale, Denominator); +} + // If ConstD is not zero, then replace D by ConstD so that division and modulo // operations by D can be optimized, in case this function is not inlined by the // compiler. diff --git a/lib/Support/TargetParser.cpp b/lib/Support/TargetParser.cpp index 3aa55b3c8850..aa3a4235d794 100644 --- a/lib/Support/TargetParser.cpp +++ b/lib/Support/TargetParser.cpp @@ -410,6 +410,7 @@ static StringRef getArchSynonym(StringRef Arch) { .Case("v7em", "v7e-m") .Cases("v8", "v8a", "aarch64", "arm64", "v8-a") .Case("v8.1a", "v8.1-a") + .Case("v8.2a", "v8.2-a") .Default(Arch); } @@ -554,6 +555,7 @@ unsigned llvm::ARM::parseArchProfile(StringRef Arch) { case ARM::AK_ARMV7K: case ARM::AK_ARMV8A: case ARM::AK_ARMV8_1A: + case ARM::AK_ARMV8_2A: return ARM::PK_A; } return ARM::PK_INVALID; @@ -594,6 +596,7 @@ unsigned llvm::ARM::parseArchVersion(StringRef Arch) { return 7; case ARM::AK_ARMV8A: case ARM::AK_ARMV8_1A: + case ARM::AK_ARMV8_2A: return 8; } return 0; diff --git a/lib/Support/Triple.cpp b/lib/Support/Triple.cpp index f1f2d26b4e70..ed91c209d545 100644 --- a/lib/Support/Triple.cpp +++ b/lib/Support/Triple.cpp @@ -519,6 +519,8 @@ static Triple::SubArchType parseSubArch(StringRef SubArchName) { return Triple::ARMSubArch_v8; case ARM::AK_ARMV8_1A: return Triple::ARMSubArch_v8_1a; + case ARM::AK_ARMV8_2A: + return Triple::ARMSubArch_v8_2a; default: return Triple::NoSubArch; } diff --git a/lib/Support/Unix/Signals.inc b/lib/Support/Unix/Signals.inc index de98d4adf996..061cdb3da216 100644 --- a/lib/Support/Unix/Signals.inc +++ b/lib/Support/Unix/Signals.inc @@ -86,12 +86,11 @@ static unsigned NumRegisteredSignals = 0; static struct { struct sigaction SA; int SigNo; -} RegisteredSignalInfo[(sizeof(IntSigs)+sizeof(KillSigs))/sizeof(KillSigs[0])]; +} RegisteredSignalInfo[array_lengthof(IntSigs) + array_lengthof(KillSigs)]; static void RegisterHandler(int Signal) { - assert(NumRegisteredSignals < - sizeof(RegisteredSignalInfo)/sizeof(RegisteredSignalInfo[0]) && + assert(NumRegisteredSignals < array_lengthof(RegisteredSignalInfo) && "Out of space for signal handlers!"); struct sigaction NewHandler; diff --git a/lib/Support/Windows/Signals.inc b/lib/Support/Windows/Signals.inc index f40ca72996a1..d109a66d7035 100644 --- a/lib/Support/Windows/Signals.inc +++ b/lib/Support/Windows/Signals.inc @@ -405,10 +405,7 @@ static void RegisterHandler() { // If we cannot load up the APIs (which would be unexpected as they should // exist on every version of Windows we support), we will bail out since // there would be nothing to report. 
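The new 64-bit BranchProbability::getBranchProbability() overload above halves the denominator (and shifts the numerator by the same amount) until it fits in 32 bits, so the ratio survives up to rounding. A quick worked example (assumes llvm/Support/BranchProbability.h):

    // 3 * 2^36 out of 8 * 2^36 events; both values exceed UINT32_MAX.
    uint64_t Num = 3ULL << 36, Den = 8ULL << 36;
    llvm::BranchProbability P =
        llvm::BranchProbability::getBranchProbability(Num, Den);
    // Den is scaled down to 2^31 and Num to 3 * 2^28; P still models
    // 3/8, and printing it shows 37.50%.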
- if (!load64BitDebugHelp()) { - assert(false && "These APIs should always be available"); - return; - } + assert(load64BitDebugHelp() && "These APIs should always be available"); if (RegisteredUnhandledExceptionFilter) { EnterCriticalSection(&CriticalSection); diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td index e82cdd00ba1e..0bff9b592c15 100644 --- a/lib/Target/AArch64/AArch64.td +++ b/lib/Target/AArch64/AArch64.td @@ -38,6 +38,9 @@ def FeaturePerfMon : SubtargetFeature<"perfmon", "HasPerfMon", "true", def FeatureFullFP16 : SubtargetFeature<"fullfp16", "HasFullFP16", "true", "Full FP16", [FeatureFPARMv8]>; +def FeatureSPE : SubtargetFeature<"spe", "HasSPE", "true", + "Enable Statistical Profiling extension">; + /// Cyclone has register move instructions which are "free". def FeatureZCRegMove : SubtargetFeature<"zcm", "HasZeroCycleRegMove", "true", "Has zero-cycle register moves">; @@ -88,6 +91,14 @@ include "AArch64SchedA53.td" include "AArch64SchedA57.td" include "AArch64SchedCyclone.td" +def ProcA35 : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35", + "Cortex-A35 ARM processors", + [FeatureFPARMv8, + FeatureNEON, + FeatureCrypto, + FeatureCRC, + FeaturePerfMon]>; + def ProcA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53", "Cortex-A53 ARM processors", [FeatureFPARMv8, @@ -118,6 +129,8 @@ def : ProcessorModel<"generic", NoSchedModel, [FeatureFPARMv8, FeatureCRC, FeaturePerfMon]>; +// FIXME: Cortex-A35 is currently modelled as a Cortex-A53 +def : ProcessorModel<"cortex-a35", CortexA53Model, [ProcA35]>; def : ProcessorModel<"cortex-a53", CortexA53Model, [ProcA53]>; def : ProcessorModel<"cortex-a57", CortexA57Model, [ProcA57]>; // FIXME: Cortex-A72 is currently modelled as an Cortex-A57. diff --git a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index 4311198403fa..6c868880bcac 100644 --- a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -1974,7 +1974,8 @@ static bool isBitfieldPositioningOp(SelectionDAG *CurDAG, SDValue Op, // f = Opc Opd0, Opd1, LSB, MSB ; where Opc is a BFM, LSB = imm, and MSB = imm2 static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Dst, SDValue &Src, unsigned &ImmR, - unsigned &ImmS, SelectionDAG *CurDAG) { + unsigned &ImmS, const APInt &UsefulBits, + SelectionDAG *CurDAG) { assert(N->getOpcode() == ISD::OR && "Expect a OR operation"); // Set Opc @@ -1988,8 +1989,6 @@ static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Dst, // Because of simplify-demanded-bits in DAGCombine, involved masks may not // have the expected shape. Try to undo that. - APInt UsefulBits; - getUsefulBits(SDValue(N, 0), UsefulBits); unsigned NumberOfIgnoredLowBits = UsefulBits.countTrailingZeros(); unsigned NumberOfIgnoredHighBits = UsefulBits.countLeadingZeros(); @@ -2083,11 +2082,18 @@ SDNode *AArch64DAGToDAGISel::SelectBitfieldInsertOp(SDNode *N) { unsigned Opc; unsigned LSB, MSB; SDValue Opd0, Opd1; + EVT VT = N->getValueType(0); + APInt NUsefulBits; + getUsefulBits(SDValue(N, 0), NUsefulBits); + + // If all bits are not useful, just return UNDEF. 
+ if (!NUsefulBits) + return CurDAG->SelectNodeTo(N, TargetOpcode::IMPLICIT_DEF, VT); - if (!isBitfieldInsertOpFromOr(N, Opc, Opd0, Opd1, LSB, MSB, CurDAG)) + if (!isBitfieldInsertOpFromOr(N, Opc, Opd0, Opd1, LSB, MSB, NUsefulBits, + CurDAG)) return nullptr; - EVT VT = N->getValueType(0); SDLoc dl(N); SDValue Ops[] = { Opd0, Opd1, diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index 2232e419a619..f0fb03451b2a 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -4380,46 +4380,57 @@ SDValue AArch64TargetLowering::LowerShiftRightParts(SDValue Op, SDValue ShOpLo = Op.getOperand(0); SDValue ShOpHi = Op.getOperand(1); SDValue ShAmt = Op.getOperand(2); - SDValue ARMcc; unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL; assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS); SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, DAG.getConstant(VTBits, dl, MVT::i64), ShAmt); - SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt); + SDValue HiBitsForLo = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt); + + // Unfortunately, if ShAmt == 0, we just calculated "(SHL ShOpHi, 64)" which + // is "undef". We wanted 0, so CSEL it directly. + SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64), + ISD::SETEQ, dl, DAG); + SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32); + HiBitsForLo = + DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64), + HiBitsForLo, CCVal, Cmp); + SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt, DAG.getConstant(VTBits, dl, MVT::i64)); - SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt); - SDValue Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), - ISD::SETGE, dl, DAG); - SDValue CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32); + SDValue LoBitsForLo = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt); + SDValue LoForNormalShift = + DAG.getNode(ISD::OR, dl, VT, LoBitsForLo, HiBitsForLo); - SDValue FalseValLo = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); - SDValue TrueValLo = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt); - SDValue Lo = - DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValLo, FalseValLo, CCVal, Cmp); + Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE, + dl, DAG); + CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32); + SDValue LoForBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt); + SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, LoForBigShift, + LoForNormalShift, CCVal, Cmp); // AArch64 shifts larger than the register width are wrapped rather than // clamped, so we can't just emit "hi >> x". - SDValue FalseValHi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); - SDValue TrueValHi = Opc == ISD::SRA - ? DAG.getNode(Opc, dl, VT, ShOpHi, - DAG.getConstant(VTBits - 1, dl, - MVT::i64)) - : DAG.getConstant(0, dl, VT); - SDValue Hi = - DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValHi, FalseValHi, CCVal, Cmp); + SDValue HiForNormalShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); + SDValue HiForBigShift = + Opc == ISD::SRA + ? 
DAG.getNode(Opc, dl, VT, ShOpHi, + DAG.getConstant(VTBits - 1, dl, MVT::i64)) + : DAG.getConstant(0, dl, VT); + SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, HiForBigShift, + HiForNormalShift, CCVal, Cmp); SDValue Ops[2] = { Lo, Hi }; return DAG.getMergeValues(Ops, dl); } + /// LowerShiftLeftParts - Lower SHL_PARTS, which returns two /// i64 values and take a 2 x i64 value to shift plus a shift amount. SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op, - SelectionDAG &DAG) const { + SelectionDAG &DAG) const { assert(Op.getNumOperands() == 3 && "Not a double-shift!"); EVT VT = Op.getValueType(); unsigned VTBits = VT.getSizeInBits(); @@ -4427,31 +4438,41 @@ SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op, SDValue ShOpLo = Op.getOperand(0); SDValue ShOpHi = Op.getOperand(1); SDValue ShAmt = Op.getOperand(2); - SDValue ARMcc; assert(Op.getOpcode() == ISD::SHL_PARTS); SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, DAG.getConstant(VTBits, dl, MVT::i64), ShAmt); - SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt); + SDValue LoBitsForHi = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt); + + // Unfortunately, if ShAmt == 0, we just calculated "(SRL ShOpLo, 64)" which + // is "undef". We wanted 0, so CSEL it directly. + SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64), + ISD::SETEQ, dl, DAG); + SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32); + LoBitsForHi = + DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64), + LoBitsForHi, CCVal, Cmp); + SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt, DAG.getConstant(VTBits, dl, MVT::i64)); - SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt); - SDValue Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt); + SDValue HiBitsForHi = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt); + SDValue HiForNormalShift = + DAG.getNode(ISD::OR, dl, VT, LoBitsForHi, HiBitsForHi); - SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); + SDValue HiForBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt); - SDValue Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), - ISD::SETGE, dl, DAG); - SDValue CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32); - SDValue Hi = - DAG.getNode(AArch64ISD::CSEL, dl, VT, Tmp3, FalseVal, CCVal, Cmp); + Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE, + dl, DAG); + CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32); + SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, HiForBigShift, + HiForNormalShift, CCVal, Cmp); // AArch64 shifts of larger than register sizes are wrapped rather than // clamped, so we can't just emit "lo << a" if a is too big. 
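+  // For example, with ShAmt = 70 the hardware would compute
+  // "lo << (70 % 64)" = "lo << 6" rather than 0, so whenever ShAmt >= 64 the
+  // CSEL below must select the explicit zero (LoForBigShift) instead.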
- SDValue TrueValLo = DAG.getConstant(0, dl, VT); - SDValue FalseValLo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); - SDValue Lo = - DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValLo, FalseValLo, CCVal, Cmp); + SDValue LoForBigShift = DAG.getConstant(0, dl, VT); + SDValue LoForNormalShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); + SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, LoForBigShift, + LoForNormalShift, CCVal, Cmp); SDValue Ops[2] = { Lo, Hi }; return DAG.getMergeValues(Ops, dl); diff --git a/lib/Target/AArch64/AArch64InstrFormats.td b/lib/Target/AArch64/AArch64InstrFormats.td index 752a153c0574..5eef82153e39 100644 --- a/lib/Target/AArch64/AArch64InstrFormats.td +++ b/lib/Target/AArch64/AArch64InstrFormats.td @@ -911,6 +911,25 @@ def msr_sysreg_op : Operand { let PrintMethod = "printMSRSystemRegister"; } +def PSBHintOperand : AsmOperandClass { + let Name = "PSBHint"; + let ParserMethod = "tryParsePSBHint"; +} +def psbhint_op : Operand { + let ParserMatchClass = PSBHintOperand; + let PrintMethod = "printPSBHintOp"; + let MCOperandPredicate = [{ + // Check, if operand is valid, to fix exhaustive aliasing in disassembly. + // "psb" is an alias to "hint" only for certain values of CRm:Op2 fields. + if (!MCOp.isImm()) + return false; + bool ValidNamed; + (void)AArch64PSBHint::PSBHintMapper().toString(MCOp.getImm(), + STI.getFeatureBits(), ValidNamed); + return ValidNamed; + }]; +} + class MRSI : RtSystemI<1, (outs GPR64:$Rt), (ins mrs_sysreg_op:$systemreg), "mrs", "\t$Rt, $systemreg"> { bits<16> systemreg; diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td index 0c43003975c5..881f55ebeef9 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.td +++ b/lib/Target/AArch64/AArch64InstrInfo.td @@ -29,6 +29,8 @@ def HasCRC : Predicate<"Subtarget->hasCRC()">, def HasPerfMon : Predicate<"Subtarget->hasPerfMon()">; def HasFullFP16 : Predicate<"Subtarget->hasFullFP16()">, AssemblerPredicate<"FeatureFullFP16", "fullfp16">; +def HasSPE : Predicate<"Subtarget->hasSPE()">, + AssemblerPredicate<"FeatureSPE", "spe">; def IsLE : Predicate<"Subtarget->isLittleEndian()">; def IsBE : Predicate<"!Subtarget->isLittleEndian()">; @@ -382,6 +384,9 @@ def : InstAlias<"wfi", (HINT 0b011)>; def : InstAlias<"sev", (HINT 0b100)>; def : InstAlias<"sevl", (HINT 0b101)>; +// v8.2a Statistical Profiling extension +def : InstAlias<"psb $op", (HINT psbhint_op:$op)>, Requires<[HasSPE]>; + // As far as LLVM is concerned this writes to the system's exclusive monitors. let mayLoad = 1, mayStore = 1 in def CLREX : CRmSystemI; diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h index 9aa6ef9ab670..cf94445a885c 100644 --- a/lib/Target/AArch64/AArch64Subtarget.h +++ b/lib/Target/AArch64/AArch64Subtarget.h @@ -33,7 +33,7 @@ class Triple; class AArch64Subtarget : public AArch64GenSubtargetInfo { protected: - enum ARMProcFamilyEnum {Others, CortexA53, CortexA57, Cyclone}; + enum ARMProcFamilyEnum {Others, CortexA35, CortexA53, CortexA57, Cyclone}; /// ARMProcFamily - ARM processor family: Cortex-A53, Cortex-A57, and others. ARMProcFamilyEnum ARMProcFamily; @@ -47,6 +47,7 @@ class AArch64Subtarget : public AArch64GenSubtargetInfo { bool HasCRC; bool HasPerfMon; bool HasFullFP16; + bool HasSPE; // HasZeroCycleRegMove - Has zero-cycle register mov instructions. 
bool HasZeroCycleRegMove; @@ -124,6 +125,7 @@ class AArch64Subtarget : public AArch64GenSubtargetInfo { bool hasPerfMon() const { return HasPerfMon; } bool hasFullFP16() const { return HasFullFP16; } + bool hasSPE() const { return HasSPE; } bool isLittleEndian() const { return IsLittle; } diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index 165843fc84c9..f0ad855ed5e6 100644 --- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -100,6 +100,7 @@ class AArch64AsmParser : public MCTargetAsmParser { OperandMatchResultTy tryParseSysReg(OperandVector &Operands); OperandMatchResultTy tryParseSysCROperand(OperandVector &Operands); OperandMatchResultTy tryParsePrefetch(OperandVector &Operands); + OperandMatchResultTy tryParsePSBHint(OperandVector &Operands); OperandMatchResultTy tryParseAdrpLabel(OperandVector &Operands); OperandMatchResultTy tryParseAdrLabel(OperandVector &Operands); OperandMatchResultTy tryParseFPImm(OperandVector &Operands); @@ -159,7 +160,8 @@ class AArch64Operand : public MCParsedAsmOperand { k_Prefetch, k_ShiftExtend, k_FPImm, - k_Barrier + k_Barrier, + k_PSBHint, } Kind; SMLoc StartLoc, EndLoc; @@ -227,6 +229,12 @@ class AArch64Operand : public MCParsedAsmOperand { unsigned Length; }; + struct PSBHintOp { + unsigned Val; + const char *Data; + unsigned Length; + }; + struct ShiftExtendOp { AArch64_AM::ShiftExtendType Type; unsigned Amount; @@ -250,6 +258,7 @@ class AArch64Operand : public MCParsedAsmOperand { struct SysRegOp SysReg; struct SysCRImmOp SysCRImm; struct PrefetchOp Prefetch; + struct PSBHintOp PSBHint; struct ShiftExtendOp ShiftExtend; }; @@ -301,6 +310,9 @@ class AArch64Operand : public MCParsedAsmOperand { case k_Prefetch: Prefetch = o.Prefetch; break; + case k_PSBHint: + PSBHint = o.PSBHint; + break; case k_ShiftExtend: ShiftExtend = o.ShiftExtend; break; @@ -392,6 +404,16 @@ class AArch64Operand : public MCParsedAsmOperand { return Prefetch.Val; } + unsigned getPSBHint() const { + assert(Kind == k_PSBHint && "Invalid access!"); + return PSBHint.Val; + } + + StringRef getPSBHintName() const { + assert(Kind == k_PSBHint && "Invalid access!"); + return StringRef(PSBHint.Data, PSBHint.Length); + } + StringRef getPrefetchName() const { assert(Kind == k_Prefetch && "Invalid access!"); return StringRef(Prefetch.Data, Prefetch.Length); @@ -961,6 +983,7 @@ class AArch64Operand : public MCParsedAsmOperand { } bool isSysCR() const { return Kind == k_SysCR; } bool isPrefetch() const { return Kind == k_Prefetch; } + bool isPSBHint() const { return Kind == k_PSBHint; } bool isShiftExtend() const { return Kind == k_ShiftExtend; } bool isShifter() const { if (!isShiftExtend()) @@ -1534,6 +1557,11 @@ class AArch64Operand : public MCParsedAsmOperand { Inst.addOperand(MCOperand::createImm(getPrefetch())); } + void addPSBHintOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createImm(getPSBHint())); + } + void addShifterOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); unsigned Imm = @@ -1730,6 +1758,19 @@ class AArch64Operand : public MCParsedAsmOperand { return Op; } + static std::unique_ptr CreatePSBHint(unsigned Val, + StringRef Str, + SMLoc S, + MCContext &Ctx) { + auto Op = make_unique(k_PSBHint, Ctx); + Op->PSBHint.Val = Val; + Op->PSBHint.Data = Str.data(); + Op->PSBHint.Length = Str.size(); + Op->StartLoc = S; + Op->EndLoc = S; + 
return Op; + } + static std::unique_ptr CreateShiftExtend(AArch64_AM::ShiftExtendType ShOp, unsigned Val, bool HasExplicitAmount, SMLoc S, SMLoc E, MCContext &Ctx) { @@ -1803,6 +1844,10 @@ void AArch64Operand::print(raw_ostream &OS) const { OS << ""; break; } + case k_PSBHint: { + OS << getPSBHintName(); + break; + } case k_ShiftExtend: { OS << "<" << AArch64_AM::getShiftExtendName(getShiftExtendType()) << " #" << getShiftExtendAmount(); @@ -2069,6 +2114,32 @@ AArch64AsmParser::tryParsePrefetch(OperandVector &Operands) { return MatchOperand_Success; } +/// tryParsePSBHint - Try to parse a PSB operand, mapped to Hint command +AArch64AsmParser::OperandMatchResultTy +AArch64AsmParser::tryParsePSBHint(OperandVector &Operands) { + MCAsmParser &Parser = getParser(); + SMLoc S = getLoc(); + const AsmToken &Tok = Parser.getTok(); + if (Tok.isNot(AsmToken::Identifier)) { + TokError("invalid operand for instruction"); + return MatchOperand_ParseFail; + } + + bool Valid; + auto Mapper = AArch64PSBHint::PSBHintMapper(); + unsigned psbhint = + Mapper.fromString(Tok.getString(), getSTI().getFeatureBits(), Valid); + if (!Valid) { + TokError("invalid operand for instruction"); + return MatchOperand_ParseFail; + } + + Parser.Lex(); // Eat identifier token. + Operands.push_back(AArch64Operand::CreatePSBHint(psbhint, Tok.getString(), + S, getContext())); + return MatchOperand_Success; +} + /// tryParseAdrpLabel - Parse and validate a source label for the ADRP /// instruction. AArch64AsmParser::OperandMatchResultTy diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp index d8937b57e490..480ed0d263ac 100644 --- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp +++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp @@ -1144,6 +1144,19 @@ void AArch64InstPrinter::printPrefetchOp(const MCInst *MI, unsigned OpNum, O << '#' << prfop; } +void AArch64InstPrinter::printPSBHintOp(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + unsigned psbhintop = MI->getOperand(OpNum).getImm(); + bool Valid; + StringRef Name = + AArch64PSBHint::PSBHintMapper().toString(psbhintop, STI.getFeatureBits(), Valid); + if (Valid) + O << Name; + else + O << '#' << psbhintop; +} + void AArch64InstPrinter::printFPImmOperand(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O) { diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h index a94721816d33..a767aa451c6a 100644 --- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h +++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h @@ -123,6 +123,9 @@ class AArch64InstPrinter : public MCInstPrinter { void printPrefetchOp(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O); + void printPSBHintOp(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printFPImmOperand(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O); diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp index f657eaab8151..78f5289ec26d 100644 --- a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp +++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp @@ -154,6 +154,14 @@ const AArch64NamedImmMapper::Mapping AArch64PState::PStateMapper::PStateMappings AArch64PState::PStateMapper::PStateMapper() : AArch64NamedImmMapper(PStateMappings, 0) {} +const AArch64NamedImmMapper::Mapping 
AArch64PSBHint::PSBHintMapper::PSBHintMappings[] = { + // v8.2a "Statistical Profiling" extension-specific PSB operand + {"csync", CSync, {AArch64::FeatureSPE}}, +}; + +AArch64PSBHint::PSBHintMapper::PSBHintMapper() + : AArch64NamedImmMapper(PSBHintMappings, 0) {} + const AArch64NamedImmMapper::Mapping AArch64SysReg::MRSMapper::MRSMappings[] = { {"mdccsr_el0", MDCCSR_EL0, {}}, {"dbgdtrrx_el0", DBGDTRRX_EL0, {}}, @@ -808,6 +816,21 @@ const AArch64NamedImmMapper::Mapping AArch64SysReg::SysRegMapper::SysRegMappings // v8.2a registers {"uao", UAO, {AArch64::HasV8_2aOps}}, + + // v8.2a "Statistical Profiling extension" registers + {"pmblimitr_el1", PMBLIMITR_EL1, {AArch64::FeatureSPE}}, + {"pmbptr_el1", PMBPTR_EL1, {AArch64::FeatureSPE}}, + {"pmbsr_el1", PMBSR_EL1, {AArch64::FeatureSPE}}, + {"pmbidr_el1", PMBIDR_EL1, {AArch64::FeatureSPE}}, + {"pmscr_el2", PMSCR_EL2, {AArch64::FeatureSPE}}, + {"pmscr_el12", PMSCR_EL12, {AArch64::FeatureSPE}}, + {"pmscr_el1", PMSCR_EL1, {AArch64::FeatureSPE}}, + {"pmsicr_el1", PMSICR_EL1, {AArch64::FeatureSPE}}, + {"pmsirr_el1", PMSIRR_EL1, {AArch64::FeatureSPE}}, + {"pmsfcr_el1", PMSFCR_EL1, {AArch64::FeatureSPE}}, + {"pmsevfr_el1", PMSEVFR_EL1, {AArch64::FeatureSPE}}, + {"pmslatfr_el1", PMSLATFR_EL1, {AArch64::FeatureSPE}}, + {"pmsidr_el1", PMSIDR_EL1, {AArch64::FeatureSPE}}, }; uint32_t diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/lib/Target/AArch64/Utils/AArch64BaseInfo.h index 5a6b54bbee83..f649cb9b8a8d 100644 --- a/lib/Target/AArch64/Utils/AArch64BaseInfo.h +++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.h @@ -478,6 +478,21 @@ namespace AArch64PState { } +namespace AArch64PSBHint { + enum PSBHintValues { + Invalid = -1, + // v8.2a "Statistical Profiling" extension-specific PSB operands + CSync = 0x11, // psb csync = hint #0x11 + }; + + struct PSBHintMapper : AArch64NamedImmMapper { + const static Mapping PSBHintMappings[]; + + PSBHintMapper(); + }; + +} + namespace AArch64SE { enum ShiftExtSpecifiers { Invalid = -1, @@ -1199,6 +1214,21 @@ namespace AArch64SysReg { // v8.2a registers UAO = 0xc214, // 11 000 0100 0010 100 + // v8.2a "Statistical Profiling extension" registers + PMBLIMITR_EL1 = 0xc4d0, // 11 000 1001 1010 000 + PMBPTR_EL1 = 0xc4d1, // 11 000 1001 1010 001 + PMBSR_EL1 = 0xc4d3, // 11 000 1001 1010 011 + PMBIDR_EL1 = 0xc4d7, // 11 000 1001 1010 111 + PMSCR_EL2 = 0xe4c8, // 11 100 1001 1001 000 + PMSCR_EL12 = 0xecc8, // 11 101 1001 1001 000 + PMSCR_EL1 = 0xc4c8, // 11 000 1001 1001 000 + PMSICR_EL1 = 0xc4ca, // 11 000 1001 1001 010 + PMSIRR_EL1 = 0xc4cb, // 11 000 1001 1001 011 + PMSFCR_EL1 = 0xc4cc, // 11 000 1001 1001 100 + PMSEVFR_EL1 = 0xc4cd, // 11 000 1001 1001 101 + PMSLATFR_EL1 = 0xc4ce, // 11 000 1001 1001 110 + PMSIDR_EL1 = 0xc4cf, // 11 000 1001 1001 111 + // Cyclone specific system registers CPM_IOACC_CTL_EL3 = 0xff90, }; diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h index 80766086e15c..a620e85101e6 100644 --- a/lib/Target/AMDGPU/AMDGPU.h +++ b/lib/Target/AMDGPU/AMDGPU.h @@ -48,7 +48,6 @@ FunctionPass *createSIFixSGPRCopiesPass(); FunctionPass *createSIFixSGPRLiveRangesPass(); FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS); FunctionPass *createSIInsertWaits(TargetMachine &tm); -FunctionPass *createSIPrepareScratchRegs(); ModulePass *createAMDGPUAnnotateKernelFeaturesPass(); void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &); diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 314ef721c1fc..b677caa6c2c6 100644 --- 
a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -123,6 +123,48 @@ void AMDGPUAsmPrinter::EmitFunctionEntryLabel() { AsmPrinter::EmitFunctionEntryLabel(); } +static bool isModuleLinkage(const GlobalValue *GV) { + switch (GV->getLinkage()) { + case GlobalValue::InternalLinkage: + case GlobalValue::CommonLinkage: + return true; + case GlobalValue::ExternalLinkage: + return false; + default: llvm_unreachable("unknown linkage type"); + } +} + +void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { + + if (TM.getTargetTriple().getOS() != Triple::AMDHSA || + GV->isDeclaration()) { + AsmPrinter::EmitGlobalVariable(GV); + return; + } + + // Group segment variables aren't emitted in HSA. + if (AMDGPU::isGroupSegment(GV)) + return; + + AMDGPUTargetStreamer *TS = + static_cast(OutStreamer->getTargetStreamer()); + if (isModuleLinkage(GV)) { + TS->EmitAMDGPUHsaModuleScopeGlobal(GV->getName()); + } else { + TS->EmitAMDGPUHsaProgramScopeGlobal(GV->getName()); + } + + const DataLayout &DL = getDataLayout(); + OutStreamer->PushSection(); + OutStreamer->SwitchSection( + getObjFileLowering().SectionForGlobal(GV, *Mang, TM)); + MCSymbol *GVSym = getSymbol(GV); + const Constant *C = GV->getInitializer(); + OutStreamer->EmitLabel(GVSym); + EmitGlobalConstant(DL, C); + OutStreamer->PopSection(); +} + bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { // The starting address of all shader programs must be 256 bytes aligned. @@ -401,6 +443,11 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, ProgInfo.NumSGPR = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG; } + if (MFI->NumUserSGPRs > STM.getMaxNumUserSGPRs()) { + LLVMContext &Ctx = MF.getFunction()->getContext(); + Ctx.emitError("too many user SGPRs used"); + } + ProgInfo.VGPRBlocks = (ProgInfo.NumVGPR - 1) / 4; ProgInfo.SGPRBlocks = (ProgInfo.NumSGPR - 1) / 8; // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode @@ -452,18 +499,27 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, S_00B848_FLOAT_MODE(ProgInfo.FloatMode) | S_00B848_PRIV(ProgInfo.Priv) | S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp) | - S_00B848_IEEE_MODE(ProgInfo.DebugMode) | + S_00B848_DEBUG_MODE(ProgInfo.DebugMode) | S_00B848_IEEE_MODE(ProgInfo.IEEEMode); + // 0 = X, 1 = XY, 2 = XYZ + unsigned TIDIGCompCnt = 0; + if (MFI->hasWorkItemIDZ()) + TIDIGCompCnt = 2; + else if (MFI->hasWorkItemIDY()) + TIDIGCompCnt = 1; + ProgInfo.ComputePGMRSrc2 = S_00B84C_SCRATCH_EN(ProgInfo.ScratchBlocks > 0) | - S_00B84C_USER_SGPR(MFI->NumUserSGPRs) | - S_00B84C_TGID_X_EN(1) | - S_00B84C_TGID_Y_EN(1) | - S_00B84C_TGID_Z_EN(1) | - S_00B84C_TG_SIZE_EN(1) | - S_00B84C_TIDIG_COMP_CNT(2) | - S_00B84C_LDS_SIZE(ProgInfo.LDSBlocks); + S_00B84C_USER_SGPR(MFI->getNumUserSGPRs()) | + S_00B84C_TGID_X_EN(MFI->hasWorkGroupIDX()) | + S_00B84C_TGID_Y_EN(MFI->hasWorkGroupIDY()) | + S_00B84C_TGID_Z_EN(MFI->hasWorkGroupIDZ()) | + S_00B84C_TG_SIZE_EN(MFI->hasWorkGroupInfo()) | + S_00B84C_TIDIG_COMP_CNT(TIDIGCompCnt) | + S_00B84C_EXCP_EN_MSB(0) | + S_00B84C_LDS_SIZE(ProgInfo.LDSBlocks) | + S_00B84C_EXCP_EN(0); } static unsigned getRsrcReg(unsigned ShaderType) { @@ -524,9 +580,44 @@ void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF, header.compute_pgm_resource_registers = KernelInfo.ComputePGMRSrc1 | (KernelInfo.ComputePGMRSrc2 << 32); - header.code_properties = - AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR | - AMD_CODE_PROPERTY_IS_PTR64; + header.code_properties = AMD_CODE_PROPERTY_IS_PTR64; + + 
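+  // Each user SGPR input that argument lowering actually enabled sets its
+  // AMD_CODE_PROPERTY_ENABLE_SGPR_* bit below; inputs that are not used are
+  // no longer advertised unconditionally.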
if (MFI->hasPrivateSegmentBuffer()) { + header.code_properties |= + AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER; + } + + if (MFI->hasDispatchPtr()) + header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; + + if (MFI->hasQueuePtr()) + header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR; + + if (MFI->hasKernargSegmentPtr()) + header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR; + + if (MFI->hasDispatchID()) + header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID; + + if (MFI->hasFlatScratchInit()) + header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT; + + // TODO: Private segment size + + if (MFI->hasGridWorkgroupCountX()) { + header.code_properties |= + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X; + } + + if (MFI->hasGridWorkgroupCountY()) { + header.code_properties |= + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y; + } + + if (MFI->hasGridWorkgroupCountZ()) { + header.code_properties |= + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z; + } if (MFI->hasDispatchPtr()) header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h index 687f239ecab5..1aaef00a4dd0 100644 --- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h +++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h @@ -103,6 +103,8 @@ class AMDGPUAsmPrinter : public AsmPrinter { void EmitFunctionEntryLabel() override; + void EmitGlobalVariable(const GlobalVariable *GV) override; + bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, unsigned AsmVariant, const char *ExtraCode, raw_ostream &O) override; diff --git a/lib/Target/AMDGPU/AMDGPUHSATargetObjectFile.cpp b/lib/Target/AMDGPU/AMDGPUHSATargetObjectFile.cpp index fa54f4a017cb..32f53edeb770 100644 --- a/lib/Target/AMDGPU/AMDGPUHSATargetObjectFile.cpp +++ b/lib/Target/AMDGPU/AMDGPUHSATargetObjectFile.cpp @@ -8,6 +8,7 @@ //===----------------------------------------------------------------------===// #include "AMDGPUHSATargetObjectFile.h" +#include "AMDGPU.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCSectionELF.h" @@ -22,6 +23,30 @@ void AMDGPUHSATargetObjectFile::Initialize(MCContext &Ctx, TextSection = AMDGPU::getHSATextSection(Ctx); + DataGlobalAgentSection = AMDGPU::getHSADataGlobalAgentSection(Ctx); + DataGlobalProgramSection = AMDGPU::getHSADataGlobalProgramSection(Ctx); + + RodataReadonlyAgentSection = AMDGPU::getHSARodataReadonlyAgentSection(Ctx); +} + +bool AMDGPUHSATargetObjectFile::isAgentAllocationSection( + const char *SectionName) const { + return cast(DataGlobalAgentSection) + ->getSectionName() + .equals(SectionName); +} + +bool AMDGPUHSATargetObjectFile::isAgentAllocation(const GlobalValue *GV) const { + // Read-only segments can only have agent allocation. + return AMDGPU::isReadOnlySegment(GV) || + (AMDGPU::isGlobalSegment(GV) && GV->hasSection() && + isAgentAllocationSection(GV->getSection())); +} + +bool AMDGPUHSATargetObjectFile::isProgramAllocation( + const GlobalValue *GV) const { + // The default for global segments is program allocation. 
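+  // (A global-segment variable defaults to program allocation unless
+  // isAgentAllocation() above already claimed it, i.e. it is read-only or was
+  // explicitly placed in the agent data section.)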
+ return AMDGPU::isGlobalSegment(GV) && !isAgentAllocation(GV); } MCSection *AMDGPUHSATargetObjectFile::SelectSectionForGlobal( @@ -31,5 +56,16 @@ MCSection *AMDGPUHSATargetObjectFile::SelectSectionForGlobal( if (Kind.isText() && !GV->hasComdat()) return getTextSection(); + if (AMDGPU::isGlobalSegment(GV)) { + if (isAgentAllocation(GV)) + return DataGlobalAgentSection; + + if (isProgramAllocation(GV)) + return DataGlobalProgramSection; + } + + if (Kind.isReadOnly() && AMDGPU::isReadOnlySegment(GV)) + return RodataReadonlyAgentSection; + return TargetLoweringObjectFileELF::SelectSectionForGlobal(GV, Kind, Mang, TM); } diff --git a/lib/Target/AMDGPU/AMDGPUHSATargetObjectFile.h b/lib/Target/AMDGPU/AMDGPUHSATargetObjectFile.h index e1aca67b97c2..9ea51ec9b29e 100644 --- a/lib/Target/AMDGPU/AMDGPUHSATargetObjectFile.h +++ b/lib/Target/AMDGPU/AMDGPUHSATargetObjectFile.h @@ -22,6 +22,15 @@ namespace llvm { class AMDGPUHSATargetObjectFile final : public TargetLoweringObjectFileELF { +private: + MCSection *DataGlobalAgentSection; + MCSection *DataGlobalProgramSection; + MCSection *RodataReadonlyAgentSection; + + bool isAgentAllocationSection(const char *SectionName) const; + bool isAgentAllocation(const GlobalValue *GV) const; + bool isProgramAllocation(const GlobalValue *GV) const; + public: void Initialize(MCContext &Ctx, const TargetMachine &TM) override; diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 04a0c1d06aff..ea7c6429b7df 100644 --- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -11,6 +11,8 @@ /// \brief Defines an instruction selector for the AMDGPU target. // //===----------------------------------------------------------------------===// + +#include "AMDGPUDiagnosticInfoUnsupported.h" #include "AMDGPUInstrInfo.h" #include "AMDGPUISelLowering.h" // For AMDGPUISD #include "AMDGPURegisterInfo.h" @@ -458,41 +460,6 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { N = glueCopyToM0(N); break; } - case AMDGPUISD::REGISTER_LOAD: { - if (Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) - break; - SDValue Addr, Offset; - - SDLoc DL(N); - SelectADDRIndirect(N->getOperand(1), Addr, Offset); - const SDValue Ops[] = { - Addr, - Offset, - CurDAG->getTargetConstant(0, DL, MVT::i32), - N->getOperand(0), - }; - return CurDAG->getMachineNode(AMDGPU::SI_RegisterLoad, DL, - CurDAG->getVTList(MVT::i32, MVT::i64, - MVT::Other), - Ops); - } - case AMDGPUISD::REGISTER_STORE: { - if (Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) - break; - SDValue Addr, Offset; - SelectADDRIndirect(N->getOperand(2), Addr, Offset); - SDLoc DL(N); - const SDValue Ops[] = { - N->getOperand(1), - Addr, - Offset, - CurDAG->getTargetConstant(0, DL, MVT::i32), - N->getOperand(0), - }; - return CurDAG->getMachineNode(AMDGPU::SI_RegisterStorePseudo, DL, - CurDAG->getVTList(MVT::Other), - Ops); - } case AMDGPUISD::BFE_I32: case AMDGPUISD::BFE_U32: { @@ -1062,36 +1029,10 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc, SDLoc DL(Addr); MachineFunction &MF = CurDAG->getMachineFunction(); - const SIRegisterInfo *TRI = - static_cast(Subtarget->getRegisterInfo()); - MachineRegisterInfo &MRI = MF.getRegInfo(); - const SITargetLowering& Lowering = - *static_cast(getTargetLowering()); - - unsigned ScratchOffsetReg = - TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET); - Lowering.CreateLiveInRegister(*CurDAG, &AMDGPU::SReg_32RegClass, - ScratchOffsetReg, MVT::i32); - 
SDValue Sym0 = CurDAG->getExternalSymbol("SCRATCH_RSRC_DWORD0", MVT::i32); - SDValue ScratchRsrcDword0 = - SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, Sym0), 0); - - SDValue Sym1 = CurDAG->getExternalSymbol("SCRATCH_RSRC_DWORD1", MVT::i32); - SDValue ScratchRsrcDword1 = - SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, Sym1), 0); + const SIMachineFunctionInfo *Info = MF.getInfo(); - const SDValue RsrcOps[] = { - CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32), - ScratchRsrcDword0, - CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32), - ScratchRsrcDword1, - CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32), - }; - SDValue ScratchPtr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL, - MVT::v2i32, RsrcOps), 0); - Rsrc = SDValue(Lowering.buildScratchRSRC(*CurDAG, DL, ScratchPtr), 0); - SOffset = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL, - MRI.getLiveInVirtReg(ScratchOffsetReg), MVT::i32); + Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32); + SOffset = CurDAG->getRegister(Info->getScratchWaveOffsetReg(), MVT::i32); // (add n0, c1) if (CurDAG->isBaseWithConstantOffset(Addr)) { @@ -1269,13 +1210,14 @@ SDNode *AMDGPUDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) { AddrSpaceCastSDNode *ASC = cast(N); SDLoc DL(N); + const MachineFunction &MF = CurDAG->getMachineFunction(); + DiagnosticInfoUnsupported NotImplemented(*MF.getFunction(), + "addrspacecast not implemented"); + CurDAG->getContext()->diagnose(NotImplemented); + assert(Subtarget->hasFlatAddressSpace() && "addrspacecast only supported with flat address space!"); - assert((ASC->getSrcAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS && - ASC->getDestAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) && - "Cannot cast address space to / from constant address!"); - assert((ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS || ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) && "Can only cast to / from flat address space!"); diff --git a/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp index 21c7da663234..7c595d5a83e4 100644 --- a/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp +++ b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp @@ -13,6 +13,7 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) : MachineFunctionInfo(), ShaderType(ShaderType::COMPUTE), LDSSize(0), + ABIArgOffset(0), ScratchSize(0), IsKernel(true) { Attribute A = MF.getFunction()->getFnAttribute(ShaderTypeAttribute); diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h index 9921630326b4..971b5179b13c 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -303,6 +303,9 @@ class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo { return isAmdHsaOS() ? 
0 : 36; } + unsigned getMaxNumUserSGPRs() const { + return 16; + } }; } // End namespace llvm diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 4e31c7ab4d4c..7b0445db4df2 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -327,7 +327,6 @@ void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) { } void GCNPassConfig::addPostRegAlloc() { - addPass(createSIPrepareScratchRegs(), false); addPass(createSIShrinkInstructionsPass(), false); } diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 6dacc742b129..4afcc60984fc 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -80,3 +80,14 @@ unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) { // Semi-arbitrary large amount. return 64; } + +int AMDGPUTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, + unsigned Index) { + switch (Opcode) { + case Instruction::ExtractElement: + // Dynamic indexing isn't free and is best avoided. + return Index == ~0u ? 2 : 0; + default: + return BaseT::getVectorInstrCost(Opcode, ValTy, Index); + } +} diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index dee0a69d1e68..5a94a0ba4706 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -60,6 +60,8 @@ class AMDGPUTTIImpl : public BasicTTIImplBase { unsigned getNumberOfRegisters(bool Vector); unsigned getRegisterBitWidth(bool Vector); unsigned getMaxInterleaveFactor(unsigned VF); + + int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index); }; } // end namespace llvm diff --git a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp index f1b383017901..cdbd12092150 100644 --- a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp +++ b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp @@ -1570,8 +1570,7 @@ void AMDGPUCFGStructurizer::mergeLooplandBlock(MachineBasicBlock *DstBlk, insertInstrBefore(DstBlk, AMDGPU::WHILELOOP, DebugLoc()); insertInstrEnd(DstBlk, AMDGPU::ENDLOOP, DebugLoc()); - DstBlk->addSuccessor(LandMBB); - DstBlk->removeSuccessor(DstBlk); + DstBlk->replaceSuccessor(DstBlk, LandMBB); } @@ -1666,8 +1665,7 @@ AMDGPUCFGStructurizer::cloneBlockForPredecessor(MachineBasicBlock *MBB, replaceInstrUseOfBlockWith(PredMBB, MBB, CloneMBB); //srcBlk, oldBlk, newBlk - PredMBB->removeSuccessor(MBB); - PredMBB->addSuccessor(CloneMBB); + PredMBB->replaceSuccessor(MBB, CloneMBB); // add all successor to cloneBlk cloneSuccessorList(CloneMBB, MBB); diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index db5cebf6e42c..7359cfee7f27 100644 --- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -332,6 +332,14 @@ class AMDGPUAsmParser : public MCTargetAsmParser { unsigned ForcedEncodingSize; + bool isSI() const { + return STI->getFeatureBits()[AMDGPU::FeatureSouthernIslands]; + } + + bool isCI() const { + return STI->getFeatureBits()[AMDGPU::FeatureSeaIslands]; + } + bool isVI() const { return getSTI().getFeatureBits()[AMDGPU::FeatureVolcanicIslands]; } @@ -357,6 +365,11 @@ class AMDGPUAsmParser : public MCTargetAsmParser { bool ParseSectionDirectiveHSAText(); bool subtargetHasRegister(const MCRegisterInfo &MRI, unsigned RegNo) const; bool 
ParseDirectiveAMDGPUHsaKernel(); + bool ParseDirectiveAMDGPUHsaModuleGlobal(); + bool ParseDirectiveAMDGPUHsaProgramGlobal(); + bool ParseSectionDirectiveHSADataGlobalAgent(); + bool ParseSectionDirectiveHSADataGlobalProgram(); + bool ParseSectionDirectiveHSARodataReadonlyAgent(); public: public: @@ -504,12 +517,14 @@ bool AMDGPUAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &End const AsmToken Tok = Parser.getTok(); StartLoc = Tok.getLoc(); EndLoc = Tok.getEndLoc(); + const MCRegisterInfo *TRI = getContext().getRegisterInfo(); + StringRef RegName = Tok.getString(); RegNo = getRegForName(RegName); if (RegNo) { Parser.Lex(); - return false; + return !subtargetHasRegister(*TRI, RegNo); } // Match vgprs and sgprs @@ -562,7 +577,6 @@ bool AMDGPUAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &End } } - const MCRegisterInfo *TRI = getContext().getRegisterInfo(); int RCID = getRegClass(IsVgpr, RegWidth); if (RCID == -1) return true; @@ -957,6 +971,46 @@ bool AMDGPUAsmParser::ParseDirectiveAMDGPUHsaKernel() { return false; } +bool AMDGPUAsmParser::ParseDirectiveAMDGPUHsaModuleGlobal() { + if (getLexer().isNot(AsmToken::Identifier)) + return TokError("expected symbol name"); + + StringRef GlobalName = Parser.getTok().getIdentifier(); + + getTargetStreamer().EmitAMDGPUHsaModuleScopeGlobal(GlobalName); + Lex(); + return false; +} + +bool AMDGPUAsmParser::ParseDirectiveAMDGPUHsaProgramGlobal() { + if (getLexer().isNot(AsmToken::Identifier)) + return TokError("expected symbol name"); + + StringRef GlobalName = Parser.getTok().getIdentifier(); + + getTargetStreamer().EmitAMDGPUHsaProgramScopeGlobal(GlobalName); + Lex(); + return false; +} + +bool AMDGPUAsmParser::ParseSectionDirectiveHSADataGlobalAgent() { + getParser().getStreamer().SwitchSection( + AMDGPU::getHSADataGlobalAgentSection(getContext())); + return false; +} + +bool AMDGPUAsmParser::ParseSectionDirectiveHSADataGlobalProgram() { + getParser().getStreamer().SwitchSection( + AMDGPU::getHSADataGlobalProgramSection(getContext())); + return false; +} + +bool AMDGPUAsmParser::ParseSectionDirectiveHSARodataReadonlyAgent() { + getParser().getStreamer().SwitchSection( + AMDGPU::getHSARodataReadonlyAgentSection(getContext())); + return false; +} + bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) { StringRef IDVal = DirectiveID.getString(); @@ -975,14 +1029,41 @@ bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) { if (IDVal == ".amdgpu_hsa_kernel") return ParseDirectiveAMDGPUHsaKernel(); + if (IDVal == ".amdgpu_hsa_module_global") + return ParseDirectiveAMDGPUHsaModuleGlobal(); + + if (IDVal == ".amdgpu_hsa_program_global") + return ParseDirectiveAMDGPUHsaProgramGlobal(); + + if (IDVal == ".hsadata_global_agent") + return ParseSectionDirectiveHSADataGlobalAgent(); + + if (IDVal == ".hsadata_global_program") + return ParseSectionDirectiveHSADataGlobalProgram(); + + if (IDVal == ".hsarodata_readonly_agent") + return ParseSectionDirectiveHSARodataReadonlyAgent(); + return true; } bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI, unsigned RegNo) const { - if (!isVI()) + if (isCI()) return true; + if (isSI()) { + // No flat_scr + switch (RegNo) { + case AMDGPU::FLAT_SCR: + case AMDGPU::FLAT_SCR_LO: + case AMDGPU::FLAT_SCR_HI: + return false; + default: + return true; + } + } + // VI only has 102 SGPRs, so make sure we aren't trying to use the 2 more that // SI/CI have. 
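+  // (i.e. reject SGPR102/SGPR103, and any register aliasing them, when the
+  // subtarget is VI.)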
for (MCRegAliasIterator R(AMDGPU::SGPR102_SGPR103, &MRI, true); diff --git a/lib/Target/AMDGPU/CMakeLists.txt b/lib/Target/AMDGPU/CMakeLists.txt index 7a4b5bb6d359..64c9e1882e4f 100644 --- a/lib/Target/AMDGPU/CMakeLists.txt +++ b/lib/Target/AMDGPU/CMakeLists.txt @@ -57,7 +57,6 @@ add_llvm_target(AMDGPUCodeGen SILowerControlFlow.cpp SILowerI1Copies.cpp SIMachineFunctionInfo.cpp - SIPrepareScratchRegs.cpp SIRegisterInfo.cpp SIShrinkInstructions.cpp SITypeRewriter.cpp diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp index d79ffdf52a74..68b1d1ae83cc 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp @@ -22,13 +22,6 @@ AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Triple &TT) : MCAsmInfoELF() { InlineAsmEnd = ";#ASMEND"; //===--- Data Emission Directives -------------------------------------===// - ZeroDirective = ".zero"; - AsciiDirective = ".ascii\t"; - AscizDirective = ".asciz\t"; - Data8bitsDirective = ".byte\t"; - Data16bitsDirective = ".short\t"; - Data32bitsDirective = ".long\t"; - Data64bitsDirective = ".quad\t"; SunStyleELFSectionSwitchSyntax = true; UsesELFSectionDirectiveForBSS = true; @@ -43,6 +36,8 @@ AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Triple &TT) : MCAsmInfoELF() { } bool AMDGPUMCAsmInfo::shouldOmitSectionDirective(StringRef SectionName) const { - return SectionName == ".hsatext" || + return SectionName == ".hsatext" || SectionName == ".hsadata_global_agent" || + SectionName == ".hsadata_global_program" || + SectionName == ".hsarodata_readonly_agent" || MCAsmInfo::shouldOmitSectionDirective(SectionName); } diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index c876fa906a9f..b91134d2ee9b 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -231,6 +231,16 @@ void AMDGPUTargetAsmStreamer::EmitAMDGPUSymbolType(StringRef SymbolName, } } +void AMDGPUTargetAsmStreamer::EmitAMDGPUHsaModuleScopeGlobal( + StringRef GlobalName) { + OS << "\t.amdgpu_hsa_module_global " << GlobalName << '\n'; +} + +void AMDGPUTargetAsmStreamer::EmitAMDGPUHsaProgramScopeGlobal( + StringRef GlobalName) { + OS << "\t.amdgpu_hsa_program_global " << GlobalName << '\n'; +} + //===----------------------------------------------------------------------===// // AMDGPUTargetELFStreamer //===----------------------------------------------------------------------===// @@ -316,3 +326,21 @@ void AMDGPUTargetELFStreamer::EmitAMDGPUSymbolType(StringRef SymbolName, getStreamer().getContext().getOrCreateSymbol(SymbolName)); Symbol->setType(ELF::STT_AMDGPU_HSA_KERNEL); } + +void AMDGPUTargetELFStreamer::EmitAMDGPUHsaModuleScopeGlobal( + StringRef GlobalName) { + + MCSymbolELF *Symbol = cast( + getStreamer().getContext().getOrCreateSymbol(GlobalName)); + Symbol->setType(ELF::STT_OBJECT); + Symbol->setBinding(ELF::STB_LOCAL); +} + +void AMDGPUTargetELFStreamer::EmitAMDGPUHsaProgramScopeGlobal( + StringRef GlobalName) { + + MCSymbolELF *Symbol = cast( + getStreamer().getContext().getOrCreateSymbol(GlobalName)); + Symbol->setType(ELF::STT_OBJECT); + Symbol->setBinding(ELF::STB_GLOBAL); +} diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h index 30fb3014f9ee..83bb728f541c 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h +++ 
b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h @@ -32,6 +32,10 @@ class AMDGPUTargetStreamer : public MCTargetStreamer { virtual void EmitAMDKernelCodeT(const amd_kernel_code_t &Header) = 0; virtual void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) = 0; + + virtual void EmitAMDGPUHsaModuleScopeGlobal(StringRef GlobalName) = 0; + + virtual void EmitAMDGPUHsaProgramScopeGlobal(StringRef GlobalName) = 0; }; class AMDGPUTargetAsmStreamer : public AMDGPUTargetStreamer { @@ -48,6 +52,10 @@ class AMDGPUTargetAsmStreamer : public AMDGPUTargetStreamer { void EmitAMDKernelCodeT(const amd_kernel_code_t &Header) override; void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) override; + + void EmitAMDGPUHsaModuleScopeGlobal(StringRef GlobalName) override; + + void EmitAMDGPUHsaProgramScopeGlobal(StringRef GlobalName) override; }; class AMDGPUTargetELFStreamer : public AMDGPUTargetStreamer { @@ -80,6 +88,10 @@ class AMDGPUTargetELFStreamer : public AMDGPUTargetStreamer { void EmitAMDKernelCodeT(const amd_kernel_code_t &Header) override; void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) override; + + void EmitAMDGPUHsaModuleScopeGlobal(StringRef GlobalName) override; + + void EmitAMDGPUHsaProgramScopeGlobal(StringRef GlobalName) override; }; } diff --git a/lib/Target/AMDGPU/SIFrameLowering.cpp b/lib/Target/AMDGPU/SIFrameLowering.cpp index a2d8fa1b0a10..6b3c81c3af74 100644 --- a/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -8,17 +8,227 @@ //==-----------------------------------------------------------------------===// #include "SIFrameLowering.h" +#include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/RegisterScavenging.h" using namespace llvm; + +static bool hasOnlySGPRSpills(const SIMachineFunctionInfo *FuncInfo, + const MachineFrameInfo *FrameInfo) { + if (!FuncInfo->hasSpilledSGPRs()) + return false; + + if (FuncInfo->hasSpilledVGPRs()) + return false; + + for (int I = FrameInfo->getObjectIndexBegin(), + E = FrameInfo->getObjectIndexEnd(); I != E; ++I) { + if (!FrameInfo->isSpillSlotObjectIndex(I)) + return false; + } + + return true; +} + +static ArrayRef getAllSGPR128() { + return makeArrayRef(AMDGPU::SReg_128RegClass.begin(), + AMDGPU::SReg_128RegClass.getNumRegs()); +} + +static ArrayRef getAllSGPRs() { + return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(), + AMDGPU::SGPR_32RegClass.getNumRegs()); +} + +void SIFrameLowering::emitPrologue(MachineFunction &MF, + MachineBasicBlock &MBB) const { + if (!MF.getFrameInfo()->hasStackObjects()) + return; + + assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported"); + + SIMachineFunctionInfo *MFI = MF.getInfo(); + + // If we only have SGPR spills, we won't actually be using scratch memory + // since these spill to VGPRs. + // + // FIXME: We should be cleaning up these unused SGPR spill frame indices + // somewhere. + if (hasOnlySGPRSpills(MFI, MF.getFrameInfo())) + return; + + const SIInstrInfo *TII = + static_cast(MF.getSubtarget().getInstrInfo()); + const SIRegisterInfo *TRI = &TII->getRegisterInfo(); + const AMDGPUSubtarget &ST = MF.getSubtarget(); + + // We need to insert initialization of the scratch resource descriptor. 
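+  // The code below (1) looks up the scratch rsrc and scratch wave offset
+  // registers chosen during argument lowering, (2) if they are still the
+  // tentatively reserved registers, shifts them down onto the first unused
+  // SGPRs, and (3) emits the copies (HSA) or relocation-based moves (non-HSA)
+  // that materialize the resource descriptor.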
+ unsigned ScratchRsrcReg = MFI->getScratchRSrcReg(); + assert(ScratchRsrcReg != AMDGPU::NoRegister); + + unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg(); + assert(ScratchWaveOffsetReg != AMDGPU::NoRegister); + + unsigned PreloadedScratchWaveOffsetReg = TRI->getPreloadedValue( + MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); + + unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister; + if (ST.isAmdHsaOS()) { + PreloadedPrivateBufferReg = TRI->getPreloadedValue( + MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER); + } + + // If we reserved the original input registers, we don't need to copy to the + // reserved registers. + if (ScratchRsrcReg == PreloadedPrivateBufferReg) { + // We should always reserve these 5 registers at the same time. + assert(ScratchWaveOffsetReg == PreloadedScratchWaveOffsetReg && + "scratch wave offset and private segment buffer inconsistent"); + return; + } + + + // We added live-ins during argument lowering, but since they were not used + // they were deleted. We're adding the uses now, so add them back. + MachineRegisterInfo &MRI = MF.getRegInfo(); + MRI.addLiveIn(PreloadedScratchWaveOffsetReg); + MBB.addLiveIn(PreloadedScratchWaveOffsetReg); + + if (ST.isAmdHsaOS()) { + MRI.addLiveIn(PreloadedPrivateBufferReg); + MBB.addLiveIn(PreloadedPrivateBufferReg); + } + + // We reserved the last registers for this. Shift it down to the end of those + // which were actually used. + // + // FIXME: It might be safer to use a pseudoregister before replacement. + + // FIXME: We should be able to eliminate unused input registers. We only + // cannot do this for the resources required for scratch access. For now we + // skip over user SGPRs and may leave unused holes. + + // We find the resource first because it has an alignment requirement. + if (ScratchRsrcReg == TRI->reservedPrivateSegmentBufferReg(MF)) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + + unsigned NumPreloaded = MFI->getNumPreloadedSGPRs() / 4; + // Skip the last 2 elements because the last one is reserved for VCC, and + // this is the 2nd to last element already. + for (MCPhysReg Reg : getAllSGPR128().drop_back(2).slice(NumPreloaded)) { + // Pick the first unallocated one. Make sure we don't clobber the other + // reserved input we needed. + if (!MRI.isPhysRegUsed(Reg)) { + assert(MRI.isAllocatable(Reg)); + MRI.replaceRegWith(ScratchRsrcReg, Reg); + ScratchRsrcReg = Reg; + MFI->setScratchRSrcReg(ScratchRsrcReg); + break; + } + } + } + + if (ScratchWaveOffsetReg == TRI->reservedPrivateSegmentWaveByteOffsetReg(MF)) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + // Skip the last 2 elements because the last one is reserved for VCC, and + // this is the 2nd to last element already. + unsigned NumPreloaded = MFI->getNumPreloadedSGPRs(); + for (MCPhysReg Reg : getAllSGPRs().drop_back(6).slice(NumPreloaded)) { + // Pick the first unallocated SGPR. Be careful not to pick an alias of the + // scratch descriptor, since we haven’t added its uses yet. 
+ if (!MRI.isPhysRegUsed(Reg)) { + assert(MRI.isAllocatable(Reg) && + !TRI->isSubRegisterEq(ScratchRsrcReg, Reg)); + + MRI.replaceRegWith(ScratchWaveOffsetReg, Reg); + ScratchWaveOffsetReg = Reg; + MFI->setScratchWaveOffsetReg(ScratchWaveOffsetReg); + break; + } + } + } + + + assert(!TRI->isSubRegister(ScratchRsrcReg, ScratchWaveOffsetReg)); + + const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32); + MachineBasicBlock::iterator I = MBB.begin(); + DebugLoc DL; + + if (PreloadedScratchWaveOffsetReg != ScratchWaveOffsetReg) { + // Make sure we emit the copy for the offset first. We may have chosen to copy + // the buffer resource into a register that aliases the input offset register. + BuildMI(MBB, I, DL, SMovB32, ScratchWaveOffsetReg) + .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill); + } + + if (ST.isAmdHsaOS()) { + // Insert copies from argument register. + assert( + !TRI->isSubRegisterEq(PreloadedPrivateBufferReg, ScratchRsrcReg) && + !TRI->isSubRegisterEq(PreloadedPrivateBufferReg, ScratchWaveOffsetReg)); + + unsigned Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1); + unsigned Rsrc23 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2_sub3); + + unsigned Lo = TRI->getSubReg(PreloadedPrivateBufferReg, AMDGPU::sub0_sub1); + unsigned Hi = TRI->getSubReg(PreloadedPrivateBufferReg, AMDGPU::sub2_sub3); + + const MCInstrDesc &SMovB64 = TII->get(AMDGPU::S_MOV_B64); + + BuildMI(MBB, I, DL, SMovB64, Rsrc01) + .addReg(Lo, RegState::Kill); + BuildMI(MBB, I, DL, SMovB64, Rsrc23) + .addReg(Hi, RegState::Kill); + } else { + unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0); + unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1); + unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2); + unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3); + + // Use relocations to get the pointer, and setup the other bits manually. + uint64_t Rsrc23 = TII->getScratchRsrcWords23(); + BuildMI(MBB, I, DL, SMovB32, Rsrc0) + .addExternalSymbol("SCRATCH_RSRC_DWORD0") + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + + BuildMI(MBB, I, DL, SMovB32, Rsrc1) + .addExternalSymbol("SCRATCH_RSRC_DWORD1") + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + + BuildMI(MBB, I, DL, SMovB32, Rsrc2) + .addImm(Rsrc23 & 0xffffffff) + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + + BuildMI(MBB, I, DL, SMovB32, Rsrc3) + .addImm(Rsrc23 >> 32) + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + } + + // Make the register selected live throughout the function. 
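+  // The entry block is skipped below: the prologue code just emitted defines
+  // (or copies into) these registers there, so only the other blocks need
+  // them as live-ins.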
+ for (MachineBasicBlock &OtherBB : MF) { + if (&OtherBB == &MBB) + continue; + + OtherBB.addLiveIn(ScratchRsrcReg); + OtherBB.addLiveIn(ScratchWaveOffsetReg); + } +} + void SIFrameLowering::processFunctionBeforeFrameFinalized( MachineFunction &MF, RegScavenger *RS) const { MachineFrameInfo *MFI = MF.getFrameInfo(); + + if (!MFI->hasStackObjects()) + return; + bool MayNeedScavengingEmergencySlot = MFI->hasStackObjects(); assert((RS || !MayNeedScavengingEmergencySlot) && diff --git a/lib/Target/AMDGPU/SIFrameLowering.h b/lib/Target/AMDGPU/SIFrameLowering.h index 677128d6ce0a..a9152fd8b2aa 100644 --- a/lib/Target/AMDGPU/SIFrameLowering.h +++ b/lib/Target/AMDGPU/SIFrameLowering.h @@ -21,6 +21,9 @@ class SIFrameLowering final : public AMDGPUFrameLowering { AMDGPUFrameLowering(D, StackAl, LAO, TransAl) {} ~SIFrameLowering() override {} + void emitPrologue(MachineFunction &MF, + MachineBasicBlock &MBB) const override; + void processFunctionBeforeFrameFinalized( MachineFunction &MF, RegScavenger *RS = nullptr) const override; diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index 5c67bf80c175..2cb801a707e1 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -492,6 +492,17 @@ EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, return MVT::Other; } +static bool isFlatGlobalAddrSpace(unsigned AS) { + return AS == AMDGPUAS::GLOBAL_ADDRESS || + AS == AMDGPUAS::FLAT_ADDRESS || + AS == AMDGPUAS::CONSTANT_ADDRESS; +} + +bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS, + unsigned DestAS) const { + return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS); +} + TargetLoweringBase::LegalizeTypeAction SITargetLowering::getPreferredVectorAction(EVT VT) const { if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16)) @@ -514,7 +525,7 @@ SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, MachineFunction &MF = DAG.getMachineFunction(); const SIRegisterInfo *TRI = static_cast(Subtarget->getRegisterInfo()); - unsigned InputPtrReg = TRI->getPreloadedValue(MF, SIRegisterInfo::INPUT_PTR); + unsigned InputPtrReg = TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR); Type *Ty = VT.getTypeForEVT(*DAG.getContext()); @@ -552,6 +563,7 @@ SDValue SITargetLowering::LowerFormalArguments( MachineFunction &MF = DAG.getMachineFunction(); FunctionType *FType = MF.getFunction()->getFunctionType(); SIMachineFunctionInfo *Info = MF.getInfo(); + const AMDGPUSubtarget &ST = MF.getSubtarget(); if (Subtarget->isAmdHsaOS() && Info->getShaderType() != ShaderType::COMPUTE) { const Function *Fn = MF.getFunction(); @@ -618,53 +630,30 @@ SDValue SITargetLowering::LowerFormalArguments( CCInfo.AllocateReg(AMDGPU::VGPR1); } - // The pointer to the list of arguments is stored in SGPR0, SGPR1 - // The pointer to the scratch buffer is stored in SGPR2, SGPR3 - if (Info->getShaderType() == ShaderType::COMPUTE) { - if (Subtarget->isAmdHsaOS()) - Info->NumUserSGPRs = 2; // FIXME: Need to support scratch buffers. 
- else - Info->NumUserSGPRs = 4; - - unsigned InputPtrReg = - TRI->getPreloadedValue(MF, SIRegisterInfo::INPUT_PTR); - unsigned InputPtrRegLo = - TRI->getPhysRegSubReg(InputPtrReg, &AMDGPU::SReg_32RegClass, 0); - unsigned InputPtrRegHi = - TRI->getPhysRegSubReg(InputPtrReg, &AMDGPU::SReg_32RegClass, 1); - - unsigned ScratchPtrReg = - TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_PTR); - unsigned ScratchPtrRegLo = - TRI->getPhysRegSubReg(ScratchPtrReg, &AMDGPU::SReg_32RegClass, 0); - unsigned ScratchPtrRegHi = - TRI->getPhysRegSubReg(ScratchPtrReg, &AMDGPU::SReg_32RegClass, 1); - - CCInfo.AllocateReg(InputPtrRegLo); - CCInfo.AllocateReg(InputPtrRegHi); - CCInfo.AllocateReg(ScratchPtrRegLo); - CCInfo.AllocateReg(ScratchPtrRegHi); - MF.addLiveIn(InputPtrReg, &AMDGPU::SReg_64RegClass); - MF.addLiveIn(ScratchPtrReg, &AMDGPU::SReg_64RegClass); - SIMachineFunctionInfo *MFI = MF.getInfo(); - if (Subtarget->isAmdHsaOS() && MFI->hasDispatchPtr()) { - unsigned DispatchPtrReg = - TRI->getPreloadedValue(MF, SIRegisterInfo::DISPATCH_PTR); - unsigned DispatchPtrRegLo = - TRI->getPhysRegSubReg(DispatchPtrReg, &AMDGPU::SReg_32RegClass, 0); - unsigned DispatchPtrRegHi = - TRI->getPhysRegSubReg(DispatchPtrReg, &AMDGPU::SReg_32RegClass, 1); - CCInfo.AllocateReg(DispatchPtrRegLo); - CCInfo.AllocateReg(DispatchPtrRegHi); - MF.addLiveIn(DispatchPtrReg, &AMDGPU::SReg_64RegClass); - } - } - if (Info->getShaderType() == ShaderType::COMPUTE) { getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins, Splits); } + // FIXME: How should these inputs interact with inreg / custom SGPR inputs? + if (Info->hasPrivateSegmentBuffer()) { + unsigned PrivateSegmentBufferReg = Info->addPrivateSegmentBuffer(*TRI); + MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SReg_128RegClass); + CCInfo.AllocateReg(PrivateSegmentBufferReg); + } + + if (Info->hasDispatchPtr()) { + unsigned DispatchPtrReg = Info->addDispatchPtr(*TRI); + MF.addLiveIn(DispatchPtrReg, &AMDGPU::SReg_64RegClass); + CCInfo.AllocateReg(DispatchPtrReg); + } + + if (Info->hasKernargSegmentPtr()) { + unsigned InputPtrReg = Info->addKernargSegmentPtr(*TRI); + MF.addLiveIn(InputPtrReg, &AMDGPU::SReg_64RegClass); + CCInfo.AllocateReg(InputPtrReg); + } + AnalyzeFormalArguments(CCInfo, Splits); SmallVector Chains; @@ -752,10 +741,113 @@ SDValue SITargetLowering::LowerFormalArguments( InVals.push_back(Val); } - if (Info->getShaderType() != ShaderType::COMPUTE) { - unsigned ScratchIdx = CCInfo.getFirstUnallocated(makeArrayRef( - AMDGPU::SGPR_32RegClass.begin(), AMDGPU::SGPR_32RegClass.getNumRegs())); - Info->ScratchOffsetReg = AMDGPU::SGPR_32RegClass.getRegister(ScratchIdx); + // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read + // these from the dispatch pointer. + + // Start adding system SGPRs. 
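+  // System SGPRs are appended after the user SGPRs set up above, in the fixed
+  // order the hardware initializes them: work-group IDs X/Y/Z, work-group
+  // info, then the private segment wave byte offset.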
+ if (Info->hasWorkGroupIDX()) { + unsigned Reg = Info->addWorkGroupIDX(); + MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass); + CCInfo.AllocateReg(Reg); + } else + llvm_unreachable("work group id x is always enabled"); + + if (Info->hasWorkGroupIDY()) { + unsigned Reg = Info->addWorkGroupIDY(); + MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass); + CCInfo.AllocateReg(Reg); + } + + if (Info->hasWorkGroupIDZ()) { + unsigned Reg = Info->addWorkGroupIDZ(); + MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass); + CCInfo.AllocateReg(Reg); + } + + if (Info->hasWorkGroupInfo()) { + unsigned Reg = Info->addWorkGroupInfo(); + MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass); + CCInfo.AllocateReg(Reg); + } + + if (Info->hasPrivateSegmentWaveByteOffset()) { + // Scratch wave offset passed in system SGPR. + unsigned PrivateSegmentWaveByteOffsetReg + = Info->addPrivateSegmentWaveByteOffset(); + + MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass); + CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg); + } + + // Now that we've figured out where the scratch register inputs are, see if + // should reserve the arguments and use them directly. + + bool HasStackObjects = MF.getFrameInfo()->hasStackObjects(); + + if (ST.isAmdHsaOS()) { + // TODO: Assume we will spill without optimizations. + if (HasStackObjects) { + // If we have stack objects, we unquestionably need the private buffer + // resource. For the HSA ABI, this will be the first 4 user SGPR + // inputs. We can reserve those and use them directly. + + unsigned PrivateSegmentBufferReg = TRI->getPreloadedValue( + MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER); + Info->setScratchRSrcReg(PrivateSegmentBufferReg); + + unsigned PrivateSegmentWaveByteOffsetReg = TRI->getPreloadedValue( + MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); + Info->setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg); + } else { + unsigned ReservedBufferReg + = TRI->reservedPrivateSegmentBufferReg(MF); + unsigned ReservedOffsetReg + = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF); + + // We tentatively reserve the last registers (skipping the last two + // which may contain VCC). After register allocation, we'll replace + // these with the ones immediately after those which were really + // allocated. In the prologue copies will be inserted from the argument + // to these reserved registers. + Info->setScratchRSrcReg(ReservedBufferReg); + Info->setScratchWaveOffsetReg(ReservedOffsetReg); + } + } else { + unsigned ReservedBufferReg = TRI->reservedPrivateSegmentBufferReg(MF); + + // Without HSA, relocations are used for the scratch pointer and the + // buffer resource setup is always inserted in the prologue. Scratch wave + // offset is still in an input SGPR. 
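+    // (The descriptor dwords themselves are filled in by
+    // SIFrameLowering::emitPrologue from the SCRATCH_RSRC_DWORD0/1 relocations
+    // and getScratchRsrcWords23().)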
+ Info->setScratchRSrcReg(ReservedBufferReg); + + if (HasStackObjects) { + unsigned ScratchWaveOffsetReg = TRI->getPreloadedValue( + MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); + Info->setScratchWaveOffsetReg(ScratchWaveOffsetReg); + } else { + unsigned ReservedOffsetReg + = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF); + Info->setScratchWaveOffsetReg(ReservedOffsetReg); + } + } + + if (Info->hasWorkItemIDX()) { + unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X); + MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); + CCInfo.AllocateReg(Reg); + } else + llvm_unreachable("workitem id x should always be enabled"); + + if (Info->hasWorkItemIDY()) { + unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y); + MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); + CCInfo.AllocateReg(Reg); + } + + if (Info->hasWorkItemIDZ()) { + unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z); + MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); + CCInfo.AllocateReg(Reg); } if (Chains.empty()) @@ -767,27 +859,11 @@ SDValue SITargetLowering::LowerFormalArguments( MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( MachineInstr * MI, MachineBasicBlock * BB) const { - MachineBasicBlock::iterator I = *MI; - const SIInstrInfo *TII = - static_cast(Subtarget->getInstrInfo()); - switch (MI->getOpcode()) { default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); case AMDGPU::BRANCH: return BB; - case AMDGPU::SI_RegisterStorePseudo: { - MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); - unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); - MachineInstrBuilder MIB = - BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::SI_RegisterStore), - Reg); - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) - MIB.addOperand(MI->getOperand(i)); - - MI->eraseFromParent(); - break; - } } return BB; } @@ -1051,6 +1127,18 @@ SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL, // a glue result. } +SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, + SDValue Op, + MVT VT, + unsigned Offset) const { + SDLoc SL(Op); + SDValue Param = LowerParameter(DAG, MVT::i32, MVT::i32, SL, + DAG.getEntryNode(), Offset, false); + // The local size values will have the hi 16-bits as zero. 
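+  // AssertZext records that guarantee in the DAG, so later zero-extends or
+  // masks of this value can be folded away.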
+ return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param, + DAG.getValueType(VT)); +} + SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); @@ -1088,37 +1176,36 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), SI::KernelInputOffsets::GLOBAL_SIZE_Z, false); case Intrinsic::r600_read_local_size_x: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::LOCAL_SIZE_X, false); + return lowerImplicitZextParam(DAG, Op, MVT::i16, + SI::KernelInputOffsets::LOCAL_SIZE_X); case Intrinsic::r600_read_local_size_y: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::LOCAL_SIZE_Y, false); + return lowerImplicitZextParam(DAG, Op, MVT::i16, + SI::KernelInputOffsets::LOCAL_SIZE_Y); case Intrinsic::r600_read_local_size_z: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::LOCAL_SIZE_Z, false); - + return lowerImplicitZextParam(DAG, Op, MVT::i16, + SI::KernelInputOffsets::LOCAL_SIZE_Z); case Intrinsic::AMDGPU_read_workdim: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - getImplicitParameterOffset(MFI, GRID_DIM), false); - + // Really only 2 bits. + return lowerImplicitZextParam(DAG, Op, MVT::i8, + getImplicitParameterOffset(MFI, GRID_DIM)); case Intrinsic::r600_read_tgid_x: return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_X), VT); + TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_X), VT); case Intrinsic::r600_read_tgid_y: return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_Y), VT); + TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Y), VT); case Intrinsic::r600_read_tgid_z: return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_Z), VT); + TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Z), VT); case Intrinsic::r600_read_tidig_x: return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_X), VT); + TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X), VT); case Intrinsic::r600_read_tidig_y: return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_Y), VT); + TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y), VT); case Intrinsic::r600_read_tidig_z: return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_Z), VT); + TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z), VT); case AMDGPUIntrinsic::SI_load_const: { SDValue Ops[] = { Op.getOperand(1), @@ -2332,15 +2419,6 @@ MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops); } -MachineSDNode *SITargetLowering::buildScratchRSRC(SelectionDAG &DAG, - SDLoc DL, - SDValue Ptr) const { - const SIInstrInfo *TII = - static_cast(Subtarget->getInstrInfo()); - - return buildRSRC(DAG, DL, Ptr, 0, TII->getScratchRsrcWords23()); -} - SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, unsigned Reg, EVT VT) const { diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h index a8b8ad34ed9d..b9f75cd11de0 100644 --- a/lib/Target/AMDGPU/SIISelLowering.h +++ b/lib/Target/AMDGPU/SIISelLowering.h @@ 
-28,6 +28,9 @@ class SITargetLowering : public AMDGPUTargetLowering { SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const override; + SDValue lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op, + MVT VT, unsigned Offset) const; + SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const; @@ -77,6 +80,8 @@ class SITargetLowering : public AMDGPUTargetLowering { bool MemcpyStrSrc, MachineFunction &MF) const override; + bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override; + TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(EVT VT) const override; @@ -113,10 +118,6 @@ class SITargetLowering : public AMDGPUTargetLowering { SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const; - MachineSDNode *buildScratchRSRC(SelectionDAG &DAG, - SDLoc DL, - SDValue Ptr) const; - std::pair getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override; diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp index 9a85a1d515fe..a3a2d8c01eb5 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -556,10 +556,8 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, BuildMI(MBB, MI, DL, get(Opcode)) .addReg(SrcReg) // src .addFrameIndex(FrameIndex) // frame_idx - // Place-holder registers, these will be filled in by - // SIPrepareScratchRegs. - .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef) - .addReg(AMDGPU::SGPR0, RegState::Undef) + .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc + .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset .addMemOperand(MMO); } @@ -640,10 +638,8 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, unsigned Opcode = getVGPRSpillRestoreOpcode(RC->getSize()); BuildMI(MBB, MI, DL, get(Opcode), DestReg) .addFrameIndex(FrameIndex) // frame_idx - // Place-holder registers, these will be filled in by - // SIPrepareScratchRegs. - .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef) - .addReg(AMDGPU::SGPR0, RegState::Undef) + .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc + .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset .addMemOperand(MMO); } @@ -676,11 +672,14 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB, if (MFI->getShaderType() == ShaderType::COMPUTE && WorkGroupSize > WavefrontSize) { - unsigned TIDIGXReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_X); - unsigned TIDIGYReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_Y); - unsigned TIDIGZReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_Z); + unsigned TIDIGXReg + = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_X); + unsigned TIDIGYReg + = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Y); + unsigned TIDIGZReg + = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Z); unsigned InputPtrReg = - TRI->getPreloadedValue(*MF, SIRegisterInfo::INPUT_PTR); + TRI->getPreloadedValue(*MF, SIRegisterInfo::KERNARG_SEGMENT_PTR); for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) { if (!Entry.isLiveIn(Reg)) Entry.addLiveIn(Reg); @@ -872,20 +871,26 @@ MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr *MI, MachineOperand &Src1 = MI->getOperand(Src1Idx); - // Make sure it's legal to commute operands for VOP2. 
- if (isVOP2(*MI) && - (!isOperandLegal(MI, Src0Idx, &Src1) || - !isOperandLegal(MI, Src1Idx, &Src0))) { - return nullptr; + + if (isVOP2(*MI)) { + const MCInstrDesc &InstrDesc = MI->getDesc(); + // For VOP2 instructions, any operand type is valid to use for src0. Make + // sure we can use the src1 as src0. + // + // We could be stricter here and only allow commuting if there is a reason + // to do so. i.e. if both operands are VGPRs there is no real benefit, + // although MachineCSE attempts to find matches by commuting. + const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); + if (!isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) + return nullptr; } if (!Src1.isReg()) { // Allow commuting instructions with Imm operands. if (NewMI || !Src1.isImm() || - (!isVOP2(*MI) && !isVOP3(*MI))) { + (!isVOP2(*MI) && !isVOP3(*MI))) { return nullptr; } - // Be sure to copy the source modifiers to the right place. if (MachineOperand *Src0Mods = getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) { @@ -1721,6 +1726,41 @@ void SIInstrInfo::swapOperands(MachineBasicBlock::iterator Inst) const { Inst->addOperand(Op1); } +bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI, + const MCOperandInfo &OpInfo, + const MachineOperand &MO) const { + if (!MO.isReg()) + return false; + + unsigned Reg = MO.getReg(); + const TargetRegisterClass *RC = + TargetRegisterInfo::isVirtualRegister(Reg) ? + MRI.getRegClass(Reg) : + RI.getPhysRegClass(Reg); + + // In order to be legal, the common sub-class must be equal to the + // class of the current operand. For example: + // + // v_mov_b32 s0 ; Operand defined as vsrc_32 + // ; RI.getCommonSubClass(s0,vsrc_32) = sgpr ; LEGAL + // + // s_sendmsg 0, s0 ; Operand defined as m0reg + // ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL + + return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC; +} + +bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI, + const MCOperandInfo &OpInfo, + const MachineOperand &MO) const { + if (MO.isReg()) + return isLegalRegOperand(MRI, OpInfo, MO); + + // Handle non-register types that are treated like immediates. + assert(MO.isImm() || MO.isTargetIndex() || MO.isFI()); + return true; +} + bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx, const MachineOperand *MO) const { const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); @@ -1748,21 +1788,7 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx, if (MO->isReg()) { assert(DefinedRC); - const TargetRegisterClass *RC = - TargetRegisterInfo::isVirtualRegister(MO->getReg()) ? - MRI.getRegClass(MO->getReg()) : - RI.getPhysRegClass(MO->getReg()); - - // In order to be legal, the common sub-class must be equal to the - // class of the current operand. 
For example: - // - // v_mov_b32 s0 ; Operand defined as vsrc_32 - // ; RI.getCommonSubClass(s0,vsrc_32) = sgpr ; LEGAL - // - // s_sendmsg 0, s0 ; Operand defined as m0reg - // ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL - - return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC; + return isLegalRegOperand(MRI, OpInfo, *MO); } @@ -1777,6 +1803,81 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx, return isImmOperandLegal(MI, OpIdx, *MO); } +void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI, + MachineInstr *MI) const { + unsigned Opc = MI->getOpcode(); + const MCInstrDesc &InstrDesc = get(Opc); + + int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); + MachineOperand &Src1 = MI->getOperand(Src1Idx); + + // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32 + // we need to only have one constant bus use. + // + // Note we do not need to worry about literal constants here. They are + // disabled for the operand type for instructions because they will always + // violate the one constant bus use rule. + bool HasImplicitSGPR = findImplicitSGPRRead(*MI) != AMDGPU::NoRegister; + if (HasImplicitSGPR) { + int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); + MachineOperand &Src0 = MI->getOperand(Src0Idx); + + if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) + legalizeOpWithMove(MI, Src0Idx); + } + + // VOP2 src0 instructions support all operand types, so we don't need to check + // their legality. If src1 is already legal, we don't need to do anything. + if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1)) + return; + + // We do not use commuteInstruction here because it is too aggressive and will + // commute if it is possible. We only want to commute here if it improves + // legality. This can be called a fairly large number of times so don't waste + // compile time pointlessly swapping and checking legality again. + if (HasImplicitSGPR || !MI->isCommutable()) { + legalizeOpWithMove(MI, Src1Idx); + return; + } + + int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); + MachineOperand &Src0 = MI->getOperand(Src0Idx); + + // If src0 can be used as src1, commuting will make the operands legal. + // Otherwise we have to give up and insert a move. + // + // TODO: Other immediate-like operand kinds could be commuted if there was a + // MachineOperand::ChangeTo* for them. + if ((!Src1.isImm() && !Src1.isReg()) || + !isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) { + legalizeOpWithMove(MI, Src1Idx); + return; + } + + int CommutedOpc = commuteOpcode(*MI); + if (CommutedOpc == -1) { + legalizeOpWithMove(MI, Src1Idx); + return; + } + + MI->setDesc(get(CommutedOpc)); + + unsigned Src0Reg = Src0.getReg(); + unsigned Src0SubReg = Src0.getSubReg(); + bool Src0Kill = Src0.isKill(); + + if (Src1.isImm()) + Src0.ChangeToImmediate(Src1.getImm()); + else if (Src1.isReg()) { + Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill()); + Src0.setSubReg(Src1.getSubReg()); + } else + llvm_unreachable("Should only have register or immediate operands"); + + Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill); + Src1.setSubReg(Src0SubReg); +} + // Legalize VOP3 operands. 
Because all operand types are supported for any // operand, and since literal constants are not allowed and should never be // seen, we only need to worry about inserting copies if we use multiple SGPR @@ -1822,32 +1923,10 @@ void SIInstrInfo::legalizeOperandsVOP3( void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); - unsigned Opc = MI->getOpcode(); // Legalize VOP2 if (isVOP2(*MI)) { - int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); - int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); - - // Legalize src0 - if (!isOperandLegal(MI, Src0Idx)) - legalizeOpWithMove(MI, Src0Idx); - - // Legalize src1 - if (isOperandLegal(MI, Src1Idx)) - return; - - // Usually src0 of VOP2 instructions allow more types of inputs - // than src1, so try to commute the instruction to decrease our - // chances of having to insert a MOV instruction to legalize src1. - if (MI->isCommutable()) { - if (commuteInstruction(MI)) - // If we are successful in commuting, then we know MI is legal, so - // we are done. - return; - } - - legalizeOpWithMove(MI, Src1Idx); + legalizeOperandsVOP2(MRI, MI); return; } diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h index 2bce87f3bd06..8d18d29196f7 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.h +++ b/lib/Target/AMDGPU/SIInstrInfo.h @@ -381,6 +381,23 @@ class SIInstrInfo : public AMDGPUInstrInfo { bool isOperandLegal(const MachineInstr *MI, unsigned OpIdx, const MachineOperand *MO = nullptr) const; + /// \brief Check if \p MO would be a valid operand for the given operand + /// definition \p OpInfo. Note this does not attempt to validate constant bus + /// restrictions (e.g. literal constant usage). + bool isLegalVSrcOperand(const MachineRegisterInfo &MRI, + const MCOperandInfo &OpInfo, + const MachineOperand &MO) const; + + /// \brief Check if \p MO (a register operand) is a legal register for the + /// given operand description. + bool isLegalRegOperand(const MachineRegisterInfo &MRI, + const MCOperandInfo &OpInfo, + const MachineOperand &MO) const; + + /// \brief Legalize operands in \p MI by either commuting it or inserting a + /// copy of src1. + void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr *MI) const; + /// \brief Fix operands in \p MI to satisfy constant bus requirements. 
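+  /// VOP3 can read at most one SGPR through the constant bus, so any
+  /// additional SGPR operands are copied to VGPRs first.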
void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr *MI) const; diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td index bc2b0c6c07fb..2cee993d751c 100644 --- a/lib/Target/AMDGPU/SIInstructions.td +++ b/lib/Target/AMDGPU/SIInstructions.td @@ -1942,36 +1942,6 @@ def SI_KILL : InstSI < let Uses = [EXEC], Defs = [EXEC,VCC,M0] in { -//defm SI_ : RegisterLoadStore ; - -let UseNamedOperandTable = 1 in { - -def SI_RegisterLoad : InstSI < - (outs VGPR_32:$dst, SReg_64:$temp), - (ins FRAMEri32:$addr, i32imm:$chan), - "", [] -> { - let isRegisterLoad = 1; - let mayLoad = 1; -} - -class SIRegStore : InstSI < - outs, - (ins VGPR_32:$val, FRAMEri32:$addr, i32imm:$chan), - "", [] -> { - let isRegisterStore = 1; - let mayStore = 1; -} - -let usesCustomInserter = 1 in { -def SI_RegisterStorePseudo : SIRegStore<(outs)>; -} // End usesCustomInserter = 1 -def SI_RegisterStore : SIRegStore<(outs SReg_64:$temp)>; - - -} // End UseNamedOperandTable = 1 - class SI_INDIRECT_SRC : InstSI < (outs VGPR_32:$dst, SReg_64:$temp), (ins rc:$src, VSrc_32:$idx, i32imm:$off), diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 6269dce553f6..935aad427198 100644 --- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -30,15 +30,33 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) : AMDGPUMachineFunction(MF), TIDReg(AMDGPU::NoRegister), ScratchRSrcReg(AMDGPU::NoRegister), + ScratchWaveOffsetReg(AMDGPU::NoRegister), + PrivateSegmentBufferUserSGPR(AMDGPU::NoRegister), + DispatchPtrUserSGPR(AMDGPU::NoRegister), + QueuePtrUserSGPR(AMDGPU::NoRegister), + KernargSegmentPtrUserSGPR(AMDGPU::NoRegister), + DispatchIDUserSGPR(AMDGPU::NoRegister), + FlatScratchInitUserSGPR(AMDGPU::NoRegister), + PrivateSegmentSizeUserSGPR(AMDGPU::NoRegister), + GridWorkGroupCountXUserSGPR(AMDGPU::NoRegister), + GridWorkGroupCountYUserSGPR(AMDGPU::NoRegister), + GridWorkGroupCountZUserSGPR(AMDGPU::NoRegister), + WorkGroupIDXSystemSGPR(AMDGPU::NoRegister), + WorkGroupIDYSystemSGPR(AMDGPU::NoRegister), + WorkGroupIDZSystemSGPR(AMDGPU::NoRegister), + WorkGroupInfoSystemSGPR(AMDGPU::NoRegister), + PrivateSegmentWaveByteOffsetSystemSGPR(AMDGPU::NoRegister), LDSWaveSpillSize(0), PSInputAddr(0), NumUserSGPRs(0), + NumSystemSGPRs(0), HasSpilledSGPRs(false), HasSpilledVGPRs(false), + PrivateSegmentBuffer(false), DispatchPtr(false), QueuePtr(false), DispatchID(false), - KernargSegmentPtr(true), + KernargSegmentPtr(false), FlatScratchInit(false), GridWorkgroupCountX(false), GridWorkgroupCountY(false), @@ -47,13 +65,17 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) WorkGroupIDY(false), WorkGroupIDZ(false), WorkGroupInfo(false), + PrivateSegmentWaveByteOffset(false), WorkItemIDX(true), WorkItemIDY(false), WorkItemIDZ(false) { + const AMDGPUSubtarget &ST = MF.getSubtarget(); const Function *F = MF.getFunction(); - if (F->hasFnAttribute("amdgpu-dispatch-ptr")) - DispatchPtr = true; + const MachineFrameInfo *FrameInfo = MF.getFrameInfo(); + + if (getShaderType() == ShaderType::COMPUTE) + KernargSegmentPtr = true; if (F->hasFnAttribute("amdgpu-work-group-id-y")) WorkGroupIDY = true; @@ -66,6 +88,54 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) if (F->hasFnAttribute("amdgpu-work-item-id-z")) WorkItemIDZ = true; + + bool MaySpill = ST.isVGPRSpillingEnabled(this); + bool HasStackObjects = FrameInfo->hasStackObjects(); + + if (HasStackObjects || 
MaySpill) + PrivateSegmentWaveByteOffset = true; + + if (ST.isAmdHsaOS()) { + if (HasStackObjects || MaySpill) + PrivateSegmentBuffer = true; + + if (F->hasFnAttribute("amdgpu-dispatch-ptr")) + DispatchPtr = true; + } + + // X, XY, and XYZ are the only supported combinations, so make sure Y is + // enabled if Z is. + if (WorkItemIDZ) + WorkItemIDY = true; +} + +unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer( + const SIRegisterInfo &TRI) { + PrivateSegmentBufferUserSGPR = TRI.getMatchingSuperReg( + getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_128RegClass); + NumUserSGPRs += 4; + return PrivateSegmentBufferUserSGPR; +} + +unsigned SIMachineFunctionInfo::addDispatchPtr(const SIRegisterInfo &TRI) { + DispatchPtrUserSGPR = TRI.getMatchingSuperReg( + getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass); + NumUserSGPRs += 2; + return DispatchPtrUserSGPR; +} + +unsigned SIMachineFunctionInfo::addQueuePtr(const SIRegisterInfo &TRI) { + QueuePtrUserSGPR = TRI.getMatchingSuperReg( + getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass); + NumUserSGPRs += 2; + return QueuePtrUserSGPR; +} + +unsigned SIMachineFunctionInfo::addKernargSegmentPtr(const SIRegisterInfo &TRI) { + KernargSegmentPtrUserSGPR = TRI.getMatchingSuperReg( + getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass); + NumUserSGPRs += 2; + return KernargSegmentPtrUserSGPR; } SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg( diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 377c5ce94846..9c528d63bd0e 100644 --- a/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -26,10 +26,36 @@ class MachineRegisterInfo; /// This class keeps track of the SPI_SP_INPUT_ADDR config register, which /// tells the hardware which interpolation parameters to load. class SIMachineFunctionInfo : public AMDGPUMachineFunction { + // FIXME: This should be removed and getPreloadedValue moved here. + friend struct SIRegisterInfo; void anchor() override; unsigned TIDReg; + + // Registers that may be reserved for spilling purposes. These may be the same + // as the input registers. unsigned ScratchRSrcReg; + unsigned ScratchWaveOffsetReg; + + // Input registers setup for the HSA ABI. + // User SGPRs in allocation order. + unsigned PrivateSegmentBufferUserSGPR; + unsigned DispatchPtrUserSGPR; + unsigned QueuePtrUserSGPR; + unsigned KernargSegmentPtrUserSGPR; + unsigned DispatchIDUserSGPR; + unsigned FlatScratchInitUserSGPR; + unsigned PrivateSegmentSizeUserSGPR; + unsigned GridWorkGroupCountXUserSGPR; + unsigned GridWorkGroupCountYUserSGPR; + unsigned GridWorkGroupCountZUserSGPR; + + // System SGPRs in allocation order. + unsigned WorkGroupIDXSystemSGPR; + unsigned WorkGroupIDYSystemSGPR; + unsigned WorkGroupIDZSystemSGPR; + unsigned WorkGroupInfoSystemSGPR; + unsigned PrivateSegmentWaveByteOffsetSystemSGPR; public: // FIXME: Make private @@ -38,12 +64,14 @@ class SIMachineFunctionInfo : public AMDGPUMachineFunction { std::map LaneVGPRs; unsigned ScratchOffsetReg; unsigned NumUserSGPRs; + unsigned NumSystemSGPRs; private: bool HasSpilledSGPRs; bool HasSpilledVGPRs; - // Feature bits required for inputs passed in user / system SGPRs. + // Feature bits required for inputs passed in user SGPRs. 
+ bool PrivateSegmentBuffer : 1; bool DispatchPtr : 1; bool QueuePtr : 1; bool DispatchID : 1; @@ -53,15 +81,27 @@ class SIMachineFunctionInfo : public AMDGPUMachineFunction { bool GridWorkgroupCountY : 1; bool GridWorkgroupCountZ : 1; + // Feature bits required for inputs passed in system SGPRs. bool WorkGroupIDX : 1; // Always initialized. bool WorkGroupIDY : 1; bool WorkGroupIDZ : 1; bool WorkGroupInfo : 1; + bool PrivateSegmentWaveByteOffset : 1; bool WorkItemIDX : 1; // Always initialized. bool WorkItemIDY : 1; bool WorkItemIDZ : 1; + + MCPhysReg getNextUserSGPR() const { + assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs"); + return AMDGPU::SGPR0 + NumUserSGPRs; + } + + MCPhysReg getNextSystemSGPR() const { + return AMDGPU::SGPR0 + NumUserSGPRs + NumSystemSGPRs; + } + public: struct SpilledReg { unsigned VGPR; @@ -80,6 +120,47 @@ class SIMachineFunctionInfo : public AMDGPUMachineFunction { unsigned getTIDReg() const { return TIDReg; }; void setTIDReg(unsigned Reg) { TIDReg = Reg; } + // Add user SGPRs. + unsigned addPrivateSegmentBuffer(const SIRegisterInfo &TRI); + unsigned addDispatchPtr(const SIRegisterInfo &TRI); + unsigned addQueuePtr(const SIRegisterInfo &TRI); + unsigned addKernargSegmentPtr(const SIRegisterInfo &TRI); + + // Add system SGPRs. + unsigned addWorkGroupIDX() { + WorkGroupIDXSystemSGPR = getNextSystemSGPR(); + NumSystemSGPRs += 1; + return WorkGroupIDXSystemSGPR; + } + + unsigned addWorkGroupIDY() { + WorkGroupIDYSystemSGPR = getNextSystemSGPR(); + NumSystemSGPRs += 1; + return WorkGroupIDYSystemSGPR; + } + + unsigned addWorkGroupIDZ() { + WorkGroupIDZSystemSGPR = getNextSystemSGPR(); + NumSystemSGPRs += 1; + return WorkGroupIDZSystemSGPR; + } + + unsigned addWorkGroupInfo() { + WorkGroupInfoSystemSGPR = getNextSystemSGPR(); + NumSystemSGPRs += 1; + return WorkGroupInfoSystemSGPR; + } + + unsigned addPrivateSegmentWaveByteOffset() { + PrivateSegmentWaveByteOffsetSystemSGPR = getNextSystemSGPR(); + NumSystemSGPRs += 1; + return PrivateSegmentWaveByteOffsetSystemSGPR; + } + + bool hasPrivateSegmentBuffer() const { + return PrivateSegmentBuffer; + } + bool hasDispatchPtr() const { return DispatchPtr; } @@ -128,6 +209,10 @@ class SIMachineFunctionInfo : public AMDGPUMachineFunction { return WorkGroupInfo; } + bool hasPrivateSegmentWaveByteOffset() const { + return PrivateSegmentWaveByteOffset; + } + bool hasWorkItemIDX() const { return WorkItemIDX; } @@ -140,13 +225,37 @@ class SIMachineFunctionInfo : public AMDGPUMachineFunction { return WorkItemIDZ; } + unsigned getNumUserSGPRs() const { + return NumUserSGPRs; + } + + unsigned getNumPreloadedSGPRs() const { + return NumUserSGPRs + NumSystemSGPRs; + } + + unsigned getPrivateSegmentWaveByteOffsetSystemSGPR() const { + return PrivateSegmentWaveByteOffsetSystemSGPR; + } + /// \brief Returns the physical register reserved for use as the resource /// descriptor for scratch accesses. 
unsigned getScratchRSrcReg() const { return ScratchRSrcReg; } - void setScratchRSrcReg(const SIRegisterInfo *TRI); + void setScratchRSrcReg(unsigned Reg) { + assert(Reg != AMDGPU::NoRegister && "Should never be unset"); + ScratchRSrcReg = Reg; + } + + unsigned getScratchWaveOffsetReg() const { + return ScratchWaveOffsetReg; + } + + void setScratchWaveOffsetReg(unsigned Reg) { + assert(Reg != AMDGPU::NoRegister && "Should never be unset"); + ScratchWaveOffsetReg = Reg; + } bool hasSpilledSGPRs() const { return HasSpilledSGPRs; diff --git a/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp b/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp deleted file mode 100644 index a6c22775e098..000000000000 --- a/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp +++ /dev/null @@ -1,196 +0,0 @@ -//===-- SIPrepareScratchRegs.cpp - Use predicates for control flow --------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// -/// This pass loads scratch pointer and scratch offset into a register or a -/// frame index which can be used anywhere in the program. These values will -/// be used for spilling VGPRs. -/// -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "AMDGPUSubtarget.h" -#include "SIDefines.h" -#include "SIInstrInfo.h" -#include "SIMachineFunctionInfo.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/RegisterScavenging.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/LLVMContext.h" - -using namespace llvm; - -namespace { - -class SIPrepareScratchRegs : public MachineFunctionPass { - -private: - static char ID; - -public: - SIPrepareScratchRegs() : MachineFunctionPass(ID) { } - - bool runOnMachineFunction(MachineFunction &MF) override; - - const char *getPassName() const override { - return "SI prepare scratch registers"; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - MachineFunctionPass::getAnalysisUsage(AU); - } -}; - -} // End anonymous namespace - -char SIPrepareScratchRegs::ID = 0; - -FunctionPass *llvm::createSIPrepareScratchRegs() { - return new SIPrepareScratchRegs(); -} - -bool SIPrepareScratchRegs::runOnMachineFunction(MachineFunction &MF) { - SIMachineFunctionInfo *MFI = MF.getInfo(); - const SIInstrInfo *TII = - static_cast(MF.getSubtarget().getInstrInfo()); - const SIRegisterInfo *TRI = &TII->getRegisterInfo(); - MachineRegisterInfo &MRI = MF.getRegInfo(); - MachineFrameInfo *FrameInfo = MF.getFrameInfo(); - MachineBasicBlock *Entry = &MF.front(); - MachineBasicBlock::iterator I = Entry->begin(); - DebugLoc DL = I->getDebugLoc(); - - // FIXME: If we don't have enough VGPRs for SGPR spilling we will need to - // run this pass. 
- if (!MFI->hasSpilledVGPRs()) - return false; - - unsigned ScratchPtrPreloadReg = - TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_PTR); - unsigned ScratchOffsetPreloadReg = - TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET); - - if (!Entry->isLiveIn(ScratchPtrPreloadReg)) - Entry->addLiveIn(ScratchPtrPreloadReg); - - if (!Entry->isLiveIn(ScratchOffsetPreloadReg)) - Entry->addLiveIn(ScratchOffsetPreloadReg); - - // Load the scratch offset. - unsigned ScratchOffsetReg = - TRI->findUnusedRegister(MRI, &AMDGPU::SGPR_32RegClass); - int ScratchOffsetFI = -1; - - if (ScratchOffsetReg != AMDGPU::NoRegister) { - // Found an SGPR to use - BuildMI(*Entry, I, DL, TII->get(AMDGPU::S_MOV_B32), ScratchOffsetReg) - .addReg(ScratchOffsetPreloadReg); - } else { - // No SGPR is available, we must spill. - ScratchOffsetFI = FrameInfo->CreateSpillStackObject(4,4); - BuildMI(*Entry, I, DL, TII->get(AMDGPU::SI_SPILL_S32_SAVE)) - .addReg(ScratchOffsetPreloadReg) - .addFrameIndex(ScratchOffsetFI) - .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef) - .addReg(AMDGPU::SGPR0, RegState::Undef); - } - - - // Now that we have the scratch pointer and offset values, we need to - // add them to all the SI_SPILL_V* instructions. - - RegScavenger RS; - unsigned ScratchRsrcFI = FrameInfo->CreateSpillStackObject(16, 4); - RS.addScavengingFrameIndex(ScratchRsrcFI); - - for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); - BI != BE; ++BI) { - - MachineBasicBlock &MBB = *BI; - // Add the scratch offset reg as a live-in so that the register scavenger - // doesn't re-use it. - if (!MBB.isLiveIn(ScratchOffsetReg) && - ScratchOffsetReg != AMDGPU::NoRegister) - MBB.addLiveIn(ScratchOffsetReg); - RS.enterBasicBlock(&MBB); - - for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); - I != E; ++I) { - MachineInstr &MI = *I; - RS.forward(I); - DebugLoc DL = MI.getDebugLoc(); - if (!TII->isVGPRSpill(MI)) - continue; - - // Scratch resource - unsigned ScratchRsrcReg = - RS.scavengeRegister(&AMDGPU::SReg_128RegClass, 0); - - uint64_t Rsrc23 = TII->getScratchRsrcWords23(); - - unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0); - unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1); - unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2); - unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3); - - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc0) - .addExternalSymbol("SCRATCH_RSRC_DWORD0") - .addReg(ScratchRsrcReg, RegState::ImplicitDefine); - - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc1) - .addExternalSymbol("SCRATCH_RSRC_DWORD1") - .addReg(ScratchRsrcReg, RegState::ImplicitDefine); - - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc2) - .addImm(Rsrc23 & 0xffffffff) - .addReg(ScratchRsrcReg, RegState::ImplicitDefine); - - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc3) - .addImm(Rsrc23 >> 32) - .addReg(ScratchRsrcReg, RegState::ImplicitDefine); - - // Scratch Offset - if (ScratchOffsetReg == AMDGPU::NoRegister) { - ScratchOffsetReg = RS.scavengeRegister(&AMDGPU::SGPR_32RegClass, 0); - BuildMI(MBB, I, DL, TII->get(AMDGPU::SI_SPILL_S32_RESTORE), - ScratchOffsetReg) - .addFrameIndex(ScratchOffsetFI) - .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef) - .addReg(AMDGPU::SGPR0, RegState::Undef); - } else if (!MBB.isLiveIn(ScratchOffsetReg)) { - MBB.addLiveIn(ScratchOffsetReg); - } - - if (ScratchRsrcReg == AMDGPU::NoRegister || - ScratchOffsetReg == AMDGPU::NoRegister) { - LLVMContext &Ctx = MF.getFunction()->getContext(); - 
Ctx.emitError("ran out of SGPRs for spilling VGPRs"); - ScratchRsrcReg = AMDGPU::SGPR0; - ScratchOffsetReg = AMDGPU::SGPR0; - } - MI.getOperand(2).setReg(ScratchRsrcReg); - MI.getOperand(2).setIsKill(true); - MI.getOperand(2).setIsUndef(false); - MI.getOperand(3).setReg(ScratchOffsetReg); - MI.getOperand(3).setIsUndef(false); - MI.getOperand(3).setIsKill(false); - MI.addOperand(MachineOperand::CreateReg(Rsrc0, false, true, true)); - MI.addOperand(MachineOperand::CreateReg(Rsrc1, false, true, true)); - MI.addOperand(MachineOperand::CreateReg(Rsrc2, false, true, true)); - MI.addOperand(MachineOperand::CreateReg(Rsrc3, false, true, true)); - } - } - return true; -} diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp index ab7539b6fb3a..bf87f0225272 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -32,6 +32,40 @@ void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) co Reserved.set(*R); } +unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg( + const MachineFunction &MF) const { + const AMDGPUSubtarget &ST = MF.getSubtarget(); + if (ST.hasSGPRInitBug()) { + unsigned BaseIdx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 4 - 4; + unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx)); + return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass); + } + + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + // 98/99 need to be reserved for flat_scr, and 100/101 for vcc. This is the + // next sgpr128 down. + return AMDGPU::SGPR92_SGPR93_SGPR94_SGPR95; + } + + return AMDGPU::SGPR96_SGPR97_SGPR98_SGPR99; +} + +unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg( + const MachineFunction &MF) const { + const AMDGPUSubtarget &ST = MF.getSubtarget(); + if (ST.hasSGPRInitBug()) { + unsigned Idx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 4 - 5; + return AMDGPU::SGPR_32RegClass.getRegister(Idx); + } + + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + // Next register before reservations for flat_scr and vcc. + return AMDGPU::SGPR97; + } + + return AMDGPU::SGPR95; +} + BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); Reserved.set(AMDGPU::INDIRECT_BASE_ADDR); @@ -68,6 +102,23 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { } } + const SIMachineFunctionInfo *MFI = MF.getInfo(); + + unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg(); + if (ScratchWaveOffsetReg != AMDGPU::NoRegister) { + // Reserve 1 SGPR for scratch wave offset in case we need to spill. + reserveRegisterTuples(Reserved, ScratchWaveOffsetReg); + } + + unsigned ScratchRSrcReg = MFI->getScratchRSrcReg(); + if (ScratchRSrcReg != AMDGPU::NoRegister) { + // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we need + // to spill. + // TODO: May need to reserve a VGPR if doing LDS spilling. + reserveRegisterTuples(Reserved, ScratchRSrcReg); + assert(!isSubRegister(ScratchRSrcReg, ScratchWaveOffsetReg)); + } + return Reserved; } @@ -188,11 +239,10 @@ void SIRegisterInfo::buildScratchLoadStore(MachineBasicBlock::iterator MI, unsigned SubReg = NumSubRegs > 1 ? 
getPhysRegSubReg(Value, &AMDGPU::VGPR_32RegClass, i) : Value; - bool IsKill = (i == e - 1); BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp)) .addReg(SubReg, getDefRegState(IsLoad)) - .addReg(ScratchRsrcReg, getKillRegState(IsKill)) + .addReg(ScratchRsrcReg) .addReg(SOffset) .addImm(Offset) .addImm(0) // glc @@ -243,6 +293,9 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, .addReg(SubReg) .addImm(Spill.Lane); + // FIXME: Since this spills to another register instead of an actual + // frame index, we should delete the frame index when all references to + // it are fixed. } MI->eraseFromParent(); break; @@ -507,36 +560,47 @@ bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const { return OpType == AMDGPU::OPERAND_REG_INLINE_C; } +// FIXME: Most of these are flexible with HSA and we don't need to reserve them +// as input registers if unused. Whether the dispatch ptr is necessary should be +// easy to detect from used intrinsics. Scratch setup is harder to know. unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF, enum PreloadedValue Value) const { - const AMDGPUSubtarget &ST = MF.getSubtarget(); const SIMachineFunctionInfo *MFI = MF.getInfo(); + const AMDGPUSubtarget &ST = MF.getSubtarget(); + (void)ST; switch (Value) { - case SIRegisterInfo::TGID_X: - return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 0); - case SIRegisterInfo::TGID_Y: - return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 1); - case SIRegisterInfo::TGID_Z: - return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 2); - case SIRegisterInfo::SCRATCH_WAVE_OFFSET: - if (MFI->getShaderType() != ShaderType::COMPUTE) - return MFI->ScratchOffsetReg; - return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 4); - case SIRegisterInfo::SCRATCH_PTR: - return AMDGPU::SGPR2_SGPR3; - case SIRegisterInfo::INPUT_PTR: - if (ST.isAmdHsaOS()) - return MFI->hasDispatchPtr() ? 
AMDGPU::SGPR2_SGPR3 : AMDGPU::SGPR0_SGPR1; - return AMDGPU::SGPR0_SGPR1; + case SIRegisterInfo::WORKGROUP_ID_X: + assert(MFI->hasWorkGroupIDX()); + return MFI->WorkGroupIDXSystemSGPR; + case SIRegisterInfo::WORKGROUP_ID_Y: + assert(MFI->hasWorkGroupIDY()); + return MFI->WorkGroupIDYSystemSGPR; + case SIRegisterInfo::WORKGROUP_ID_Z: + assert(MFI->hasWorkGroupIDZ()); + return MFI->WorkGroupIDZSystemSGPR; + case SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET: + return MFI->PrivateSegmentWaveByteOffsetSystemSGPR; + case SIRegisterInfo::PRIVATE_SEGMENT_BUFFER: + assert(ST.isAmdHsaOS() && "Non-HSA ABI currently uses relocations"); + assert(MFI->hasPrivateSegmentBuffer()); + return MFI->PrivateSegmentBufferUserSGPR; + case SIRegisterInfo::KERNARG_SEGMENT_PTR: + assert(MFI->hasKernargSegmentPtr()); + return MFI->KernargSegmentPtrUserSGPR; case SIRegisterInfo::DISPATCH_PTR: assert(MFI->hasDispatchPtr()); - return AMDGPU::SGPR0_SGPR1; - case SIRegisterInfo::TIDIG_X: + return MFI->DispatchPtrUserSGPR; + case SIRegisterInfo::QUEUE_PTR: + llvm_unreachable("not implemented"); + case SIRegisterInfo::WORKITEM_ID_X: + assert(MFI->hasWorkItemIDX()); return AMDGPU::VGPR0; - case SIRegisterInfo::TIDIG_Y: + case SIRegisterInfo::WORKITEM_ID_Y: + assert(MFI->hasWorkItemIDY()); return AMDGPU::VGPR1; - case SIRegisterInfo::TIDIG_Z: + case SIRegisterInfo::WORKITEM_ID_Z: + assert(MFI->hasWorkItemIDZ()); return AMDGPU::VGPR2; } llvm_unreachable("unexpected preloaded value type"); diff --git a/lib/Target/AMDGPU/SIRegisterInfo.h b/lib/Target/AMDGPU/SIRegisterInfo.h index 36f6d1c7a261..1795237c2140 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/lib/Target/AMDGPU/SIRegisterInfo.h @@ -18,6 +18,7 @@ #include "AMDGPURegisterInfo.h" #include "AMDGPUSubtarget.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/Support/Debug.h" namespace llvm { @@ -29,6 +30,15 @@ struct SIRegisterInfo : public AMDGPURegisterInfo { public: SIRegisterInfo(); + /// Return the end register initially reserved for the scratch buffer in case + /// spilling is needed. + unsigned reservedPrivateSegmentBufferReg(const MachineFunction &MF) const; + + /// Return the end register initially reserved for the scratch wave offset in + /// case spilling is needed. + unsigned reservedPrivateSegmentWaveByteOffsetReg( + const MachineFunction &MF) const; + BitVector getReservedRegs(const MachineFunction &MF) const override; unsigned getRegPressureSetLimit(const MachineFunction &MF, @@ -56,6 +66,12 @@ struct SIRegisterInfo : public AMDGPURegisterInfo { return isSGPRClass(getRegClass(RCID)); } + bool isSGPRReg(const MachineRegisterInfo &MRI, unsigned Reg) const { + if (TargetRegisterInfo::isVirtualRegister(Reg)) + return isSGPRClass(MRI.getRegClass(Reg)); + return getPhysRegClass(Reg); + } + /// \returns true if this class contains VGPR registers. bool hasVGPRs(const TargetRegisterClass *RC) const; @@ -93,23 +109,25 @@ struct SIRegisterInfo : public AMDGPURegisterInfo { /// \returns True if operands defined with this operand type can accept /// an inline constant. i.e. An integer value in the range (-16, 64) or - /// -4.0f, -2.0f, -1.0f, -0.5f, 0.0f, 0.5f, 1.0f, 2.0f, 4.0f. + /// -4.0f, -2.0f, -1.0f, -0.5f, 0.0f, 0.5f, 1.0f, 2.0f, 4.0f. 
bool opCanUseInlineConstant(unsigned OpType) const; enum PreloadedValue { // SGPRS: - SCRATCH_PTR = 0, + PRIVATE_SEGMENT_BUFFER = 0, DISPATCH_PTR = 1, - INPUT_PTR = 3, - TGID_X = 10, - TGID_Y = 11, - TGID_Z = 12, - SCRATCH_WAVE_OFFSET = 14, + QUEUE_PTR = 2, + KERNARG_SEGMENT_PTR = 3, + WORKGROUP_ID_X = 10, + WORKGROUP_ID_Y = 11, + WORKGROUP_ID_Z = 12, + PRIVATE_SEGMENT_WAVE_BYTE_OFFSET = 14, + // VGPRS: FIRST_VGPR_VALUE = 15, - TIDIG_X = FIRST_VGPR_VALUE, - TIDIG_Y = 16, - TIDIG_Z = 17, + WORKITEM_ID_X = FIRST_VGPR_VALUE, + WORKITEM_ID_Y = 16, + WORKITEM_ID_Z = 17 }; /// \brief Returns the physical register that \p Value is stored in. diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index e70f79d5a7be..441baed9b434 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -7,6 +7,8 @@ // //===----------------------------------------------------------------------===// #include "AMDGPUBaseInfo.h" +#include "AMDGPU.h" +#include "llvm/IR/GlobalValue.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/SubtargetFeature.h" @@ -66,5 +68,36 @@ MCSection *getHSATextSection(MCContext &Ctx) { ELF::SHF_AMDGPU_HSA_CODE); } +MCSection *getHSADataGlobalAgentSection(MCContext &Ctx) { + return Ctx.getELFSection(".hsadata_global_agent", ELF::SHT_PROGBITS, + ELF::SHF_ALLOC | ELF::SHF_WRITE | + ELF::SHF_AMDGPU_HSA_GLOBAL | + ELF::SHF_AMDGPU_HSA_AGENT); +} + +MCSection *getHSADataGlobalProgramSection(MCContext &Ctx) { + return Ctx.getELFSection(".hsadata_global_program", ELF::SHT_PROGBITS, + ELF::SHF_ALLOC | ELF::SHF_WRITE | + ELF::SHF_AMDGPU_HSA_GLOBAL); +} + +MCSection *getHSARodataReadonlyAgentSection(MCContext &Ctx) { + return Ctx.getELFSection(".hsarodata_readonly_agent", ELF::SHT_PROGBITS, + ELF::SHF_ALLOC | ELF::SHF_AMDGPU_HSA_READONLY | + ELF::SHF_AMDGPU_HSA_AGENT); +} + +bool isGroupSegment(const GlobalValue *GV) { + return GV->getType()->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; +} + +bool isGlobalSegment(const GlobalValue *GV) { + return GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS; +} + +bool isReadOnlySegment(const GlobalValue *GV) { + return GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS; +} + } // End namespace AMDGPU } // End namespace llvm diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 59a32a6b592d..7b3c858e7c36 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -15,6 +15,7 @@ namespace llvm { class FeatureBitset; +class GlobalValue; class MCContext; class MCSection; @@ -31,6 +32,16 @@ void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header, const FeatureBitset &Features); MCSection *getHSATextSection(MCContext &Ctx); +MCSection *getHSADataGlobalAgentSection(MCContext &Ctx); + +MCSection *getHSADataGlobalProgramSection(MCContext &Ctx); + +MCSection *getHSARodataReadonlyAgentSection(MCContext &Ctx); + +bool isGroupSegment(const GlobalValue *GV); +bool isGlobalSegment(const GlobalValue *GV); +bool isReadOnlySegment(const GlobalValue *GV); + } // end namespace AMDGPU } // end namespace llvm diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td index ceb48d83cd84..dd33c3614b1a 100644 --- a/lib/Target/ARM/ARM.td +++ b/lib/Target/ARM/ARM.td @@ -62,6 +62,9 @@ def FeatureVFP4 : SubtargetFeature<"vfp4", "HasVFPv4", "true", def FeatureFPARMv8 : SubtargetFeature<"fp-armv8", "HasFPARMv8", "true", "Enable ARMv8 FP", 
[FeatureVFP4]>; +def FeatureFullFP16 : SubtargetFeature<"fullfp16", "HasFullFP16", "true", + "Enable full half-precision floating point", + [FeatureFPARMv8]>; def FeatureD16 : SubtargetFeature<"d16", "HasD16", "true", "Restrict FP to 16 double registers">; def FeatureHWDiv : SubtargetFeature<"hwdiv", "HasHardwareDivide", "true", @@ -212,6 +215,9 @@ def HasV8Ops : SubtargetFeature<"v8", "HasV8Ops", "true", def HasV8_1aOps : SubtargetFeature<"v8.1a", "HasV8_1aOps", "true", "Support ARM v8.1a instructions", [HasV8Ops]>; +def HasV8_2aOps : SubtargetFeature<"v8.2a", "HasV8_2aOps", "true", + "Support ARM v8.2a instructions", + [HasV8_1aOps]>; //===----------------------------------------------------------------------===// @@ -232,6 +238,8 @@ def ProcA15 : SubtargetFeature<"a15", "ARMProcFamily", "CortexA15", "Cortex-A15 ARM processors", []>; def ProcA17 : SubtargetFeature<"a17", "ARMProcFamily", "CortexA17", "Cortex-A17 ARM processors", []>; +def ProcA35 : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35", + "Cortex-A35 ARM processors", []>; def ProcA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53", "Cortex-A53 ARM processors", []>; def ProcA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57", @@ -354,6 +362,18 @@ def ARMv81a : Architecture<"armv8.1-a", "ARMv81a", [HasV8_1aOps, FeatureCrypto, FeatureCRC]>; +def ARMv82a : Architecture<"armv8.2-a", "ARMv82a", [HasV8_2aOps, + FeatureAClass, + FeatureDB, + FeatureFPARMv8, + FeatureNEON, + FeatureDSP, + FeatureTrustZone, + FeatureMP, + FeatureVirtualization, + FeatureCrypto, + FeatureCRC]>; + // Aliases def IWMMXT : Architecture<"iwmmxt", "ARMv5te", [ARMv5te]>; def IWMMXT2 : Architecture<"iwmmxt2", "ARMv5te", [ARMv5te]>; @@ -585,6 +605,13 @@ def : ProcNoItin<"cortex-m7", [ARMv7em, FeatureD16]>; +def : ProcNoItin<"cortex-a35", [ARMv8a, ProcA35, + FeatureHWDiv, + FeatureHWDivARM, + FeatureT2XtPk, + FeatureCrypto, + FeatureCRC]>; + def : ProcNoItin<"cortex-a53", [ARMv8a, ProcA53, FeatureHWDiv, FeatureHWDivARM, diff --git a/lib/Target/ARM/ARMConstantIslandPass.cpp b/lib/Target/ARM/ARMConstantIslandPass.cpp index 0bf2d374df6a..e89757c19ecc 100644 --- a/lib/Target/ARM/ARMConstantIslandPass.cpp +++ b/lib/Target/ARM/ARMConstantIslandPass.cpp @@ -2274,8 +2274,7 @@ adjustJTTargetBlockForward(MachineBasicBlock *BB, MachineBasicBlock *JTBB) { // Update the CFG. NewBB->addSuccessor(BB); - JTBB->removeSuccessor(BB); - JTBB->addSuccessor(NewBB); + JTBB->replaceSuccessor(BB, NewBB); ++NumJTInserted; return NewBB; diff --git a/lib/Target/ARM/ARMFrameLowering.cpp b/lib/Target/ARM/ARMFrameLowering.cpp index 5b3229456317..c5990bb7d1fb 100644 --- a/lib/Target/ARM/ARMFrameLowering.cpp +++ b/lib/Target/ARM/ARMFrameLowering.cpp @@ -1889,10 +1889,9 @@ void ARMFrameLowering::adjustForSegmentedStacks( // first in the list. MachineBasicBlock *AddedBlocks[] = {PrevStackMBB, McrMBB, GetMBB, AllocMBB, PostStackMBB}; - const int NbAddedBlocks = sizeof(AddedBlocks) / sizeof(AddedBlocks[0]); - for (int Idx = 0; Idx < NbAddedBlocks; ++Idx) - BeforePrologueRegion.insert(AddedBlocks[Idx]); + for (MachineBasicBlock *B : AddedBlocks) + BeforePrologueRegion.insert(B); for (const auto &LI : PrologueMBB.liveins()) { for (MachineBasicBlock *PredBB : BeforePrologueRegion) @@ -1901,9 +1900,9 @@ void ARMFrameLowering::adjustForSegmentedStacks( // Remove the newly added blocks from the list, since we know // we do not have to do the following updates for them. 
- for (int Idx = 0; Idx < NbAddedBlocks; ++Idx) { - BeforePrologueRegion.erase(AddedBlocks[Idx]); - MF.insert(PrologueMBB.getIterator(), AddedBlocks[Idx]); + for (MachineBasicBlock *B : AddedBlocks) { + BeforePrologueRegion.erase(B); + MF.insert(PrologueMBB.getIterator(), B); } for (MachineBasicBlock *MBB : BeforePrologueRegion) { diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index 0cc41812d71c..33f74a3ba9fd 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -7346,7 +7346,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI, } } - BB->addSuccessor(DispatchBB); + BB->addSuccessor(DispatchBB, BranchProbability::getZero()); // Find the invoke call and mark all of the callee-saved registers as // 'implicit defined' so that they're spilled. This prevents code from @@ -11891,7 +11891,7 @@ bool ARMTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { TargetLowering::AtomicExpansionKind ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { unsigned Size = LI->getType()->getPrimitiveSizeInBits(); - return ((Size == 64) && !Subtarget->isMClass()) ? AtomicExpansionKind::LLSC + return ((Size == 64) && !Subtarget->isMClass()) ? AtomicExpansionKind::LLOnly : AtomicExpansionKind::None; } diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td index 238dc338d141..4c7107aee6a2 100644 --- a/lib/Target/ARM/ARMInstrInfo.td +++ b/lib/Target/ARM/ARMInstrInfo.td @@ -215,6 +215,8 @@ def PreV8 : Predicate<"!Subtarget->hasV8Ops()">, AssemblerPredicate<"!HasV8Ops", "armv7 or earlier">; def HasV8_1a : Predicate<"Subtarget->hasV8_1aOps()">, AssemblerPredicate<"HasV8_1aOps", "armv8.1a">; +def HasV8_2a : Predicate<"Subtarget->hasV8_2aOps()">, + AssemblerPredicate<"HasV8_2aOps", "armv8.2a">; def NoVFP : Predicate<"!Subtarget->hasVFP2()">; def HasVFP2 : Predicate<"Subtarget->hasVFP2()">, AssemblerPredicate<"FeatureVFP2", "VFP2">; @@ -234,7 +236,9 @@ def HasCrypto : Predicate<"Subtarget->hasCrypto()">, def HasCRC : Predicate<"Subtarget->hasCRC()">, AssemblerPredicate<"FeatureCRC", "crc">; def HasFP16 : Predicate<"Subtarget->hasFP16()">, - AssemblerPredicate<"FeatureFP16","half-float">; + AssemblerPredicate<"FeatureFP16","half-float conversions">; +def HasFullFP16 : Predicate<"Subtarget->hasFullFP16()">, + AssemblerPredicate<"FeatureFullFP16","full half-float">; def HasDivide : Predicate<"Subtarget->hasDivide()">, AssemblerPredicate<"FeatureHWDiv", "divide in THUMB">; def HasDivideInARM : Predicate<"Subtarget->hasDivideInARMMode()">, diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp index 9e3cd36d49ef..bb6ae28065bd 100644 --- a/lib/Target/ARM/ARMSubtarget.cpp +++ b/lib/Target/ARM/ARMSubtarget.cpp @@ -112,6 +112,7 @@ void ARMSubtarget::initializeEnvironment() { HasV7Ops = false; HasV8Ops = false; HasV8_1aOps = false; + HasV8_2aOps = false; HasVFPv2 = false; HasVFPv3 = false; HasVFPv4 = false; @@ -130,6 +131,7 @@ void ARMSubtarget::initializeEnvironment() { NoMovt = false; SupportsTailCall = false; HasFP16 = false; + HasFullFP16 = false; HasD16 = false; HasHardwareDivide = false; HasHardwareDivideInARM = false; diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h index c194149e8452..a8b28018f1b2 100644 --- a/lib/Target/ARM/ARMSubtarget.h +++ b/lib/Target/ARM/ARMSubtarget.h @@ -43,8 +43,8 @@ class ARMSubtarget : public ARMGenSubtargetInfo { protected: enum ARMProcFamilyEnum { Others, CortexA5, CortexA7, CortexA8, CortexA9, CortexA12, 
CortexA15, - CortexA17, CortexR4, CortexR4F, CortexR5, CortexR7, CortexA53, CortexA57, - CortexA72, Krait, Swift + CortexA17, CortexR4, CortexR4F, CortexR5, CortexR7, CortexA35, CortexA53, + CortexA57, CortexA72, Krait, Swift }; enum ARMProcClassEnum { None, AClass, RClass, MClass @@ -52,7 +52,7 @@ class ARMSubtarget : public ARMGenSubtargetInfo { enum ARMArchEnum { ARMv2, ARMv2a, ARMv3, ARMv3m, ARMv4, ARMv4t, ARMv5, ARMv5t, ARMv5te, ARMv5tej, ARMv6, ARMv6k, ARMv6kz, ARMv6t2, ARMv6m, ARMv6sm, ARMv7a, ARMv7r, - ARMv7m, ARMv7em, ARMv8a, ARMv81a + ARMv7m, ARMv7em, ARMv8a, ARMv81a, ARMv82a }; /// ARMProcFamily - ARM processor family: Cortex-A8, Cortex-A9, and others. @@ -77,6 +77,7 @@ class ARMSubtarget : public ARMGenSubtargetInfo { bool HasV7Ops; bool HasV8Ops; bool HasV8_1aOps; + bool HasV8_2aOps; /// HasVFPv2, HasVFPv3, HasVFPv4, HasFPARMv8, HasNEON - Specify what /// floating point ISAs are supported. @@ -130,10 +131,12 @@ class ARMSubtarget : public ARMGenSubtargetInfo { /// Thumb. bool SupportsTailCall; - /// HasFP16 - True if subtarget supports half-precision FP (We support VFP+HF - /// only so far) + /// HasFP16 - True if subtarget supports half-precision FP conversions bool HasFP16; + /// HasFullFP16 - True if subtarget supports half-precision FP operations + bool HasFullFP16; + /// HasD16 - True if subtarget is limited to 16 double precision /// FP registers for VFPv3. bool HasD16; @@ -309,6 +312,7 @@ class ARMSubtarget : public ARMGenSubtargetInfo { bool hasV7Ops() const { return HasV7Ops; } bool hasV8Ops() const { return HasV8Ops; } bool hasV8_1aOps() const { return HasV8_1aOps; } + bool hasV8_2aOps() const { return HasV8_2aOps; } bool isCortexA5() const { return ARMProcFamily == CortexA5; } bool isCortexA7() const { return ARMProcFamily == CortexA7; } @@ -362,6 +366,7 @@ class ARMSubtarget : public ARMGenSubtargetInfo { bool hasFP16() const { return HasFP16; } bool hasD16() const { return HasD16; } + bool hasFullFP16() const { return HasFullFP16; } const Triple &getTargetTriple() const { return TargetTriple; } diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index ba144458386d..73f330877566 100644 --- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -9916,7 +9916,7 @@ extern "C" void LLVMInitializeARMAsmParser() { // flags below, that were generated by table-gen. static const struct { const unsigned Kind; - const unsigned ArchCheck; + const uint64_t ArchCheck; const FeatureBitset Features; } Extensions[] = { { ARM::AEK_CRC, Feature_HasV8, {ARM::FeatureCRC} }, @@ -9930,6 +9930,7 @@ static const struct { { ARM::AEK_SEC, Feature_HasV6K, {ARM::FeatureTrustZone} }, // FIXME: Only available in A-class, isel not predicated { ARM::AEK_VIRT, Feature_HasV7, {ARM::FeatureVirtualization} }, + { ARM::AEK_FP16, Feature_HasV8_2a, {ARM::FeatureFPARMv8, ARM::FeatureFullFP16} }, // FIXME: Unsupported extensions. 
{ ARM::AEK_OS, Feature_None, {} }, { ARM::AEK_IWMMXT, Feature_None, {} }, diff --git a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp index 7a177f87231d..3e7da07b4aad 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp @@ -750,6 +750,7 @@ void ARMTargetELFStreamer::emitArchDefaultAttributes() { case ARM::AK_ARMV8A: case ARM::AK_ARMV8_1A: + case ARM::AK_ARMV8_2A: setAttributeItem(CPU_arch_profile, ApplicationProfile, false); setAttributeItem(ARM_ISA_use, Allowed, false); setAttributeItem(THUMB_ISA_use, AllowThumb32, false); diff --git a/lib/Target/ARM/Thumb1FrameLowering.cpp b/lib/Target/ARM/Thumb1FrameLowering.cpp index f5d4cb8a3ca1..fd96af6cb6e0 100644 --- a/lib/Target/ARM/Thumb1FrameLowering.cpp +++ b/lib/Target/ARM/Thumb1FrameLowering.cpp @@ -406,11 +406,15 @@ bool Thumb1FrameLowering::needPopSpecialFixUp(const MachineFunction &MF) const { if (AFI->getArgRegsSaveSize()) return true; - bool IsV4PopReturn = false; + // FIXME: this doesn't make sense, and the following patch will remove it. + if (!STI.hasV4TOps()) return false; + + // LR cannot be encoded with Thumb1, i.e., it requires a special fix-up. for (const CalleeSavedInfo &CSI : MF.getFrameInfo()->getCalleeSavedInfo()) if (CSI.getReg() == ARM::LR) - IsV4PopReturn = true; - return IsV4PopReturn && STI.hasV4TOps() && !STI.hasV5TOps(); + return true; + + return false; } bool Thumb1FrameLowering::emitPopSpecialFixUp(MachineBasicBlock &MBB, @@ -422,22 +426,42 @@ bool Thumb1FrameLowering::emitPopSpecialFixUp(MachineBasicBlock &MBB, const ThumbRegisterInfo *RegInfo = static_cast(STI.getRegisterInfo()); - // If MBBI is a return instruction, we may be able to directly restore + // If MBBI is a return instruction, or is a tPOP followed by a return + // instruction in the successor BB, we may be able to directly restore // LR in the PC. - // This is possible if we do not need to emit any SP update. + // This is only possible with v5T ops (v4T can't change the Thumb bit via + // a POP PC instruction), and only if we do not need to emit any SP update. // Otherwise, we need a temporary register to pop the value // and copy that value into LR. auto MBBI = MBB.getFirstTerminator(); - if (!ArgRegsSaveSize && MBBI != MBB.end() && - MBBI->getOpcode() == ARM::tBX_RET) { - if (!DoIt) + bool CanRestoreDirectly = STI.hasV5TOps() && !ArgRegsSaveSize; + if (CanRestoreDirectly) { + if (MBBI != MBB.end()) + CanRestoreDirectly = (MBBI->getOpcode() == ARM::tBX_RET || + MBBI->getOpcode() == ARM::tPOP_RET); + else { + assert(MBB.back().getOpcode() == ARM::tPOP); + assert(MBB.succ_size() == 1); + if ((*MBB.succ_begin())->begin()->getOpcode() == ARM::tBX_RET) + MBBI--; // Replace the final tPOP with a tPOP_RET. + else + CanRestoreDirectly = false; + } + } + + if (CanRestoreDirectly) { + if (!DoIt || MBBI->getOpcode() == ARM::tPOP_RET) return true; MachineInstrBuilder MIB = AddDefaultPred( - BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII.get(ARM::tPOP_RET))) - .addReg(ARM::PC, RegState::Define); - MIB.copyImplicitOps(&*MBBI); - // erase the old tBX_RET instruction + BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII.get(ARM::tPOP_RET))); + // Copy implicit ops and popped registers, if any. + for (auto MO: MBBI->operands()) + if (MO.isReg() && (MO.isImplicit() || MO.isDef()) && + MO.getReg() != ARM::LR) + MIB.addOperand(MO); + MIB.addReg(ARM::PC, RegState::Define); + // Erase the old instruction (tBX_RET or tPOP). 
MBB.erase(MBBI); return true; } @@ -459,10 +483,10 @@ bool Thumb1FrameLowering::emitPopSpecialFixUp(MachineBasicBlock &MBB, if (MBBI != MBB.end()) { dl = MBBI->getDebugLoc(); auto InstUpToMBBI = MBB.end(); - // The post-decrement is on purpose here. - // We want to have the liveness right before MBBI. - while (InstUpToMBBI-- != MBBI) - UsedRegs.stepBackward(*InstUpToMBBI); + while (InstUpToMBBI != MBBI) + // The pre-decrement is on purpose here. + // We want to have the liveness right before MBBI. + UsedRegs.stepBackward(*--InstUpToMBBI); } // Look for a register that can be directly use in the POP. @@ -508,6 +532,12 @@ bool Thumb1FrameLowering::emitPopSpecialFixUp(MachineBasicBlock &MBB, .addReg(PopReg, RegState::Kill)); } + if (MBBI == MBB.end()) { + MachineInstr& Pop = MBB.back(); + assert(Pop.getOpcode() == ARM::tPOP); + Pop.RemoveOperand(Pop.findRegisterDefOperandIdx(ARM::LR)); + } + assert(PopReg && "Do not know how to get LR"); AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tPOP))) .addReg(PopReg, RegState::Define); diff --git a/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp b/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp index b73ec0bfb191..5e78762b994a 100644 --- a/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp +++ b/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp @@ -38,6 +38,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ELF.h" +#include "llvm/Support/Format.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/TargetRegistry.h" @@ -1511,14 +1512,14 @@ unsigned HexagonAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp, } void HexagonAsmParser::OutOfRange(SMLoc IDLoc, long long Val, long long Max) { - std::stringstream errStr; - errStr << "value " << Val << "(0x" << std::hex << Val << std::dec - << ") out of range: "; + std::string errStr; + raw_string_ostream ES(errStr); + ES << "value " << Val << "(" << format_hex(Val, 0) << ") out of range: "; if (Max >= 0) - errStr << "0-" << Max; + ES << "0-" << Max; else - errStr << Max << "-" << (-Max - 1); - Error(IDLoc, errStr.str().c_str()); + ES << Max << "-" << (-Max - 1); + Error(IDLoc, ES.str().c_str()); } int HexagonAsmParser::processInstruction(MCInst &Inst, diff --git a/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp b/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp index 4036650bf74b..1db59e1dd99d 100644 --- a/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp +++ b/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp @@ -471,12 +471,13 @@ extern const MCInstrDesc HexagonInsts[]; } static DecodeStatus DecodeRegisterClass(MCInst &Inst, unsigned RegNo, - const uint16_t Table[], size_t Size) { - if (RegNo < Size) { + ArrayRef Table) { + if (RegNo < Table.size()) { Inst.addOperand(MCOperand::createReg(Table[RegNo])); return MCDisassembler::Success; - } else - return MCDisassembler::Fail; + } + + return MCDisassembler::Fail; } static DecodeStatus DecodeIntRegsLow8RegisterClass(MCInst &Inst, unsigned RegNo, @@ -497,8 +498,7 @@ static DecodeStatus DecodeIntRegsRegisterClass(MCInst &Inst, unsigned RegNo, Hexagon::R25, Hexagon::R26, Hexagon::R27, Hexagon::R28, Hexagon::R29, Hexagon::R30, Hexagon::R31}; - return (DecodeRegisterClass(Inst, RegNo, IntRegDecoderTable, - sizeof(IntRegDecoderTable))); + return (DecodeRegisterClass(Inst, RegNo, IntRegDecoderTable)); } static DecodeStatus DecodeVectorRegsRegisterClass(MCInst &Inst, unsigned RegNo, @@ -513,8 +513,7 @@ static DecodeStatus 
DecodeVectorRegsRegisterClass(MCInst &Inst, unsigned RegNo, Hexagon::V25, Hexagon::V26, Hexagon::V27, Hexagon::V28, Hexagon::V29, Hexagon::V30, Hexagon::V31}; - return (DecodeRegisterClass(Inst, RegNo, VecRegDecoderTable, - sizeof(VecRegDecoderTable))); + return (DecodeRegisterClass(Inst, RegNo, VecRegDecoderTable)); } static DecodeStatus DecodeDoubleRegsRegisterClass(MCInst &Inst, unsigned RegNo, @@ -526,8 +525,7 @@ static DecodeStatus DecodeDoubleRegsRegisterClass(MCInst &Inst, unsigned RegNo, Hexagon::D8, Hexagon::D9, Hexagon::D10, Hexagon::D11, Hexagon::D12, Hexagon::D13, Hexagon::D14, Hexagon::D15}; - return (DecodeRegisterClass(Inst, RegNo >> 1, DoubleRegDecoderTable, - sizeof(DoubleRegDecoderTable))); + return (DecodeRegisterClass(Inst, RegNo >> 1, DoubleRegDecoderTable)); } static DecodeStatus DecodeVecDblRegsRegisterClass(MCInst &Inst, unsigned RegNo, @@ -539,8 +537,7 @@ static DecodeStatus DecodeVecDblRegsRegisterClass(MCInst &Inst, unsigned RegNo, Hexagon::W8, Hexagon::W9, Hexagon::W10, Hexagon::W11, Hexagon::W12, Hexagon::W13, Hexagon::W14, Hexagon::W15}; - return (DecodeRegisterClass(Inst, RegNo >> 1, VecDblRegDecoderTable, - sizeof(VecDblRegDecoderTable))); + return (DecodeRegisterClass(Inst, RegNo >> 1, VecDblRegDecoderTable)); } static DecodeStatus DecodePredRegsRegisterClass(MCInst &Inst, unsigned RegNo, @@ -549,8 +546,7 @@ static DecodeStatus DecodePredRegsRegisterClass(MCInst &Inst, unsigned RegNo, static const uint16_t PredRegDecoderTable[] = {Hexagon::P0, Hexagon::P1, Hexagon::P2, Hexagon::P3}; - return (DecodeRegisterClass(Inst, RegNo, PredRegDecoderTable, - sizeof(PredRegDecoderTable))); + return (DecodeRegisterClass(Inst, RegNo, PredRegDecoderTable)); } static DecodeStatus DecodeVecPredRegsRegisterClass(MCInst &Inst, unsigned RegNo, @@ -559,8 +555,7 @@ static DecodeStatus DecodeVecPredRegsRegisterClass(MCInst &Inst, unsigned RegNo, static const uint16_t VecPredRegDecoderTable[] = {Hexagon::Q0, Hexagon::Q1, Hexagon::Q2, Hexagon::Q3}; - return (DecodeRegisterClass(Inst, RegNo, VecPredRegDecoderTable, - sizeof(VecPredRegDecoderTable))); + return (DecodeRegisterClass(Inst, RegNo, VecPredRegDecoderTable)); } static DecodeStatus DecodeCtrRegsRegisterClass(MCInst &Inst, unsigned RegNo, @@ -573,7 +568,7 @@ static DecodeStatus DecodeCtrRegsRegisterClass(MCInst &Inst, unsigned RegNo, Hexagon::CS0, Hexagon::CS1, Hexagon::UPCL, Hexagon::UPC }; - if (RegNo >= sizeof(CtrlRegDecoderTable) / sizeof(CtrlRegDecoderTable[0])) + if (RegNo >= array_lengthof(CtrlRegDecoderTable)) return MCDisassembler::Fail; if (CtrlRegDecoderTable[RegNo] == Hexagon::NoRegister) @@ -597,7 +592,7 @@ static DecodeStatus DecodeCtrRegs64RegisterClass(MCInst &Inst, unsigned RegNo, Hexagon::UPC, Hexagon::NoRegister }; - if (RegNo >= sizeof(CtrlReg64DecoderTable) / sizeof(CtrlReg64DecoderTable[0])) + if (RegNo >= array_lengthof(CtrlReg64DecoderTable)) return MCDisassembler::Fail; if (CtrlReg64DecoderTable[RegNo] == Hexagon::NoRegister) @@ -784,7 +779,7 @@ static DecodeStatus brtargetDecoder(MCInst &MI, unsigned tmp, uint64_t Address, // Please note that the instructions must be ordered in the descending order // of their opcode. 
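The Hexagon disassembler hunks above replace the (table, size) parameter pair of DecodeRegisterClass with a single ArrayRef. Besides shortening every call site, this tightens the bound check: the old callers passed sizeof(Table), a byte count, so the RegNo < Size test accepted indices up to twice the number of uint16_t entries (presumably harmless only because the bit fields feeding RegNo are narrow). The same cleanup swaps the hand-rolled sizeof(A)/sizeof(A[0]) idiom for array_lengthof further down. A minimal sketch of the new pattern, with made-up names and a stand-in table rather than the real Hexagon code:

    #include "llvm/ADT/ArrayRef.h"
    #include <cstdint>

    // Bound-checked table lookup; the real decoder adds the register to an
    // MCInst and returns MCDisassembler::Success / Fail instead of a bool.
    static bool decodeFromTable(unsigned RegNo, llvm::ArrayRef<uint16_t> Table,
                                uint16_t &Reg) {
      if (RegNo >= Table.size())   // element count, not a sizeof() byte count
        return false;
      Reg = Table[RegNo];
      return true;
    }

    static const uint16_t SomeRegTable[] = {1, 2, 3, 4}; // stand-in values
    // A C array converts to ArrayRef implicitly, so a caller writes:
    //   uint16_t Reg;
    //   bool OK = decodeFromTable(RegNo, SomeRegTable, Reg);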
// HexagonII::INST_ICLASS_ST -static unsigned int StoreConditionalOpcodeData[][2] = { +static const unsigned int StoreConditionalOpcodeData[][2] = { {S4_pstorerdfnew_abs, 0xafc02084}, {S4_pstorerdtnew_abs, 0xafc02080}, {S4_pstorerdf_abs, 0xafc00084}, @@ -830,18 +825,16 @@ static unsigned int LoadStoreOpcodeData[][2] = {{L4_loadrd_abs, 0x49c00000}, {S2_storerfabs, 0x48600000}, {S2_storerhabs, 0x48400000}, {S2_storerbabs, 0x48000000}}; -static int NumCondS = - sizeof(StoreConditionalOpcodeData) / sizeof(StoreConditionalOpcodeData[0]); -static int NumLS = sizeof(LoadStoreOpcodeData) / sizeof(LoadStoreOpcodeData[0]); +static const size_t NumCondS = array_lengthof(StoreConditionalOpcodeData); +static const size_t NumLS = array_lengthof(LoadStoreOpcodeData); static DecodeStatus decodeSpecial(MCInst &MI, uint32_t insn) { unsigned MachineOpcode = 0; unsigned LLVMOpcode = 0; - int i; if ((insn & HexagonII::INST_ICLASS_MASK) == HexagonII::INST_ICLASS_ST) { - for (i = 0; i < NumCondS; ++i) { + for (size_t i = 0; i < NumCondS; ++i) { if ((insn & StoreConditionalOpcodeData[i][1]) == StoreConditionalOpcodeData[i][1]) { MachineOpcode = StoreConditionalOpcodeData[i][1]; @@ -851,7 +844,7 @@ static DecodeStatus decodeSpecial(MCInst &MI, uint32_t insn) { } } if ((insn & HexagonII::INST_ICLASS_MASK) == HexagonII::INST_ICLASS_LD_ST_2) { - for (i = 0; i < NumLS; ++i) { + for (size_t i = 0; i < NumLS; ++i) { if ((insn & LoadStoreOpcodeData[i][1]) == LoadStoreOpcodeData[i][1]) { MachineOpcode = LoadStoreOpcodeData[i][1]; LLVMOpcode = LoadStoreOpcodeData[i][0]; diff --git a/lib/Target/Hexagon/Hexagon.h b/lib/Target/Hexagon/Hexagon.h index d360be2aa5b2..ed7d9578902e 100644 --- a/lib/Target/Hexagon/Hexagon.h +++ b/lib/Target/Hexagon/Hexagon.h @@ -47,15 +47,8 @@ #include "llvm/Target/TargetMachine.h" namespace llvm { - class MachineInstr; - class MCInst; - class MCInstrInfo; - class HexagonAsmPrinter; class HexagonTargetMachine; - void HexagonLowerToMC(const MachineInstr *MI, MCInst &MCI, - HexagonAsmPrinter &AP); - /// \brief Creates a Hexagon-specific Target Transformation Info pass. ImmutablePass *createHexagonTargetTransformInfoPass(const HexagonTargetMachine *TM); } // end namespace llvm; diff --git a/lib/Target/Hexagon/HexagonAsmPrinter.cpp b/lib/Target/Hexagon/HexagonAsmPrinter.cpp index 530dab859d5f..19769258ee89 100644 --- a/lib/Target/Hexagon/HexagonAsmPrinter.cpp +++ b/lib/Target/Hexagon/HexagonAsmPrinter.cpp @@ -56,6 +56,11 @@ using namespace llvm; +namespace llvm { + void HexagonLowerToMC(const MCInstrInfo &MCII, const MachineInstr *MI, + MCInst &MCB, HexagonAsmPrinter &AP); +} + #define DEBUG_TYPE "asm-printer" static cl::opt AlignCalls( @@ -179,6 +184,7 @@ bool HexagonAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, /// void HexagonAsmPrinter::EmitInstruction(const MachineInstr *MI) { MCInst MCB = HexagonMCInstrInfo::createBundle(); + const MCInstrInfo &MCII = *Subtarget->getInstrInfo(); if (MI->isBundle()) { const MachineBasicBlock* MBB = MI->getParent(); @@ -190,25 +196,23 @@ void HexagonAsmPrinter::EmitInstruction(const MachineInstr *MI) { MII->getOpcode() == TargetOpcode::IMPLICIT_DEF) ++IgnoreCount; else { - HexagonLowerToMC(&*MII, MCB, *this); + HexagonLowerToMC(MCII, &*MII, MCB, *this); } } } else { - HexagonLowerToMC(MI, MCB, *this); + HexagonLowerToMC(MCII, MI, MCB, *this); HexagonMCInstrInfo::padEndloop(OutStreamer->getContext(), MCB); } // Examine the packet and try to find instructions that can be converted // to compounds. 
- HexagonMCInstrInfo::tryCompound(*Subtarget->getInstrInfo(), - OutStreamer->getContext(), MCB); + HexagonMCInstrInfo::tryCompound(MCII, OutStreamer->getContext(), MCB); // Examine the packet and convert pairs of instructions to duplex // instructions when possible. SmallVector possibleDuplexes; - possibleDuplexes = HexagonMCInstrInfo::getDuplexPossibilties( - *Subtarget->getInstrInfo(), MCB); - HexagonMCShuffle(*Subtarget->getInstrInfo(), *Subtarget, - OutStreamer->getContext(), MCB, possibleDuplexes); + possibleDuplexes = HexagonMCInstrInfo::getDuplexPossibilties(MCII, MCB); + HexagonMCShuffle(MCII, *Subtarget, OutStreamer->getContext(), MCB, + possibleDuplexes); EmitToStreamer(*OutStreamer, MCB); } diff --git a/lib/Target/Hexagon/HexagonCFGOptimizer.cpp b/lib/Target/Hexagon/HexagonCFGOptimizer.cpp index 96bb61750805..efafdd007289 100644 --- a/lib/Target/Hexagon/HexagonCFGOptimizer.cpp +++ b/lib/Target/Hexagon/HexagonCFGOptimizer.cpp @@ -186,13 +186,11 @@ bool HexagonCFGOptimizer::runOnMachineFunction(MachineFunction &Fn) { if (case1 || case2) { InvertAndChangeJumpTarget(MI, UncondTarget); - MBB->removeSuccessor(JumpAroundTarget); - MBB->addSuccessor(UncondTarget); + MBB->replaceSuccessor(JumpAroundTarget, UncondTarget); // Remove the unconditional branch in LayoutSucc. LayoutSucc->erase(LayoutSucc->begin()); - LayoutSucc->removeSuccessor(UncondTarget); - LayoutSucc->addSuccessor(JumpAroundTarget); + LayoutSucc->replaceSuccessor(UncondTarget, JumpAroundTarget); // This code performs the conversion for case 2, which moves // the block to the fall-thru case (BB3 in the code above). diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp index 969edf6d5572..04f5b6649293 100644 --- a/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -2863,7 +2863,7 @@ TargetLowering::AtomicExpansionKind HexagonTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { // Do not expand loads and stores that don't exceed 64 bits. return LI->getType()->getPrimitiveSizeInBits() > 64 - ? AtomicExpansionKind::LLSC + ? AtomicExpansionKind::LLOnly : AtomicExpansionKind::None; } diff --git a/lib/Target/Hexagon/HexagonInstrInfoV4.td b/lib/Target/Hexagon/HexagonInstrInfoV4.td index 65612c590bfe..7389a40f4a4c 100644 --- a/lib/Target/Hexagon/HexagonInstrInfoV4.td +++ b/lib/Target/Hexagon/HexagonInstrInfoV4.td @@ -3864,26 +3864,6 @@ let AddedComplexity = 100 in { def: Stoream_pat; } -// Map from Pd = load(globaladdress) -> Rd = memb(globaladdress), Pd = Rd -let AddedComplexity = 100 in -def : Pat <(i1 (load (HexagonCONST32_GP tglobaladdr:$global))), - (i1 (C2_tfrrp (i32 (L2_loadrbgp tglobaladdr:$global))))>; - -// Transfer global address into a register -let isExtended = 1, opExtendable = 1, AddedComplexity=50, isMoveImm = 1, -isAsCheapAsAMove = 1, isReMaterializable = 1, isCodeGenOnly = 1 in -def TFRI_V4 : ALU32_ri<(outs IntRegs:$dst), (ins s16Ext:$src1), - "$dst = #$src1", - [(set IntRegs:$dst, (HexagonCONST32 tglobaladdr:$src1))]>; - -// Transfer a block address into a register -def : Pat<(HexagonCONST32_GP tblockaddress:$src1), - (TFRI_V4 tblockaddress:$src1)>; - -let AddedComplexity = 50 in -def : Pat<(HexagonCONST32_GP tglobaladdr:$src1), - (TFRI_V4 tglobaladdr:$src1)>; - // i8/i16/i32 -> i64 loads // We need a complexity of 120 here to override preceding handling of // zextload. 
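The HexagonCFGOptimizer hunk above folds the removeSuccessor/addSuccessor pair into a single MachineBasicBlock::replaceSuccessor call; the MipsLongBranch hunk further down makes the same substitution. Besides being shorter, replaceSuccessor keeps the successor's slot in the list and, as far as I can tell, carries the edge weight over to the new target, which the two-call sequence would not. A hedged sketch of the idiom (the helper name is mine):

    #include "llvm/CodeGen/MachineBasicBlock.h"
    using namespace llvm;

    // Redirect the CFG edge MBB -> Old so that it points at New instead.
    static void redirectEdge(MachineBasicBlock *MBB, MachineBasicBlock *Old,
                             MachineBasicBlock *New) {
      // Old idiom, loses the edge weight:
      //   MBB->removeSuccessor(Old);
      //   MBB->addSuccessor(New);
      // New idiom, one call:
      MBB->replaceSuccessor(Old, New);
    }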
diff --git a/lib/Target/Hexagon/HexagonMCInstLower.cpp b/lib/Target/Hexagon/HexagonMCInstLower.cpp index 82a9b23149c4..86d9e19c0547 100644 --- a/lib/Target/Hexagon/HexagonMCInstLower.cpp +++ b/lib/Target/Hexagon/HexagonMCInstLower.cpp @@ -26,39 +26,71 @@ using namespace llvm; -static MCOperand GetSymbolRef(const MachineOperand& MO, const MCSymbol* Symbol, - HexagonAsmPrinter& Printer) { +namespace llvm { + void HexagonLowerToMC(const MCInstrInfo &MCII, const MachineInstr *MI, + MCInst &MCB, HexagonAsmPrinter &AP); +} + +static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol, + HexagonAsmPrinter &Printer) { MCContext &MC = Printer.OutContext; const MCExpr *ME; - ME = MCSymbolRefExpr::create(Symbol, MCSymbolRefExpr::VK_None, MC); + // Populate the relocation type based on Hexagon target flags + // set on an operand + MCSymbolRefExpr::VariantKind RelocationType; + switch (MO.getTargetFlags()) { + default: + RelocationType = MCSymbolRefExpr::VK_None; + break; + case HexagonII::MO_PCREL: + RelocationType = MCSymbolRefExpr::VK_Hexagon_PCREL; + break; + case HexagonII::MO_GOT: + RelocationType = MCSymbolRefExpr::VK_GOT; + break; + case HexagonII::MO_LO16: + RelocationType = MCSymbolRefExpr::VK_Hexagon_LO16; + break; + case HexagonII::MO_HI16: + RelocationType = MCSymbolRefExpr::VK_Hexagon_HI16; + break; + case HexagonII::MO_GPREL: + RelocationType = MCSymbolRefExpr::VK_Hexagon_GPREL; + break; + } + + ME = MCSymbolRefExpr::create(Symbol, RelocationType, MC); if (!MO.isJTI() && MO.getOffset()) ME = MCBinaryExpr::createAdd(ME, MCConstantExpr::create(MO.getOffset(), MC), MC); - return (MCOperand::createExpr(ME)); + return MCOperand::createExpr(ME); } // Create an MCInst from a MachineInstr -void llvm::HexagonLowerToMC(MachineInstr const* MI, MCInst& MCB, - HexagonAsmPrinter& AP) { - if(MI->getOpcode() == Hexagon::ENDLOOP0){ +void llvm::HexagonLowerToMC(const MCInstrInfo &MCII, const MachineInstr *MI, + MCInst &MCB, HexagonAsmPrinter &AP) { + if (MI->getOpcode() == Hexagon::ENDLOOP0) { HexagonMCInstrInfo::setInnerLoop(MCB); return; } - if(MI->getOpcode() == Hexagon::ENDLOOP1){ + if (MI->getOpcode() == Hexagon::ENDLOOP1) { HexagonMCInstrInfo::setOuterLoop(MCB); return; } - MCInst* MCI = new (AP.OutContext) MCInst; + MCInst *MCI = new (AP.OutContext) MCInst; MCI->setOpcode(MI->getOpcode()); assert(MCI->getOpcode() == static_cast(MI->getOpcode()) && "MCI opcode should have been set on construction"); + bool MustExtend = false; for (unsigned i = 0, e = MI->getNumOperands(); i < e; i++) { const MachineOperand &MO = MI->getOperand(i); MCOperand MCO; + if (MO.getTargetFlags() & HexagonII::HMOTF_ConstExtended) + MustExtend = true; switch (MO.getType()) { default: @@ -107,5 +139,7 @@ void llvm::HexagonLowerToMC(MachineInstr const* MI, MCInst& MCB, MCI->addOperand(MCO); } + HexagonMCInstrInfo::extendIfNeeded(AP.OutContext, MCII, MCB, *MCI, + MustExtend); MCB.addOperand(MCOperand::createInst(MCI)); } diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp index 2f3521bfd717..b73af8249cb5 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp @@ -13,12 +13,12 @@ #include "MCTargetDesc/HexagonBaseInfo.h" #include "MCTargetDesc/HexagonMCInstrInfo.h" #include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCAsmLayout.h" #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCELFObjectWriter.h" #include "llvm/MC/MCFixupKindInfo.h" 
#include "llvm/MC/MCInstrInfo.h" -#include "llvm/MC/MCAsmLayout.h" #include "llvm/Support/Debug.h" #include "llvm/Support/TargetRegistry.h" diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp index 34817cd98f2f..e6194f61a6ba 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp @@ -81,8 +81,7 @@ static const std::pair opcodeData[] = { std::make_pair((unsigned)V4_SS2_storewi1, 4352)}; static std::map - subinstOpcodeMap(opcodeData, - opcodeData + sizeof(opcodeData) / sizeof(opcodeData[0])); + subinstOpcodeMap(std::begin(opcodeData), std::end(opcodeData)); bool HexagonMCInstrInfo::isDuplexPairMatch(unsigned Ga, unsigned Gb) { switch (Ga) { diff --git a/lib/Target/Mips/Disassembler/MipsDisassembler.cpp b/lib/Target/Mips/Disassembler/MipsDisassembler.cpp index f9601839b44c..716a96e2c46b 100644 --- a/lib/Target/Mips/Disassembler/MipsDisassembler.cpp +++ b/lib/Target/Mips/Disassembler/MipsDisassembler.cpp @@ -229,6 +229,13 @@ static DecodeStatus DecodeBranchTargetMM(MCInst &Inst, uint64_t Address, const void *Decoder); +// DecodeBranchTarget26MM - Decode microMIPS branch offset, which is +// shifted left by 1 bit. +static DecodeStatus DecodeBranchTarget26MM(MCInst &Inst, + unsigned Offset, + uint64_t Address, + const void *Decoder); + // DecodeJumpTargetMM - Decode microMIPS jump target, which is // shifted left by 1 bit. static DecodeStatus DecodeJumpTargetMM(MCInst &Inst, @@ -1863,6 +1870,16 @@ static DecodeStatus DecodeBranchTargetMM(MCInst &Inst, return MCDisassembler::Success; } +static DecodeStatus DecodeBranchTarget26MM(MCInst &Inst, + unsigned Offset, + uint64_t Address, + const void *Decoder) { + int32_t BranchOffset = SignExtend32<26>(Offset) << 1; + + Inst.addOperand(MCOperand::createImm(BranchOffset)); + return MCDisassembler::Success; +} + static DecodeStatus DecodeJumpTargetMM(MCInst &Inst, unsigned Insn, uint64_t Address, diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp index 86a5d5882184..ed917a4daba3 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp @@ -350,6 +350,23 @@ getBranchTarget26OpValue(const MCInst &MI, unsigned OpNo, return 0; } +/// getBranchTarget26OpValueMM - Return binary encoding of the branch +/// target operand. If the machine operand requires relocation, +/// record the relocation and return zero. +unsigned MipsMCCodeEmitter::getBranchTarget26OpValueMM( + const MCInst &MI, unsigned OpNo, SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + + const MCOperand &MO = MI.getOperand(OpNo); + + // If the destination is an immediate, divide by 2. + if (MO.isImm()) + return MO.getImm() >> 1; + + // TODO: Push 26 PC fixup. + return 0; +} + /// getJumpOffset16OpValue - Return binary encoding of the jump /// target operand. If the machine operand requires relocation, /// record the relocation and return zero. 
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h index c2f4b6a72bbf..eb48914b0649 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h +++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h @@ -137,6 +137,13 @@ class MipsMCCodeEmitter : public MCCodeEmitter { SmallVectorImpl &Fixups, const MCSubtargetInfo &STI) const; + // getBranchTarget26OpValueMM - Return binary encoding of the branch + // offset operand. If the machine operand requires relocation, + // record the relocation and return zero. + unsigned getBranchTarget26OpValueMM(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + // getJumpOffset16OpValue - Return binary encoding of the jump // offset operand. If the machine operand requires relocation, // record the relocation and return zero. diff --git a/lib/Target/Mips/MicroMips32r6InstrFormats.td b/lib/Target/Mips/MicroMips32r6InstrFormats.td index c4cdb0c2fadd..349b3b88a07a 100644 --- a/lib/Target/Mips/MicroMips32r6InstrFormats.td +++ b/lib/Target/Mips/MicroMips32r6InstrFormats.td @@ -796,3 +796,65 @@ class POOL32A_WRPGPR_WSBH_FM_MMR6 funct> : MipsR6Inst { let Inst{15-6} = funct; let Inst{5-0} = 0x3c; } + +class POOL32F_RECIP_ROUND_FM_MMR6 fmt, bits<8> funct> + : MMR6Arch, MipsR6Inst { + bits<5> ft; + bits<5> fs; + + bits<32> Inst; + + let Inst{31-26} = 0b010101; + let Inst{25-21} = ft; + let Inst{20-16} = fs; + let Inst{15} = 0; + let Inst{14} = fmt; + let Inst{13-6} = funct; + let Inst{5-0} = 0b111011; +} + +class POOL32F_RINT_FM_MMR6 fmt> + : MMR6Arch, MipsR6Inst { + bits<5> fs; + bits<5> fd; + + bits<32> Inst; + + let Inst{31-26} = 0b010101; + let Inst{25-21} = fs; + let Inst{20-16} = fd; + let Inst{15-11} = 0; + let Inst{10-9} = fmt; + let Inst{8-0} = 0b000100000; +} + +class POOL32F_SEL_FM_MMR6 fmt, bits<9> funct> + : MMR6Arch, MipsR6Inst { + bits<5> ft; + bits<5> fs; + bits<5> fd; + + bits<32> Inst; + + let Inst{31-26} = 0b010101; + let Inst{25-21} = ft; + let Inst{20-16} = fs; + let Inst{15-11} = fd; + let Inst{10-9} = fmt; + let Inst{8-0} = funct; +} + +class POOL32F_CLASS_FM_MMR6 fmt, bits<9> funct> + : MMR6Arch, MipsR6Inst { + bits<5> fs; + bits<5> fd; + + bits<32> Inst; + + let Inst{31-26} = 0b010101; + let Inst{25-21} = fs; + let Inst{20-16} = fd; + let Inst{15-11} = 0b00000; + let Inst{10-9} = fmt; + let Inst{8-0} = funct; +} diff --git a/lib/Target/Mips/MicroMips32r6InstrInfo.td b/lib/Target/Mips/MicroMips32r6InstrInfo.td index cabaa53b2b1b..8c744d8924bb 100644 --- a/lib/Target/Mips/MicroMips32r6InstrInfo.td +++ b/lib/Target/Mips/MicroMips32r6InstrInfo.td @@ -11,6 +11,13 @@ // //===----------------------------------------------------------------------===// +def brtarget26_mm : Operand { + let EncoderMethod = "getBranchTarget26OpValueMM"; + let OperandType = "OPERAND_PCREL"; + let DecoderMethod = "DecodeBranchTarget26MM"; + let ParserMatchClass = MipsJumpTargetAsmOperand; +} + //===----------------------------------------------------------------------===// // // Instruction Encodings @@ -125,6 +132,26 @@ class LLE_MMR6_ENC : LOAD_WORD_EVA_FM_MMR6<0b110>; class LWE_MMR6_ENC : LOAD_WORD_EVA_FM_MMR6<0b111>; class LW_MMR6_ENC : LOAD_WORD_FM_MMR6; class LUI_MMR6_ENC : LOAD_UPPER_IMM_FM_MMR6; +class RECIP_S_MMR6_ENC : POOL32F_RECIP_ROUND_FM_MMR6<"recip.s", 0, 0b01001000>; +class RECIP_D_MMR6_ENC : POOL32F_RECIP_ROUND_FM_MMR6<"recip.d", 1, 0b01001000>; +class RINT_S_MMR6_ENC : POOL32F_RINT_FM_MMR6<"rint.s", 0>; +class RINT_D_MMR6_ENC : 
POOL32F_RINT_FM_MMR6<"rint.d", 1>; +class ROUND_L_S_MMR6_ENC : POOL32F_RECIP_ROUND_FM_MMR6<"round.l.s", 0, + 0b11001100>; +class ROUND_L_D_MMR6_ENC : POOL32F_RECIP_ROUND_FM_MMR6<"round.l.d", 1, + 0b11001100>; +class ROUND_W_S_MMR6_ENC : POOL32F_RECIP_ROUND_FM_MMR6<"round.w.s", 0, + 0b11101100>; +class ROUND_W_D_MMR6_ENC : POOL32F_RECIP_ROUND_FM_MMR6<"round.w.d", 1, + 0b11101100>; +class SEL_S_MMR6_ENC : POOL32F_SEL_FM_MMR6<"sel.s", 0, 0b010111000>; +class SEL_D_MMR6_ENC : POOL32F_SEL_FM_MMR6<"sel.d", 1, 0b010111000>; +class SELEQZ_S_MMR6_ENC : POOL32F_SEL_FM_MMR6<"seleqz.s", 0, 0b000111000>; +class SELEQZ_D_MMR6_ENC : POOL32F_SEL_FM_MMR6<"seleqz.d", 1, 0b000111000>; +class SELENZ_S_MMR6_ENC : POOL32F_SEL_FM_MMR6<"selenz.s", 0, 0b001111000>; +class SELENZ_D_MMR6_ENC : POOL32F_SEL_FM_MMR6<"selenz.d", 1, 0b001111000>; +class CLASS_S_MMR6_ENC : POOL32F_CLASS_FM_MMR6<"class.s", 0, 0b001100000>; +class CLASS_D_MMR6_ENC : POOL32F_CLASS_FM_MMR6<"class.d", 1, 0b001100000>; class ADDU16_MMR6_ENC : POOL16A_ADDU16_FM_MMR6; class AND16_MMR6_ENC : POOL16C_AND16_FM_MMR6; @@ -238,11 +265,11 @@ class BC_MMR6_DESC_BASE bit isBarrier = 1; } -class BALC_MMR6_DESC : BC_MMR6_DESC_BASE<"balc", brtarget26> { +class BALC_MMR6_DESC : BC_MMR6_DESC_BASE<"balc", brtarget26_mm> { bit isCall = 1; list Defs = [RA]; } -class BC_MMR6_DESC : BC_MMR6_DESC_BASE<"bc", brtarget26>; +class BC_MMR6_DESC : BC_MMR6_DESC_BASE<"bc", brtarget26_mm>; class BC16_MMR6_DESC : MicroMipsInst16<(outs), (ins brtarget10_mm:$offset), !strconcat("bc16", "\t$offset"), [], @@ -717,6 +744,33 @@ class RSQRT_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"rsqrt.s", FGR32Opnd, FGR32Opnd, II_TRUNC>; class RSQRT_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"rsqrt.d", FGR32Opnd, AFGR64Opnd, II_TRUNC>; +class RECIP_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"recip.s", FGR32Opnd, + FGR32Opnd, II_ROUND>; +class RECIP_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"recip.d", FGR32Opnd, FGR32Opnd, + II_ROUND>; +class ROUND_L_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"round.l.s", FGR64Opnd, + FGR32Opnd, II_ROUND>; +class ROUND_L_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"round.l.d", FGR64Opnd, + FGR64Opnd, II_ROUND>; +class ROUND_W_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"round.w.s", FGR32Opnd, + FGR32Opnd, II_ROUND>; +class ROUND_W_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"round.w.d", FGR64Opnd, + FGR64Opnd, II_ROUND>; + +class SEL_S_MMR6_DESC : COP1_SEL_DESC_BASE<"sel.s", FGR32Opnd>; +class SEL_D_MMR6_DESC : COP1_SEL_DESC_BASE<"sel.d", FGR64Opnd> { + // We must insert a SUBREG_TO_REG around $fd_in + bit usesCustomInserter = 1; +} + +class SELEQZ_S_MMR6_DESC : SELEQNEZ_DESC_BASE<"seleqz.s", FGR32Opnd>; +class SELEQZ_D_MMR6_DESC : SELEQNEZ_DESC_BASE<"seleqz.d", FGR64Opnd>; +class SELENZ_S_MMR6_DESC : SELEQNEZ_DESC_BASE<"selnez.s", FGR32Opnd>; +class SELENZ_D_MMR6_DESC : SELEQNEZ_DESC_BASE<"selnez.d", FGR64Opnd>; +class RINT_S_MMR6_DESC : CLASS_RINT_DESC_BASE<"rint.s", FGR32Opnd>; +class RINT_D_MMR6_DESC : CLASS_RINT_DESC_BASE<"rint.d", FGR64Opnd>; +class CLASS_S_MMR6_DESC : CLASS_RINT_DESC_BASE<"class.s", FGR32Opnd>; +class CLASS_D_MMR6_DESC : CLASS_RINT_DESC_BASE<"class.d", FGR64Opnd>; class STORE_MMR6_DESC_BASE : Store, MMR6Arch { @@ -1114,6 +1168,34 @@ def SUBU16_MMR6 : StdMMR6Rel, SUBU16_MMR6_DESC, SUBU16_MMR6_ENC, ISA_MICROMIPS32R6; def XOR16_MMR6 : StdMMR6Rel, XOR16_MMR6_DESC, XOR16_MMR6_ENC, ISA_MICROMIPS32R6; +def RECIP_S_MMR6 : StdMMR6Rel, RECIP_S_MMR6_ENC, RECIP_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def RECIP_D_MMR6 : StdMMR6Rel, RECIP_D_MMR6_ENC, RECIP_D_MMR6_DESC, ISA_MICROMIPS32R6; +def RINT_S_MMR6 : 
StdMMR6Rel, RINT_S_MMR6_ENC, RINT_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def RINT_D_MMR6 : StdMMR6Rel, RINT_D_MMR6_ENC, RINT_D_MMR6_DESC, ISA_MICROMIPS32R6; +def ROUND_L_S_MMR6 : StdMMR6Rel, ROUND_L_S_MMR6_ENC, ROUND_L_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def ROUND_L_D_MMR6 : StdMMR6Rel, ROUND_L_D_MMR6_ENC, ROUND_L_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def ROUND_W_S_MMR6 : StdMMR6Rel, ROUND_W_S_MMR6_ENC, ROUND_W_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def ROUND_W_D_MMR6 : StdMMR6Rel, ROUND_W_D_MMR6_ENC, ROUND_W_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def SEL_S_MMR6 : StdMMR6Rel, SEL_S_MMR6_ENC, SEL_S_MMR6_DESC, ISA_MICROMIPS32R6; +def SEL_D_MMR6 : StdMMR6Rel, SEL_D_MMR6_ENC, SEL_D_MMR6_DESC, ISA_MICROMIPS32R6; +def SELEQZ_S_MMR6 : StdMMR6Rel, SELEQZ_S_MMR6_ENC, SELEQZ_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def SELEQZ_D_MMR6 : StdMMR6Rel, SELEQZ_D_MMR6_ENC, SELEQZ_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def SELENZ_S_MMR6 : StdMMR6Rel, SELENZ_S_MMR6_ENC, SELENZ_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def SELENZ_D_MMR6 : StdMMR6Rel, SELENZ_D_MMR6_ENC, SELENZ_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def CLASS_S_MMR6 : StdMMR6Rel, CLASS_S_MMR6_ENC, CLASS_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def CLASS_D_MMR6 : StdMMR6Rel, CLASS_D_MMR6_ENC, CLASS_D_MMR6_DESC, + ISA_MICROMIPS32R6; } //===----------------------------------------------------------------------===// diff --git a/lib/Target/Mips/MicroMipsDSPInstrFormats.td b/lib/Target/Mips/MicroMipsDSPInstrFormats.td index 65c8303f25fe..f24f80282b5e 100644 --- a/lib/Target/Mips/MicroMipsDSPInstrFormats.td +++ b/lib/Target/Mips/MicroMipsDSPInstrFormats.td @@ -141,3 +141,74 @@ class POOL32A_1RIMM5AC_FMT funct> : MMDSPInst { let Inst{13-6} = funct; let Inst{5-0} = 0b111100; } + +class POOL32A_2RSA5_FMT op> : MMDSPInst { + bits<5> rt; + bits<5> rs; + bits<5> sa; + + let Inst{31-26} = 0b000000; + let Inst{25-21} = rt; + let Inst{20-16} = rs; + let Inst{15-11} = sa; + let Inst{10-0} = op; +} + +class POOL32A_1RMEMB0_FMT funct> : MMDSPInst { + bits<5> index; + bits<5> base; + bits<5> rd; + + let Inst{31-26} = 0; + let Inst{25-21} = index; + let Inst{20-16} = base; + let Inst{15-11} = rd; + let Inst{10} = 0b0; + let Inst{9-0} = funct; +} + +class POOL32A_1RAC_FMT funct> : MMDSPInst { + bits<5> rs; + bits<2> ac; + + let Inst{31-26} = 0; + let Inst{25-21} = 0; + let Inst{20-16} = rs; + let Inst{15-14} = ac; + let Inst{13-6} = funct; + let Inst{5-0} = 0b111100; +} + +class POOL32A_1RMASK7_FMT op> : MMDSPInst { + bits<5> rt; + bits<7> mask; + + let Inst{31-26} = 0b000000; + let Inst{25-21} = rt; + let Inst{20-14} = mask; + let Inst{13-6} = op; + let Inst{5-0} = 0b111100; +} + +class POOL32A_1RIMM10_FMT op> : MMDSPInst { + bits<5> rd; + bits<10> imm; + + let Inst{31-26} = 0; + let Inst{25-16} = imm; + let Inst{15-11} = rd; + let Inst{10} = 0; + let Inst{9-0} = op; +} + +class POOL32A_1RIMM8_FMT op> : MMDSPInst { + bits<5> rt; + bits<8> imm; + + let Inst{31-26} = 0; + let Inst{25-21} = rt; + let Inst{20-13} = imm; + let Inst{12} = 0; + let Inst{11-6} = op; + let Inst{5-0} = 0b111100; +} diff --git a/lib/Target/Mips/MicroMipsDSPInstrInfo.td b/lib/Target/Mips/MicroMipsDSPInstrInfo.td index b2e5ec61c8b4..9b4fb6853180 100644 --- a/lib/Target/Mips/MicroMipsDSPInstrInfo.td +++ b/lib/Target/Mips/MicroMipsDSPInstrInfo.td @@ -120,6 +120,35 @@ class MULQ_RS_PH_MM_ENC : POOL32A_3RB0_FMT<"mulq_rs.ph", 0b0100010101>; class MULQ_RS_W_MMR2_ENC : POOL32A_3RB0_FMT<"mulq_rs.w", 0b0110010101>; class MULQ_S_PH_MMR2_ENC : POOL32A_3RB0_FMT<"mulq_s.ph", 0b0101010101>; class MULQ_S_W_MMR2_ENC : POOL32A_3RB0_FMT<"mulq_s.w", 
0b0111010101>; +class PRECR_QB_PH_MMR2_ENC : POOL32A_3RB0_FMT<"precr.qb.ph", 0b0001101101>; +class PRECR_SRA_PH_W_MMR2_ENC + : POOL32A_2RSA5_FMT<"precr_sra.ph.w", 0b01111001101>; +class PRECR_SRA_R_PH_W_MMR2_ENC + : POOL32A_2RSA5_FMT<"precr_sra_r.ph.w", 0b11111001101>; +class PRECRQ_PH_W_MM_ENC : POOL32A_3RB0_FMT<"precrq.ph.w", 0b0011101101>; +class PRECRQ_QB_PH_MM_ENC : POOL32A_3RB0_FMT<"precrq.qb.ph", 0b0010101101>; +class PRECRQU_S_QB_PH_MM_ENC + : POOL32A_3RB0_FMT<"precrqu_s.qb.ph", 0b0101101101>; +class PRECRQ_RS_PH_W_MM_ENC : POOL32A_3RB0_FMT<"precrq_rs.ph.w", 0b0100101101>; +class LBUX_MM_ENC : POOL32A_1RMEMB0_FMT<"lbux", 0b1000100101>; +class LHX_MM_ENC : POOL32A_1RMEMB0_FMT<"lhx", 0b0101100101>; +class LWX_MM_ENC : POOL32A_1RMEMB0_FMT<"lwx", 0b0110100101>; +class MAQ_S_W_PHL_MM_ENC : POOL32A_2RAC_FMT<"maq_s.w.phl", 0b01101001>; +class MAQ_SA_W_PHL_MM_ENC : POOL32A_2RAC_FMT<"maq_sa.w.phl", 0b11101001>; +class MAQ_S_W_PHR_MM_ENC : POOL32A_2RAC_FMT<"maq_s.w.phr", 0b00101001>; +class MAQ_SA_W_PHR_MM_ENC : POOL32A_2RAC_FMT<"maq_sa.w.phr", 0b10101001>; +class MFHI_MM_ENC : POOL32A_1RAC_FMT<"mfhi", 0b00000001>; +class MFLO_MM_ENC : POOL32A_1RAC_FMT<"mflo", 0b01000001>; +class MTHI_MM_ENC : POOL32A_1RAC_FMT<"mthi", 0b10000001>; +class MTLO_MM_ENC : POOL32A_1RAC_FMT<"mthi", 0b11000001>; +class PREPEND_MMR2_ENC : POOL32A_2RSA5B0_FMT<"prepend", 0b1001010101>; +class RADDU_W_QB_MM_ENC : POOL32A_2R_FMT<"raddu.w.qb", 0b1111000100>; +class RDDSP_MM_ENC : POOL32A_1RMASK7_FMT<"rddsp", 0b00011001>; +class REPL_PH_MM_ENC : POOL32A_1RIMM10_FMT<"repl.ph", 0b0000111101>; +class REPL_QB_MM_ENC : POOL32A_1RIMM8_FMT<"repl.qb", 0b010111>; +class REPLV_PH_MM_ENC : POOL32A_2R_FMT<"replv.ph", 0b0000001100>; +class REPLV_QB_MM_ENC : POOL32A_2R_FMT<"replv.qb", 0b0001001100>; +class MTHLIP_MM_ENC : POOL32A_1RAC_FMT<"mthlip", 0b00001001>; // Instruction desc. class ABSQ_S_PH_MM_R2_DESC_BASE, Uses<[DSPPos]>, Defs<[DSPEFI]>; @@ -278,7 +308,52 @@ class EXTRV_S_H_MM_DESC : EXT_MM_2R_DESC_BASE<"extrv_s.h", MipsEXTR_S_H, NoItinerary>, Defs<[DSPOutFlag23]>; -// Instruction defs. 
+class MFHI_MM_DESC_BASE { + dag OutOperandList = (outs GPR32Opnd:$rs); + dag InOperandList = (ins RO:$ac); + string AsmString = !strconcat(instr_asm, "\t$rs, $ac"); + list Pattern = [(set GPR32Opnd:$rs, (OpNode RO:$ac))]; + InstrItinClass Itinerary = itin; +} + +class MFHI_MM_DESC : MFHI_MM_DESC_BASE<"mfhi", ACC64DSPOpnd, MipsMFHI, + NoItinerary>; +class MFLO_MM_DESC : MFHI_MM_DESC_BASE<"mflo", ACC64DSPOpnd, MipsMFLO, + NoItinerary>; + +class RADDU_W_QB_MM_DESC { + dag OutOperandList = (outs GPR32Opnd:$rt); + dag InOperandList = (ins DSPROpnd:$rs); + string AsmString = !strconcat("raddu.w.qb", "\t$rt, $rs"); + list Pattern = [(set GPR32Opnd:$rt, (int_mips_raddu_w_qb DSPROpnd:$rs))]; + InstrItinClass Itinerary = NoItinerary; + string BaseOpcode = "raddu.w.qb"; +} + +class RDDSP_MM_DESC { + dag OutOperandList = (outs GPR32Opnd:$rt); + dag InOperandList = (ins uimm16:$mask); + string AsmString = !strconcat("rddsp", "\t$rt, $mask"); + list Pattern = [(set GPR32Opnd:$rt, (int_mips_rddsp immZExt10:$mask))]; + InstrItinClass Itinerary = NoItinerary; +} + +class REPL_QB_MM_DESC { + dag OutOperandList = (outs DSPROpnd:$rt); + dag InOperandList = (ins uimm16:$imm); + string AsmString = !strconcat("repl.qb", "\t$rt, $imm"); + list Pattern = [(set DSPROpnd:$rt, (int_mips_repl_qb immZExt8:$imm))]; + InstrItinClass Itinerary = NoItinerary; +} + +class REPLV_PH_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE<"replv.ph", int_mips_repl_ph, + NoItinerary, DSPROpnd, + GPR32Opnd>; +class REPLV_QB_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE<"replv.qb", int_mips_repl_qb, + NoItinerary, DSPROpnd, + GPR32Opnd>; + // microMIPS DSP Rev 1 def ADDQ_PH_MM : DspMMRel, ADDQ_PH_MM_ENC, ADDQ_PH_DESC; def ADDQ_S_PH_MM : DspMMRel, ADDQ_S_PH_MM_ENC, ADDQ_S_PH_DESC; @@ -354,6 +429,28 @@ def MULEQ_S_W_PHR_MM : DspMMRel, MULEQ_S_W_PHR_MM_ENC, MULEQ_S_W_PHR_DESC; def MULEU_S_PH_QBL_MM : DspMMRel, MULEU_S_PH_QBL_MM_ENC, MULEU_S_PH_QBL_DESC; def MULEU_S_PH_QBR_MM : DspMMRel, MULEU_S_PH_QBR_MM_ENC, MULEU_S_PH_QBR_DESC; def MULQ_RS_PH_MM : DspMMRel, MULQ_RS_PH_MM_ENC, MULQ_RS_PH_DESC; +def PRECRQ_PH_W_MM : DspMMRel, PRECRQ_PH_W_MM_ENC, PRECRQ_PH_W_DESC; +def PRECRQ_QB_PH_MM : DspMMRel, PRECRQ_QB_PH_MM_ENC, PRECRQ_QB_PH_DESC; +def PRECRQU_S_QB_PH_MM : DspMMRel, PRECRQU_S_QB_PH_MM_ENC, PRECRQU_S_QB_PH_DESC; +def PRECRQ_RS_PH_W_MM : DspMMRel, PRECRQ_RS_PH_W_MM_ENC, PRECRQ_RS_PH_W_DESC; +def LBUX_MM : DspMMRel, LBUX_MM_ENC, LBUX_DESC; +def LHX_MM : DspMMRel, LHX_MM_ENC, LHX_DESC; +def LWX_MM : DspMMRel, LWX_MM_ENC, LWX_DESC; +def MAQ_S_W_PHL_MM : DspMMRel, MAQ_S_W_PHL_MM_ENC, MAQ_S_W_PHL_DESC; +def MAQ_SA_W_PHL_MM : DspMMRel, MAQ_SA_W_PHL_MM_ENC, MAQ_SA_W_PHL_DESC; +def MAQ_S_W_PHR_MM : DspMMRel, MAQ_S_W_PHR_MM_ENC, MAQ_S_W_PHR_DESC; +def MAQ_SA_W_PHR_MM : DspMMRel, MAQ_SA_W_PHR_MM_ENC, MAQ_SA_W_PHR_DESC; +def MFHI_DSP_MM : DspMMRel, MFHI_MM_ENC, MFHI_MM_DESC; +def MFLO_DSP_MM : DspMMRel, MFLO_MM_ENC, MFLO_MM_DESC; +def MTHI_DSP_MM : DspMMRel, MTHI_MM_ENC, MTHI_DESC; +def MTLO_DSP_MM : DspMMRel, MTLO_MM_ENC, MTLO_DESC; +def RADDU_W_QB_MM : DspMMRel, RADDU_W_QB_MM_ENC, RADDU_W_QB_MM_DESC; +def RDDSP_MM : DspMMRel, RDDSP_MM_ENC, RDDSP_MM_DESC; +def REPL_PH_MM : DspMMRel, REPL_PH_MM_ENC, REPL_PH_DESC; +def REPL_QB_MM : DspMMRel, REPL_QB_MM_ENC, REPL_QB_MM_DESC; +def REPLV_PH_MM : DspMMRel, REPLV_PH_MM_ENC, REPLV_PH_MM_DESC; +def REPLV_QB_MM : DspMMRel, REPLV_QB_MM_ENC, REPLV_QB_MM_DESC; +def MTHLIP_MM : DspMMRel, MTHLIP_MM_ENC, MTHLIP_DESC; // microMIPS DSP Rev 2 def ABSQ_S_QB_MMR2 : DspMMRel, ABSQ_S_QB_MMR2_ENC, ABSQ_S_QB_MMR2_DESC, ISA_DSPR2; @@ -398,3 
+495,10 @@ def MUL_S_PH_MMR2 : DspMMRel, MUL_S_PH_MMR2_ENC, MUL_S_PH_DESC, ISA_DSPR2; def MULQ_RS_W_MMR2 : DspMMRel, MULQ_RS_W_MMR2_ENC, MULQ_RS_W_DESC, ISA_DSPR2; def MULQ_S_PH_MMR2 : DspMMRel, MULQ_S_PH_MMR2_ENC, MULQ_S_PH_DESC, ISA_DSPR2; def MULQ_S_W_MMR2 : DspMMRel, MULQ_S_W_MMR2_ENC, MULQ_S_W_DESC, ISA_DSPR2; +def PRECR_QB_PH_MMR2 : DspMMRel, PRECR_QB_PH_MMR2_ENC, PRECR_QB_PH_DESC, + ISA_DSPR2; +def PRECR_SRA_PH_W_MMR2 : DspMMRel, PRECR_SRA_PH_W_MMR2_ENC, + PRECR_SRA_PH_W_DESC, ISA_DSPR2; +def PRECR_SRA_R_PH_W_MMR2 : DspMMRel, PRECR_SRA_R_PH_W_MMR2_ENC, + PRECR_SRA_R_PH_W_DESC, ISA_DSPR2; +def PREPEND_MMR2 : DspMMRel, PREPEND_MMR2_ENC, PREPEND_DESC, ISA_DSPR2; diff --git a/lib/Target/Mips/MicroMipsInstrFPU.td b/lib/Target/Mips/MicroMipsInstrFPU.td index 120a841c3d9d..756e6c92c1d1 100644 --- a/lib/Target/Mips/MicroMipsInstrFPU.td +++ b/lib/Target/Mips/MicroMipsInstrFPU.td @@ -43,7 +43,7 @@ def BC1T_MM : MMRel, BC1F_FT<"bc1t", brtarget_mm, II_BC1T, MIPS_BRANCH_T>, BC1F_FM_MM<0x1d>, ISA_MIPS1_NOT_32R6_64R6; def CVT_W_S_MM : MMRel, ABSS_FT<"cvt.w.s", FGR32Opnd, FGR32Opnd, II_CVT>, ROUND_W_FM_MM<0, 0x24>; -def ROUND_W_S_MM : MMRel, ABSS_FT<"round.w.s", FGR32Opnd, FGR32Opnd, II_ROUND>, +def ROUND_W_S_MM : MMRel, StdMMR6Rel, ABSS_FT<"round.w.s", FGR32Opnd, FGR32Opnd, II_ROUND>, ROUND_W_FM_MM<0, 0xec>; def CEIL_W_MM : MMRel, ABSS_FT<"ceil.w.d", FGR32Opnd, AFGR64Opnd, II_CEIL>, @@ -52,7 +52,7 @@ def CVT_W_MM : MMRel, ABSS_FT<"cvt.w.d", FGR32Opnd, AFGR64Opnd, II_CVT>, ROUND_W_FM_MM<1, 0x24>; def FLOOR_W_MM : MMRel, ABSS_FT<"floor.w.d", FGR32Opnd, AFGR64Opnd, II_FLOOR>, ROUND_W_FM_MM<1, 0x2c>; -def ROUND_W_MM : MMRel, ABSS_FT<"round.w.d", FGR32Opnd, AFGR64Opnd, II_ROUND>, +def ROUND_W_MM : MMRel, StdMMR6Rel, ABSS_FT<"round.w.d", FGR32Opnd, AFGR64Opnd, II_ROUND>, ROUND_W_FM_MM<1, 0xec>; def TRUNC_W_MM : MMRel, ABSS_FT<"trunc.w.d", FGR32Opnd, AFGR64Opnd, II_TRUNC>, ROUND_W_FM_MM<1, 0xac>; diff --git a/lib/Target/Mips/Mips32r6InstrInfo.td b/lib/Target/Mips/Mips32r6InstrInfo.td index 9dd4d1e034e9..c36a45acbf79 100644 --- a/lib/Target/Mips/Mips32r6InstrInfo.td +++ b/lib/Target/Mips/Mips32r6InstrInfo.td @@ -687,8 +687,10 @@ def BNEZC : BNEZC_ENC, BNEZC_DESC, ISA_MIPS32R6; def BNVC : BNVC_ENC, BNVC_DESC, ISA_MIPS32R6; def BOVC : BOVC_ENC, BOVC_DESC, ISA_MIPS32R6; def CACHE_R6 : R6MMR6Rel, CACHE_ENC, CACHE_DESC, ISA_MIPS32R6; -def CLASS_D : CLASS_D_ENC, CLASS_D_DESC, ISA_MIPS32R6, HARDFLOAT; -def CLASS_S : CLASS_S_ENC, CLASS_S_DESC, ISA_MIPS32R6, HARDFLOAT; +let AdditionalPredicates = [NotInMicroMips] in { + def CLASS_D : CLASS_D_ENC, CLASS_D_DESC, ISA_MIPS32R6, HARDFLOAT; + def CLASS_S : CLASS_S_ENC, CLASS_S_DESC, ISA_MIPS32R6, HARDFLOAT; +} def CLO_R6 : R6MMR6Rel, CLO_R6_ENC, CLO_R6_DESC, ISA_MIPS32R6; def CLZ_R6 : R6MMR6Rel, CLZ_R6_ENC, CLZ_R6_DESC, ISA_MIPS32R6; defm S : CMP_CC_M; @@ -707,14 +709,14 @@ def LWUPC : LWUPC_ENC, LWUPC_DESC, ISA_MIPS32R6; let AdditionalPredicates = [NotInMicroMips] in { def MADDF_S : MADDF_S_ENC, MADDF_S_DESC, ISA_MIPS32R6, HARDFLOAT; def MADDF_D : MADDF_D_ENC, MADDF_D_DESC, ISA_MIPS32R6, HARDFLOAT; - def MAXA_D : MAXA_D_ENC, MAXA_D_DESC, ISA_MIPS32R6, HARDFLOAT; - def MAXA_S : MAXA_S_ENC, MAXA_S_DESC, ISA_MIPS32R6, HARDFLOAT; - def MAX_D : MAX_D_ENC, MAX_D_DESC, ISA_MIPS32R6, HARDFLOAT; - def MAX_S : MAX_S_ENC, MAX_S_DESC, ISA_MIPS32R6, HARDFLOAT; - def MINA_D : MINA_D_ENC, MINA_D_DESC, ISA_MIPS32R6, HARDFLOAT; - def MINA_S : MINA_S_ENC, MINA_S_DESC, ISA_MIPS32R6, HARDFLOAT; - def MIN_D : MIN_D_ENC, MIN_D_DESC, ISA_MIPS32R6, HARDFLOAT; - def MIN_S : MIN_S_ENC, 
MIN_S_DESC, ISA_MIPS32R6, HARDFLOAT; + def MAXA_D : MAXA_D_ENC, MAXA_D_DESC, ISA_MIPS32R6, HARDFLOAT; + def MAXA_S : MAXA_S_ENC, MAXA_S_DESC, ISA_MIPS32R6, HARDFLOAT; + def MAX_D : MAX_D_ENC, MAX_D_DESC, ISA_MIPS32R6, HARDFLOAT; + def MAX_S : MAX_S_ENC, MAX_S_DESC, ISA_MIPS32R6, HARDFLOAT; + def MINA_D : MINA_D_ENC, MINA_D_DESC, ISA_MIPS32R6, HARDFLOAT; + def MINA_S : MINA_S_ENC, MINA_S_DESC, ISA_MIPS32R6, HARDFLOAT; + def MIN_D : MIN_D_ENC, MIN_D_DESC, ISA_MIPS32R6, HARDFLOAT; + def MIN_S : MIN_S_ENC, MIN_S_DESC, ISA_MIPS32R6, HARDFLOAT; } def MOD : R6MMR6Rel, MOD_ENC, MOD_DESC, ISA_MIPS32R6; def MODU : R6MMR6Rel, MODU_ENC, MODU_DESC, ISA_MIPS32R6; @@ -728,21 +730,27 @@ def MUL_R6 : R6MMR6Rel, MUL_R6_ENC, MUL_R6_DESC, ISA_MIPS32R6; def MULU : R6MMR6Rel, MULU_ENC, MULU_DESC, ISA_MIPS32R6; def NAL; // BAL with rd=0 def PREF_R6 : R6MMR6Rel, PREF_ENC, PREF_DESC, ISA_MIPS32R6; -def RINT_D : RINT_D_ENC, RINT_D_DESC, ISA_MIPS32R6, HARDFLOAT; -def RINT_S : RINT_S_ENC, RINT_S_DESC, ISA_MIPS32R6, HARDFLOAT; +let AdditionalPredicates = [NotInMicroMips] in { + def RINT_D : RINT_D_ENC, RINT_D_DESC, ISA_MIPS32R6, HARDFLOAT; + def RINT_S : RINT_S_ENC, RINT_S_DESC, ISA_MIPS32R6, HARDFLOAT; +} def SC_R6 : SC_R6_ENC, SC_R6_DESC, ISA_MIPS32R6; let AdditionalPredicates = [NotInMicroMips] in { def SDBBP_R6 : SDBBP_R6_ENC, SDBBP_R6_DESC, ISA_MIPS32R6; } def SDC2_R6 : SDC2_R6_ENC, SDC2_R6_DESC, ISA_MIPS32R6; def SELEQZ : R6MMR6Rel, SELEQZ_ENC, SELEQZ_DESC, ISA_MIPS32R6, GPR_32; -def SELEQZ_D : SELEQZ_D_ENC, SELEQZ_D_DESC, ISA_MIPS32R6, HARDFLOAT; -def SELEQZ_S : SELEQZ_S_ENC, SELEQZ_S_DESC, ISA_MIPS32R6, HARDFLOAT; +let AdditionalPredicates = [NotInMicroMips] in { + def SELEQZ_D : SELEQZ_D_ENC, SELEQZ_D_DESC, ISA_MIPS32R6, HARDFLOAT; + def SELEQZ_S : SELEQZ_S_ENC, SELEQZ_S_DESC, ISA_MIPS32R6, HARDFLOAT; +} def SELNEZ : R6MMR6Rel, SELNEZ_ENC, SELNEZ_DESC, ISA_MIPS32R6, GPR_32; -def SELNEZ_D : SELNEZ_D_ENC, SELNEZ_D_DESC, ISA_MIPS32R6, HARDFLOAT; -def SELNEZ_S : SELNEZ_S_ENC, SELNEZ_S_DESC, ISA_MIPS32R6, HARDFLOAT; -def SEL_D : SEL_D_ENC, SEL_D_DESC, ISA_MIPS32R6, HARDFLOAT; -def SEL_S : SEL_S_ENC, SEL_S_DESC, ISA_MIPS32R6, HARDFLOAT; +let AdditionalPredicates = [NotInMicroMips] in { + def SELNEZ_D : SELNEZ_D_ENC, SELNEZ_D_DESC, ISA_MIPS32R6, HARDFLOAT; + def SELNEZ_S : SELNEZ_S_ENC, SELNEZ_S_DESC, ISA_MIPS32R6, HARDFLOAT; + def SEL_D : SEL_D_ENC, SEL_D_DESC, ISA_MIPS32R6, HARDFLOAT; + def SEL_S : SEL_S_ENC, SEL_S_DESC, ISA_MIPS32R6, HARDFLOAT; +} def SWC2_R6 : SWC2_R6_ENC, SWC2_R6_DESC, ISA_MIPS32R6; //===----------------------------------------------------------------------===// diff --git a/lib/Target/Mips/MipsDSPInstrInfo.td b/lib/Target/Mips/MipsDSPInstrInfo.td index 9b4b9d178183..f696a38ac0f0 100644 --- a/lib/Target/Mips/MipsDSPInstrInfo.td +++ b/lib/Target/Mips/MipsDSPInstrInfo.td @@ -275,6 +275,7 @@ class RADDU_W_QB_DESC_BASE Pattern = [(set ROD:$rd, (OpNode ROS:$rs))]; InstrItinClass Itinerary = itin; + string BaseOpcode = instr_asm; } class CMP_EQ_QB_R2_DESC_BASE Pattern = [(set ROD:$rd, (OpNode ROS:$rs, ROT:$rt))]; InstrItinClass Itinerary = itin; + string BaseOpcode = instr_asm; } class PRECR_SRA_PH_W_DESC_BASE Pattern = [(set ROT:$rt, (OpNode ROS:$src, ROS:$rs, immZExt5:$sa))]; InstrItinClass Itinerary = itin; string Constraints = "$src = $rt"; + string BaseOpcode = instr_asm; } class ABSQ_S_PH_R2_DESC_BASE Pattern = [(set RO:$rd, (OpNode immPat:$imm))]; InstrItinClass Itinerary = itin; + string BaseOpcode = instr_asm; } class SHLL_QB_R3_DESC_BASE Pattern = [(set GPR32Opnd:$rd, (OpNode iPTR:$base, 
iPTR:$index))]; InstrItinClass Itinerary = itin; bit mayLoad = 1; + string BaseOpcode = instr_asm; } class ADDUH_QB_DESC_BASE { list Pattern = [(set ACC64DSPOpnd:$ac, (OpNode GPR32Opnd:$rs, ACC64DSPOpnd:$acin))]; string Constraints = "$acin = $ac"; + string BaseOpcode = instr_asm; } class RDDSP_DESC_BASE Pattern = [(set GPR32Opnd:$rd, (OpNode immZExt10:$mask))]; InstrItinClass Itinerary = itin; + string BaseOpcode = instr_asm; } class WRDSP_DESC_BASE Pattern = [(set GPR32Opnd:$rd, (OpNode RO:$ac))]; InstrItinClass Itinerary = itin; + string BaseOpcode = instr_asm; } class MTHI_DESC_BASE { @@ -492,6 +501,7 @@ class MTHI_DESC_BASE dag InOperandList = (ins GPR32Opnd:$rs); string AsmString = !strconcat(instr_asm, "\t$rs, $ac"); InstrItinClass Itinerary = itin; + string BaseOpcode = instr_asm; } class BPOSGE32_PSEUDO_DESC_BASE : @@ -1102,13 +1112,13 @@ def SUBQ_S_W : DspMMRel, SUBQ_S_W_ENC, SUBQ_S_W_DESC; def ADDSC : DspMMRel, ADDSC_ENC, ADDSC_DESC; def ADDWC : DspMMRel, ADDWC_ENC, ADDWC_DESC; def MODSUB : MODSUB_ENC, MODSUB_DESC; -def RADDU_W_QB : RADDU_W_QB_ENC, RADDU_W_QB_DESC; +def RADDU_W_QB : DspMMRel, RADDU_W_QB_ENC, RADDU_W_QB_DESC; def ABSQ_S_PH : DspMMRel, ABSQ_S_PH_ENC, ABSQ_S_PH_DESC; def ABSQ_S_W : DspMMRel, ABSQ_S_W_ENC, ABSQ_S_W_DESC; -def PRECRQ_QB_PH : PRECRQ_QB_PH_ENC, PRECRQ_QB_PH_DESC; -def PRECRQ_PH_W : PRECRQ_PH_W_ENC, PRECRQ_PH_W_DESC; -def PRECRQ_RS_PH_W : PRECRQ_RS_PH_W_ENC, PRECRQ_RS_PH_W_DESC; -def PRECRQU_S_QB_PH : PRECRQU_S_QB_PH_ENC, PRECRQU_S_QB_PH_DESC; +def PRECRQ_QB_PH : DspMMRel, PRECRQ_QB_PH_ENC, PRECRQ_QB_PH_DESC; +def PRECRQ_PH_W : DspMMRel, PRECRQ_PH_W_ENC, PRECRQ_PH_W_DESC; +def PRECRQ_RS_PH_W : DspMMRel, PRECRQ_RS_PH_W_ENC, PRECRQ_RS_PH_W_DESC; +def PRECRQU_S_QB_PH : DspMMRel, PRECRQU_S_QB_PH_ENC, PRECRQU_S_QB_PH_DESC; def PRECEQ_W_PHL : DspMMRel, PRECEQ_W_PHL_ENC, PRECEQ_W_PHL_DESC; def PRECEQ_W_PHR : DspMMRel, PRECEQ_W_PHR_ENC, PRECEQ_W_PHR_DESC; def PRECEQU_PH_QBL : DspMMRel, PRECEQU_PH_QBL_ENC, PRECEQU_PH_QBL_DESC; @@ -1141,14 +1151,14 @@ def MULEQ_S_W_PHL : DspMMRel, MULEQ_S_W_PHL_ENC, MULEQ_S_W_PHL_DESC; def MULEQ_S_W_PHR : DspMMRel, MULEQ_S_W_PHR_ENC, MULEQ_S_W_PHR_DESC; def MULQ_RS_PH : DspMMRel, MULQ_RS_PH_ENC, MULQ_RS_PH_DESC; def MULSAQ_S_W_PH : MULSAQ_S_W_PH_ENC, MULSAQ_S_W_PH_DESC; -def MAQ_S_W_PHL : MAQ_S_W_PHL_ENC, MAQ_S_W_PHL_DESC; -def MAQ_S_W_PHR : MAQ_S_W_PHR_ENC, MAQ_S_W_PHR_DESC; -def MAQ_SA_W_PHL : MAQ_SA_W_PHL_ENC, MAQ_SA_W_PHL_DESC; -def MAQ_SA_W_PHR : MAQ_SA_W_PHR_ENC, MAQ_SA_W_PHR_DESC; -def MFHI_DSP : MFHI_ENC, MFHI_DESC; -def MFLO_DSP : MFLO_ENC, MFLO_DESC; -def MTHI_DSP : MTHI_ENC, MTHI_DESC; -def MTLO_DSP : MTLO_ENC, MTLO_DESC; +def MAQ_S_W_PHL : DspMMRel, MAQ_S_W_PHL_ENC, MAQ_S_W_PHL_DESC; +def MAQ_S_W_PHR : DspMMRel, MAQ_S_W_PHR_ENC, MAQ_S_W_PHR_DESC; +def MAQ_SA_W_PHL : DspMMRel, MAQ_SA_W_PHL_ENC, MAQ_SA_W_PHL_DESC; +def MAQ_SA_W_PHR : DspMMRel, MAQ_SA_W_PHR_ENC, MAQ_SA_W_PHR_DESC; +def MFHI_DSP : DspMMRel, MFHI_ENC, MFHI_DESC; +def MFLO_DSP : DspMMRel, MFLO_ENC, MFLO_DESC; +def MTHI_DSP : DspMMRel, MTHI_ENC, MTHI_DESC; +def MTLO_DSP : DspMMRel, MTLO_ENC, MTLO_DESC; def DPAU_H_QBL : DspMMRel, DPAU_H_QBL_ENC, DPAU_H_QBL_DESC; def DPAU_H_QBR : DspMMRel, DPAU_H_QBR_ENC, DPAU_H_QBR_DESC; def DPSU_H_QBL : DspMMRel, DPSU_H_QBL_ENC, DPSU_H_QBL_DESC; @@ -1174,15 +1184,15 @@ def CMP_LT_PH : CMP_LT_PH_ENC, CMP_LT_PH_DESC; def CMP_LE_PH : CMP_LE_PH_ENC, CMP_LE_PH_DESC; def BITREV : BITREV_ENC, BITREV_DESC; def PACKRL_PH : PACKRL_PH_ENC, PACKRL_PH_DESC; -def REPL_QB : REPL_QB_ENC, REPL_QB_DESC; -def REPL_PH : REPL_PH_ENC, 
REPL_PH_DESC; -def REPLV_QB : REPLV_QB_ENC, REPLV_QB_DESC; -def REPLV_PH : REPLV_PH_ENC, REPLV_PH_DESC; +def REPL_QB : DspMMRel, REPL_QB_ENC, REPL_QB_DESC; +def REPL_PH : DspMMRel, REPL_PH_ENC, REPL_PH_DESC; +def REPLV_QB : DspMMRel, REPLV_QB_ENC, REPLV_QB_DESC; +def REPLV_PH : DspMMRel, REPLV_PH_ENC, REPLV_PH_DESC; def PICK_QB : PICK_QB_ENC, PICK_QB_DESC; def PICK_PH : PICK_PH_ENC, PICK_PH_DESC; -def LWX : LWX_ENC, LWX_DESC; -def LHX : LHX_ENC, LHX_DESC; -def LBUX : LBUX_ENC, LBUX_DESC; +def LWX : DspMMRel, LWX_ENC, LWX_DESC; +def LHX : DspMMRel, LHX_ENC, LHX_DESC; +def LBUX : DspMMRel, LBUX_ENC, LBUX_DESC; def BPOSGE32 : BPOSGE32_ENC, BPOSGE32_DESC; def INSV : DspMMRel, INSV_ENC, INSV_DESC; def EXTP : DspMMRel, EXTP_ENC, EXTP_DESC; @@ -1199,8 +1209,8 @@ def EXTR_S_H : DspMMRel, EXTR_S_H_ENC, EXTR_S_H_DESC; def EXTRV_S_H : DspMMRel, EXTRV_S_H_ENC, EXTRV_S_H_DESC; def SHILO : SHILO_ENC, SHILO_DESC; def SHILOV : SHILOV_ENC, SHILOV_DESC; -def MTHLIP : MTHLIP_ENC, MTHLIP_DESC; -def RDDSP : RDDSP_ENC, RDDSP_DESC; +def MTHLIP : DspMMRel, MTHLIP_ENC, MTHLIP_DESC; +def RDDSP : DspMMRel, RDDSP_ENC, RDDSP_DESC; def WRDSP : WRDSP_ENC, WRDSP_DESC; // MIPS DSP Rev 2 @@ -1240,9 +1250,9 @@ def DPSX_W_PH : DspMMRel, DPSX_W_PH_ENC, DPSX_W_PH_DESC; def DPSQX_S_W_PH : DspMMRel, DPSQX_S_W_PH_ENC, DPSQX_S_W_PH_DESC; def DPSQX_SA_W_PH : DspMMRel, DPSQX_SA_W_PH_ENC, DPSQX_SA_W_PH_DESC; def MULSA_W_PH : MULSA_W_PH_ENC, MULSA_W_PH_DESC; -def PRECR_QB_PH : PRECR_QB_PH_ENC, PRECR_QB_PH_DESC; -def PRECR_SRA_PH_W : PRECR_SRA_PH_W_ENC, PRECR_SRA_PH_W_DESC; -def PRECR_SRA_R_PH_W : PRECR_SRA_R_PH_W_ENC, PRECR_SRA_R_PH_W_DESC; +def PRECR_QB_PH : DspMMRel, PRECR_QB_PH_ENC, PRECR_QB_PH_DESC; +def PRECR_SRA_PH_W : DspMMRel, PRECR_SRA_PH_W_ENC, PRECR_SRA_PH_W_DESC; +def PRECR_SRA_R_PH_W : DspMMRel, PRECR_SRA_R_PH_W_ENC, PRECR_SRA_R_PH_W_DESC; def SHRA_QB : DspMMRel, SHRA_QB_ENC, SHRA_QB_DESC; def SHRAV_QB : DspMMRel, SHRAV_QB_ENC, SHRAV_QB_DESC; def SHRA_R_QB : DspMMRel, SHRA_R_QB_ENC, SHRA_R_QB_DESC; @@ -1251,7 +1261,7 @@ def SHRL_PH : DspMMRel, SHRL_PH_ENC, SHRL_PH_DESC; def SHRLV_PH : DspMMRel, SHRLV_PH_ENC, SHRLV_PH_DESC; def APPEND : APPEND_ENC, APPEND_DESC; def BALIGN : BALIGN_ENC, BALIGN_DESC; -def PREPEND : PREPEND_ENC, PREPEND_DESC; +def PREPEND : DspMMRel, PREPEND_ENC, PREPEND_DESC; } diff --git a/lib/Target/Mips/MipsInstrFPU.td b/lib/Target/Mips/MipsInstrFPU.td index d1a724944335..377260f89d10 100644 --- a/lib/Target/Mips/MipsInstrFPU.td +++ b/lib/Target/Mips/MipsInstrFPU.td @@ -136,7 +136,7 @@ multiclass ABSS_M { def _D32 : MMRel, ABSS_FT, FGR_32; - def _D64 : ABSS_FT, FGR_64 { + def _D64 : StdMMR6Rel, ABSS_FT, FGR_64 { let DecoderNamespace = "Mips64"; } } @@ -267,31 +267,29 @@ defm D64 : C_COND_M<"d", FGR64Opnd, 17, II_C_CC_D>, ISA_MIPS1_NOT_32R6_64R6, //===----------------------------------------------------------------------===// // Floating Point Instructions //===----------------------------------------------------------------------===// -def ROUND_W_S : MMRel, ABSS_FT<"round.w.s", FGR32Opnd, FGR32Opnd, II_ROUND>, +def ROUND_W_S : MMRel, StdMMR6Rel, ABSS_FT<"round.w.s", FGR32Opnd, FGR32Opnd, II_ROUND>, ABSS_FM<0xc, 16>, ISA_MIPS2; -let AdditionalPredicates = [NotInMicroMips] in { +defm ROUND_W : ROUND_M<"round.w.d", II_ROUND>, ABSS_FM<0xc, 17>, ISA_MIPS2; def TRUNC_W_S : MMRel, StdMMR6Rel, ABSS_FT<"trunc.w.s", FGR32Opnd, FGR32Opnd, II_TRUNC>, ABSS_FM<0xd, 16>, ISA_MIPS2; def CEIL_W_S : MMRel, StdMMR6Rel, ABSS_FT<"ceil.w.s", FGR32Opnd, FGR32Opnd, II_CEIL>, ABSS_FM<0xe, 16>, ISA_MIPS2; def FLOOR_W_S : MMRel, 
StdMMR6Rel, ABSS_FT<"floor.w.s", FGR32Opnd, FGR32Opnd, II_FLOOR>, ABSS_FM<0xf, 16>, ISA_MIPS2; -} def CVT_W_S : MMRel, ABSS_FT<"cvt.w.s", FGR32Opnd, FGR32Opnd, II_CVT>, ABSS_FM<0x24, 16>; -defm ROUND_W : ROUND_M<"round.w.d", II_ROUND>, ABSS_FM<0xc, 17>, ISA_MIPS2; defm TRUNC_W : ROUND_M<"trunc.w.d", II_TRUNC>, ABSS_FM<0xd, 17>, ISA_MIPS2; defm CEIL_W : ROUND_M<"ceil.w.d", II_CEIL>, ABSS_FM<0xe, 17>, ISA_MIPS2; defm FLOOR_W : ROUND_M<"floor.w.d", II_FLOOR>, ABSS_FM<0xf, 17>, ISA_MIPS2; defm CVT_W : ROUND_M<"cvt.w.d", II_CVT>, ABSS_FM<0x24, 17>; let DecoderNamespace = "Mips64" in { + let AdditionalPredicates = [NotInMicroMips] in { def ROUND_L_S : ABSS_FT<"round.l.s", FGR64Opnd, FGR32Opnd, II_ROUND>, ABSS_FM<0x8, 16>, FGR_64; def ROUND_L_D64 : ABSS_FT<"round.l.d", FGR64Opnd, FGR64Opnd, II_ROUND>, ABSS_FM<0x8, 17>, FGR_64; - let AdditionalPredicates = [NotInMicroMips] in { def TRUNC_L_S : ABSS_FT<"trunc.l.s", FGR64Opnd, FGR32Opnd, II_TRUNC>, ABSS_FM<0x9, 16>, FGR_64; def TRUNC_L_D64 : ABSS_FT<"trunc.l.d", FGR64Opnd, FGR64Opnd, II_TRUNC>, diff --git a/lib/Target/Mips/MipsLongBranch.cpp b/lib/Target/Mips/MipsLongBranch.cpp index d09843ed0e53..e75858a181e5 100644 --- a/lib/Target/Mips/MipsLongBranch.cpp +++ b/lib/Target/Mips/MipsLongBranch.cpp @@ -262,8 +262,7 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) { static_cast(Subtarget.getInstrInfo()); MF->insert(FallThroughMBB, LongBrMBB); - MBB->removeSuccessor(TgtMBB); - MBB->addSuccessor(LongBrMBB); + MBB->replaceSuccessor(TgtMBB, LongBrMBB); if (IsPIC) { MachineBasicBlock *BalTgtMBB = MF->CreateMachineBasicBlock(BB); diff --git a/lib/Target/PowerPC/PPCFrameLowering.cpp b/lib/Target/PowerPC/PPCFrameLowering.cpp index c7a3bbd3762a..174deb88bc5c 100644 --- a/lib/Target/PowerPC/PPCFrameLowering.cpp +++ b/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -569,14 +569,21 @@ bool PPCFrameLowering::findScratchRegister(MachineBasicBlock *MBB, if ((UseAtEnd && MBB->isReturnBlock()) || (!UseAtEnd && (&MBB->getParent()->front() == MBB))) return true; - - RS.initRegState(); + RS.enterBasicBlock(MBB); - // The scratch register will be used at the end of the block, so must consider - // all registers used within the block - if (UseAtEnd && MBB->begin() != MBB->getFirstTerminator()) - RS.forward(MBB->getFirstTerminator()); + if (UseAtEnd && !MBB->empty()) { + // The scratch register will be used at the end of the block, so must consider + // all registers used within the block + + MachineBasicBlock::iterator MBBI = MBB->getFirstTerminator(); + // If no terminator, back iterator up to previous instruction. + if (MBBI == MBB->end()) + MBBI = std::prev(MBBI); + + if (MBBI != MBB->begin()) + RS.forward(MBBI); + } if (!RS.isRegUsed(R0)) return true; @@ -1768,6 +1775,6 @@ PPCFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, } bool PPCFrameLowering::enableShrinkWrapping(const MachineFunction &MF) const { - // FIXME: Enable this for non-Darwin PPC64 once it is confirmed working. - return false; + return (MF.getSubtarget().isSVR4ABI() && + MF.getSubtarget().isPPC64()); } diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 438a2980f2ce..2261b71c5aa9 100644 --- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -2780,7 +2780,7 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { if (PPCSubTarget->hasVSX() && (N->getValueType(0) == MVT::v2f64 || N->getValueType(0) == MVT::v2i64)) { ShuffleVectorSDNode *SVN = cast(N); - + SDValue Op1 = N->getOperand(SVN->getMaskElt(0) < 2 ? 
0 : 1), Op2 = N->getOperand(SVN->getMaskElt(1) < 2 ? 0 : 1); unsigned DM[2]; @@ -3106,7 +3106,7 @@ SDValue PPCDAGToDAGISel::combineToCMPB(SDNode *N) { if (!CurDAG->MaskedValueIsZero(Op0, APInt::getHighBitsSet(Bits, Bits - (b+1)*8))) return false; - + LHS = Op0.getOperand(0); RHS = Op0.getOperand(1); return true; diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index 1b1e0cf57865..176a8b3ea59b 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -329,6 +329,8 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::STACKRESTORE , MVT::Other, Custom); setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32 , Custom); setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64 , Custom); + setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, MVT::i32, Custom); + setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, MVT::i64, Custom); // We want to custom lower some of our intrinsics. setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); @@ -477,6 +479,8 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand); setOperationAction(ISD::VSELECT, VT, Expand); setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand); + setOperationAction(ISD::ROTL, VT, Expand); + setOperationAction(ISD::ROTR, VT, Expand); for (MVT InnerVT : MVT::vector_valuetypes()) { setTruncStoreAction(VT, InnerVT, Expand); @@ -998,6 +1002,7 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { case PPCISD::Lo: return "PPCISD::Lo"; case PPCISD::TOC_ENTRY: return "PPCISD::TOC_ENTRY"; case PPCISD::DYNALLOC: return "PPCISD::DYNALLOC"; + case PPCISD::DYNAREAOFFSET: return "PPCISD::DYNAREAOFFSET"; case PPCISD::GlobalBaseReg: return "PPCISD::GlobalBaseReg"; case PPCISD::SRL: return "PPCISD::SRL"; case PPCISD::SRA: return "PPCISD::SRA"; @@ -5808,6 +5813,22 @@ PPCTargetLowering::LowerReturn(SDValue Chain, return DAG.getNode(PPCISD::RET_FLAG, dl, MVT::Other, RetOps); } +SDValue PPCTargetLowering::LowerGET_DYNAMIC_AREA_OFFSET( + SDValue Op, SelectionDAG &DAG, const PPCSubtarget &Subtarget) const { + SDLoc dl(Op); + + // Get the corect type for integers. + EVT IntVT = Op.getValueType(); + + // Get the inputs. + SDValue Chain = Op.getOperand(0); + SDValue FPSIdx = getFramePointerFrameIndex(DAG); + // Build a DYNAREAOFFSET node. + SDValue Ops[2] = {Chain, FPSIdx}; + SDVTList VTs = DAG.getVTList(IntVT); + return DAG.getNode(PPCISD::DYNAREAOFFSET, dl, VTs, Ops); +} + SDValue PPCTargetLowering::LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG, const PPCSubtarget &Subtarget) const { // When we pop the dynamic allocation we need to restore the SP link. @@ -7938,6 +7959,7 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::STACKRESTORE: return LowerSTACKRESTORE(Op, DAG, Subtarget); case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG, Subtarget); + case ISD::GET_DYNAMIC_AREA_OFFSET: return LowerGET_DYNAMIC_AREA_OFFSET(Op, DAG, Subtarget); case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG); case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG); diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h index 415c47c286e3..c0aafbac1aa0 100644 --- a/lib/Target/PowerPC/PPCISelLowering.h +++ b/lib/Target/PowerPC/PPCISelLowering.h @@ -79,6 +79,11 @@ namespace llvm { /// compute an allocation on the stack. 
DYNALLOC, + /// This instruction is lowered in PPCRegisterInfo::eliminateFrameIndex to + /// compute an offset from native SP to the address of the most recent + /// dynamic alloca. + DYNAREAOFFSET, + /// GlobalBaseReg - On Darwin, this node represents the result of the mflr /// at function entry, used for PIC code. GlobalBaseReg, @@ -728,6 +733,8 @@ namespace llvm { const PPCSubtarget &Subtarget) const; SDValue LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG, const PPCSubtarget &Subtarget) const; + SDValue LowerGET_DYNAMIC_AREA_OFFSET(SDValue Op, SelectionDAG &DAG, + const PPCSubtarget &Subtarget) const; SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG, const PPCSubtarget &Subtarget) const; SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td index d62833037db5..075e093e41a1 100644 --- a/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/lib/Target/PowerPC/PPCInstr64Bit.td @@ -369,6 +369,8 @@ let Defs = [X1], Uses = [X1] in def DYNALLOC8 : Pseudo<(outs g8rc:$result), (ins g8rc:$negsize, memri:$fpsi),"#DYNALLOC8", [(set i64:$result, (PPCdynalloc i64:$negsize, iaddr:$fpsi))]>; +def DYNAREAOFFSET8 : Pseudo<(outs i64imm:$result), (ins memri:$fpsi), "#DYNAREAOFFSET8", + [(set i64:$result, (PPCdynareaoffset iaddr:$fpsi))]>; let Defs = [LR8] in { def MTLR8 : XFXForm_7_ext<31, 467, 8, (outs), (ins g8rc:$rS), diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td index cc1af1a7132f..6c4364aad331 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.td +++ b/lib/Target/PowerPC/PPCInstrInfo.td @@ -226,7 +226,9 @@ def PPCcr6unset : SDNode<"PPCISD::CR6UNSET", SDTNone, // Instructions to support dynamic alloca. def SDTDynOp : SDTypeProfile<1, 2, []>; +def SDTDynAreaOp : SDTypeProfile<1, 1, []>; def PPCdynalloc : SDNode<"PPCISD::DYNALLOC", SDTDynOp, [SDNPHasChain]>; +def PPCdynareaoffset : SDNode<"PPCISD::DYNAREAOFFSET", SDTDynAreaOp, [SDNPHasChain]>; //===----------------------------------------------------------------------===// // PowerPC specific transformation functions and pattern fragments. @@ -1029,6 +1031,8 @@ let Defs = [R1], Uses = [R1] in def DYNALLOC : Pseudo<(outs gprc:$result), (ins gprc:$negsize, memri:$fpsi), "#DYNALLOC", [(set i32:$result, (PPCdynalloc i32:$negsize, iaddr:$fpsi))]>; +def DYNAREAOFFSET : Pseudo<(outs i32imm:$result), (ins memri:$fpsi), "#DYNAREAOFFSET", + [(set i32:$result, (PPCdynareaoffset iaddr:$fpsi))]>; // SELECT_CC_* - Used to implement the SELECT_CC DAG operation. Expanded after // instruction selection into a branch sequence. diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp index 6d53f876c062..934bdf622418 100644 --- a/lib/Target/PowerPC/PPCRegisterInfo.cpp +++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -430,6 +430,27 @@ void PPCRegisterInfo::lowerDynamicAlloc(MachineBasicBlock::iterator II) const { MBB.erase(II); } +void PPCRegisterInfo::lowerDynamicAreaOffset( + MachineBasicBlock::iterator II) const { + // Get the instruction. + MachineInstr &MI = *II; + // Get the instruction's basic block. + MachineBasicBlock &MBB = *MI.getParent(); + // Get the basic block's function. + MachineFunction &MF = *MBB.getParent(); + // Get the frame info. + MachineFrameInfo *MFI = MF.getFrameInfo(); + const PPCSubtarget &Subtarget = MF.getSubtarget(); + // Get the instruction info. 
+ const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); + + unsigned maxCallFrameSize = MFI->getMaxCallFrameSize(); + DebugLoc dl = MI.getDebugLoc(); + BuildMI(MBB, II, dl, TII.get(PPC::LI), MI.getOperand(0).getReg()) + .addImm(maxCallFrameSize); + MBB.erase(II); +} + /// lowerCRSpilling - Generate the code for spilling a CR register. Instead of /// reserving a whole register (R0), we scrounge for one here. This generates /// code like this: @@ -754,6 +775,11 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // Get the instruction opcode. unsigned OpC = MI.getOpcode(); + if ((OpC == PPC::DYNAREAOFFSET || OpC == PPC::DYNAREAOFFSET8)) { + lowerDynamicAreaOffset(II); + return; + } + // Special case for dynamic alloca. if (FPSI && FrameIndex == FPSI && (OpC == PPC::DYNALLOC || OpC == PPC::DYNALLOC8)) { diff --git a/lib/Target/PowerPC/PPCRegisterInfo.h b/lib/Target/PowerPC/PPCRegisterInfo.h index 1b1e160d836c..b15fde83c9f3 100644 --- a/lib/Target/PowerPC/PPCRegisterInfo.h +++ b/lib/Target/PowerPC/PPCRegisterInfo.h @@ -101,6 +101,7 @@ class PPCRegisterInfo : public PPCGenRegisterInfo { } void lowerDynamicAlloc(MachineBasicBlock::iterator II) const; + void lowerDynamicAreaOffset(MachineBasicBlock::iterator II) const; void lowerCRSpilling(MachineBasicBlock::iterator II, unsigned FrameIndex) const; void lowerCRRestore(MachineBasicBlock::iterator II, diff --git a/lib/Target/Sparc/DelaySlotFiller.cpp b/lib/Target/Sparc/DelaySlotFiller.cpp index 38bff44e7542..c689b7f7201e 100644 --- a/lib/Target/Sparc/DelaySlotFiller.cpp +++ b/lib/Target/Sparc/DelaySlotFiller.cpp @@ -122,6 +122,8 @@ bool Filler::runOnMachineBasicBlock(MachineBasicBlock &MBB) { continue; } + // TODO: If we ever want to support v7, this needs to be extended + // to cover all floating point operations. if (!Subtarget->isV9() && (MI->getOpcode() == SP::FCMPS || MI->getOpcode() == SP::FCMPD || MI->getOpcode() == SP::FCMPQ)) { diff --git a/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp b/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp index 728910422166..110316ba57b2 100644 --- a/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp +++ b/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp @@ -162,9 +162,9 @@ static void SortBlocks(MachineFunction &MF, const MachineLoopInfo &MLI) { SmallPtrSet Visited; SmallVector Stack; - MachineBasicBlock *Entry = &*MF.begin(); - Visited.insert(Entry); - Stack.push_back(POStackEntry(Entry, MF, MLI)); + MachineBasicBlock *EntryBlock = &*MF.begin(); + Visited.insert(EntryBlock); + Stack.push_back(POStackEntry(EntryBlock, MF, MLI)); for (;;) { POStackEntry &Entry = Stack.back(); @@ -220,7 +220,7 @@ static void SortBlocks(MachineFunction &MF, const MachineLoopInfo &MLI) { #endif } -static int GetLoopDepth(const MachineLoop *Loop) { +static unsigned GetLoopDepth(const MachineLoop *Loop) { return Loop ? Loop->getLoopDepth() : 0; } @@ -249,12 +249,12 @@ static void PlaceBlockMarkers(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPos; MachineLoop *HeaderLoop = MLI.getLoopFor(Header); - int MBBLoopDepth = GetLoopDepth(MLI.getLoopFor(&MBB)); - int HeaderLoopDepth = GetLoopDepth(HeaderLoop); + unsigned MBBLoopDepth = GetLoopDepth(MLI.getLoopFor(&MBB)); + unsigned HeaderLoopDepth = GetLoopDepth(HeaderLoop); if (HeaderLoopDepth > MBBLoopDepth) { // The nearest common dominating point is more deeply nested. Insert the // BLOCK just above the LOOP. 
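    // A small worked example of the walk below, assuming MBBLoopDepth == 1 and
    // HeaderLoopDepth == 3 (illustrative values only): the loop body executes
    // HeaderLoopDepth - 1 - MBBLoopDepth == 1 time, so HeaderLoop ends up at
    // depth MBBLoopDepth + 1 and the BLOCK is emitted at the top of that loop's
    // header, i.e. "just above the LOOP" as the comment above describes.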
- for (int i = 0; i < HeaderLoopDepth - 1 - MBBLoopDepth; ++i) + for (unsigned i = 0; i < HeaderLoopDepth - 1 - MBBLoopDepth; ++i) HeaderLoop = HeaderLoop->getParentLoop(); Header = HeaderLoop->getHeader(); InsertPos = Header->begin(); @@ -341,7 +341,7 @@ bool WebAssemblyCFGStackify::runOnMachineFunction(MachineFunction &MF) { PlaceMarkers(MF, MLI, TII, MDT); #ifndef NDEBUG - // Verify that block and loop beginnings and endings are in FIFO order, and + // Verify that block and loop beginnings and endings are in LIFO order, and // that all references to blocks are to blocks on the stack at the point of // the reference. SmallVector, 0> Stack; diff --git a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 8f1a06d46305..2485df1ab5d2 100644 --- a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -353,12 +353,10 @@ SDValue WebAssemblyTargetLowering::LowerReturn( // Record the number and types of the return values. for (const ISD::OutputArg &Out : Outs) { - if (Out.Flags.isByVal()) - fail(DL, DAG, "WebAssembly hasn't implemented byval results"); + assert(!Out.Flags.isByVal() && "byval is not valid for return values"); + assert(!Out.Flags.isNest() && "nest is not valid for return values"); if (Out.Flags.isInAlloca()) fail(DL, DAG, "WebAssembly hasn't implemented inalloca results"); - if (Out.Flags.isNest()) - fail(DL, DAG, "WebAssembly hasn't implemented nest results"); if (Out.Flags.isInConsecutiveRegs()) fail(DL, DAG, "WebAssembly hasn't implemented cons regs results"); if (Out.Flags.isInConsecutiveRegsLast()) diff --git a/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h b/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h index 4760f0d576e4..62c5f33cfad7 100644 --- a/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h +++ b/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h @@ -37,7 +37,7 @@ class WebAssemblyFunctionInfo final : public MachineFunctionInfo { /// determined or made to meet the stack requirements: /// - single use (per path) /// - single def (per path) - /// - defined and used in FIFO order with other stack registers + /// - defined and used in LIFO order with other stack registers BitVector VRegStackified; public: diff --git a/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp b/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp index c3847dd9fcb4..bdccc8577c5e 100644 --- a/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp +++ b/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp @@ -117,7 +117,7 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) { break; // Iterate through the inputs in reverse order, since we'll be pulling - // operands off the stack in FIFO order. + // operands off the stack in LIFO order. bool AnyStackified = false; for (MachineOperand &Op : reverse(Insert->uses())) { // We're only interested in explicit virtual register operands. diff --git a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td index 4057ff7a9b43..80a83fa76b57 100644 --- a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td +++ b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td @@ -40,7 +40,7 @@ def F32_0 : WebAssemblyReg<"%f32.0">; def F64_0 : WebAssemblyReg<"%f64.0">; // The expression stack "register". This is an opaque entity which serves to -// order uses and defs that must remain in FIFO order. +// order uses and defs that must remain in LIFO order. 
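// A minimal sketch of the LIFO constraint (illustrative values, not real
// registers): if %a is pushed onto the expression stack and %b is pushed after
// it, %b must be consumed before %a; consuming %a first (FIFO-style) would not
// match a stack machine's pop order.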
def EXPR_STACK : WebAssemblyReg<"STACK">; // The incoming arguments "register". This is an opaque entity which serves to diff --git a/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp b/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp index d0735b84de60..3a7f50e3b142 100644 --- a/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp +++ b/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp @@ -69,7 +69,8 @@ bool WebAssemblyStoreResults::runOnMachineFunction(MachineFunction &MF) { const MachineRegisterInfo &MRI = MF.getRegInfo(); MachineDominatorTree &MDT = getAnalysis(); - for (auto &MBB : MF) + for (auto &MBB : MF) { + DEBUG(dbgs() << "Basic Block: " << MBB.getName() << '\n'); for (auto &MI : MBB) switch (MI.getOpcode()) { default: @@ -94,9 +95,12 @@ bool WebAssemblyStoreResults::runOnMachineFunction(MachineFunction &MF) { ->getFirstTerminator(); if (&MI == Where || !MDT.dominates(&MI, Where)) continue; + DEBUG(dbgs() << "Setting operand " << O << " in " << *Where << + " from " << MI <<"\n"); O.setReg(ToReg); } } + } return true; } diff --git a/lib/Target/X86/X86ExpandPseudo.cpp b/lib/Target/X86/X86ExpandPseudo.cpp index 6015d3465de3..83a62b731b54 100644 --- a/lib/Target/X86/X86ExpandPseudo.cpp +++ b/lib/Target/X86/X86ExpandPseudo.cpp @@ -19,7 +19,7 @@ #include "X86InstrInfo.h" #include "X86MachineFunctionInfo.h" #include "X86Subtarget.h" -#include "llvm/Analysis/LibCallSemantics.h" +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/CodeGen/Passes.h" // For IDs of passes that are preserved. #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp index f4f7f0cf33b2..682f75c7f51c 100644 --- a/lib/Target/X86/X86FrameLowering.cpp +++ b/lib/Target/X86/X86FrameLowering.cpp @@ -18,7 +18,7 @@ #include "X86Subtarget.h" #include "X86TargetMachine.h" #include "llvm/ADT/SmallSet.h" -#include "llvm/Analysis/LibCallSemantics.h" +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -204,8 +204,12 @@ static bool isEAXLiveIn(MachineFunction &MF) { return false; } -/// Check whether or not the terminators of \p MBB needs to read EFLAGS. -static bool terminatorsNeedFlagsAsInput(const MachineBasicBlock &MBB) { +/// Check if the flags need to be preserved before the terminators. +/// This would be the case, if the eflags is live-in of the region +/// composed by the terminators or live-out of that region, without +/// being defined by a terminator. +static bool +flagsNeedToBePreservedBeforeTheTerminators(const MachineBasicBlock &MBB) { for (const MachineInstr &MI : MBB.terminators()) { bool BreakNext = false; for (const MachineOperand &MO : MI.operands()) { @@ -215,15 +219,27 @@ static bool terminatorsNeedFlagsAsInput(const MachineBasicBlock &MBB) { if (Reg != X86::EFLAGS) continue; - // This terminator needs an eflag that is not defined - // by a previous terminator. + // This terminator needs an eflags that is not defined + // by a previous another terminator: + // EFLAGS is live-in of the region composed by the terminators. if (!MO.isDef()) return true; + // This terminator defines the eflags, i.e., we don't need to preserve it. + // However, we still need to check this specific terminator does not + // read a live-in value. BreakNext = true; } + // We found a definition of the eflags, no need to preserve them. 
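  // Summarizing the three possible outcomes of this scan (illustrative; the
  // conditional-jump terminators such as JE/JNE are the usual EFLAGS readers):
  //  - a terminator reads EFLAGS before any terminator defines it
  //    -> EFLAGS is live-in to the terminator region, preserve it (above);
  //  - a terminator defines EFLAGS before any terminator reads it
  //    -> the old value is dead here, nothing to preserve;
  //  - no terminator touches EFLAGS but a successor has it live-in
  //    -> it is live-out of the block, preserve it (checked below).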
if (BreakNext) - break; + return false; } + + // None of the terminators use or define the eflags. + // Check if they are live-out, that would imply we need to preserve them. + for (const MachineBasicBlock *Succ : MBB.successors()) + if (Succ->isLiveIn(X86::EFLAGS)) + return true; + return false; } @@ -306,7 +322,11 @@ MachineInstrBuilder X86FrameLowering::BuildStackAdjustment( // is tricky. bool UseLEA; if (!InEpilogue) { - UseLEA = STI.useLeaForSP(); + // Check if inserting the prologue at the beginning + // of MBB would require to use LEA operations. + // We need to use LEA operations if EFLAGS is live in, because + // it means an instruction will read it before it gets defined. + UseLEA = STI.useLeaForSP() || MBB.isLiveIn(X86::EFLAGS); } else { // If we can use LEA for SP but we shouldn't, check that none // of the terminators uses the eflags. Otherwise we will insert @@ -315,10 +335,10 @@ MachineInstrBuilder X86FrameLowering::BuildStackAdjustment( // and is an optimization anyway. UseLEA = canUseLEAForSPInEpilogue(*MBB.getParent()); if (UseLEA && !STI.useLeaForSP()) - UseLEA = terminatorsNeedFlagsAsInput(MBB); + UseLEA = flagsNeedToBePreservedBeforeTheTerminators(MBB); // If that assert breaks, that means we do not do the right thing // in canUseAsEpilogue. - assert((UseLEA || !terminatorsNeedFlagsAsInput(MBB)) && + assert((UseLEA || !flagsNeedToBePreservedBeforeTheTerminators(MBB)) && "We shouldn't have allowed this insertion point"); } @@ -2566,10 +2586,10 @@ bool X86FrameLowering::canUseAsEpilogue(const MachineBasicBlock &MBB) const { return true; // If we cannot use LEA to adjust SP, we may need to use ADD, which - // clobbers the EFLAGS. Check that none of the terminators reads the - // EFLAGS, and if one uses it, conservatively assume this is not + // clobbers the EFLAGS. Check that we do not need to preserve it, + // otherwise, conservatively assume this is not // safe to insert the epilogue here. - return !terminatorsNeedFlagsAsInput(MBB); + return !flagsNeedToBePreservedBeforeTheTerminators(MBB); } MachineBasicBlock::iterator X86FrameLowering::restoreWin32EHStackPointers( diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index fa8c9cb136f5..17573733b3ec 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -26,7 +26,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringSwitch.h" -#include "llvm/Analysis/LibCallSemantics.h" +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -1145,7 +1145,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i32, Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i64, Custom); - if (Subtarget->hasFMA() || Subtarget->hasFMA4() || Subtarget->hasAVX512()) { + if (Subtarget->hasAnyFMA()) { setOperationAction(ISD::FMA, MVT::v8f32, Legal); setOperationAction(ISD::FMA, MVT::v4f64, Legal); setOperationAction(ISD::FMA, MVT::v4f32, Legal); @@ -4301,7 +4301,7 @@ static SDValue getConstVector(ArrayRef Values, MVT VT, } /// Returns a vector of specified type with all zero elements. 
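/// A minimal usage sketch for the MVT-only signature below (assuming the
/// caller already holds a legal, simple type, e.g. via getSimpleValueType()):
///   SDValue Zero = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);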
-static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget, +static SDValue getZeroVector(MVT VT, const X86Subtarget *Subtarget, SelectionDAG &DAG, SDLoc dl) { assert(VT.isVector() && "Expected a vector type"); @@ -6428,7 +6428,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { SmallVector V(NumElems); if (NumElems == 4 && NumZero > 0) { for (unsigned i = 0; i < 4; ++i) { - bool isZero = !(NonZeros & (1 << i)); + bool isZero = !(NonZeros & (1ULL << i)); if (isZero) V[i] = getZeroVector(VT, Subtarget, DAG, dl); else @@ -16525,6 +16525,24 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget DAG.getConstant(X86CC, dl, MVT::i8), Cond); return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } + case COMI_RM: { // Comparison intrinsics with Sae + SDValue LHS = Op.getOperand(1); + SDValue RHS = Op.getOperand(2); + SDValue CC = Op.getOperand(3); + SDValue Sae = Op.getOperand(4); + auto ComiType = TranslateX86ConstCondToX86CC(CC); + // choose between ordered and unordered (comi/ucomi) + unsigned comiOp = std::get<0>(ComiType) ? IntrData->Opc0 : IntrData->Opc1; + SDValue Cond; + if (cast(Sae)->getZExtValue() != + X86::STATIC_ROUNDING::CUR_DIRECTION) + Cond = DAG.getNode(comiOp, dl, MVT::i32, LHS, RHS, Sae); + else + Cond = DAG.getNode(comiOp, dl, MVT::i32, LHS, RHS); + SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, + DAG.getConstant(std::get<1>(ComiType), dl, MVT::i8), Cond); + return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); + } case VSHIFT: return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(), Op.getOperand(1), Op.getOperand(2), DAG); @@ -16800,7 +16818,7 @@ static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); SDValue Segment = DAG.getRegister(0, MVT::i32); if (Src.getOpcode() == ISD::UNDEF) - Src = getZeroVector(Op.getValueType(), Subtarget, DAG, dl); + Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl); SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain}; SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) }; @@ -20415,7 +20433,7 @@ bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; } bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { - if (!(Subtarget->hasFMA() || Subtarget->hasFMA4() || Subtarget->hasAVX512())) + if (!Subtarget->hasAnyFMA()) return false; VT = VT.getScalarType(); @@ -22480,7 +22498,7 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, ShuffleVectorSDNode *SVOp = cast(N); SDValue V1 = SVOp->getOperand(0); SDValue V2 = SVOp->getOperand(1); - EVT VT = SVOp->getValueType(0); + MVT VT = SVOp->getSimpleValueType(0); unsigned NumElems = VT.getVectorNumElements(); if (V1.getOpcode() == ISD::CONCAT_VECTORS && @@ -23363,7 +23381,7 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, return AddSub; // Combine 256-bit vector shuffles. This is only profitable when in AVX mode - if (Subtarget->hasFp256() && VT.is256BitVector() && + if (TLI.isTypeLegal(VT) && Subtarget->hasFp256() && VT.is256BitVector() && N->getOpcode() == ISD::VECTOR_SHUFFLE) return PerformShuffleCombine256(N, DAG, DCI, Subtarget); @@ -24838,7 +24856,7 @@ static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG, // the element size. The constant shift amount will be // encoded as a 8-bit immediate. 
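  // For example, with VT == v4i32 the per-element width is 32 bits, so any
  // constant logical shift amount of 32 or more clears every lane and the
  // whole shift can be folded to a zero vector by the check below.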
if (ShiftAmt.trunc(8).uge(MaxAmount)) - return getZeroVector(VT, Subtarget, DAG, DL); + return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, DL); } return SDValue(); @@ -25755,8 +25773,8 @@ static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG, SmallVector ShuffleVec(NumElems * SizeRatio, -1); for (unsigned i = 0; i != NumElems; ++i) ShuffleVec[i] = i * SizeRatio; - for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i) - ShuffleVec[i] = NumElems*SizeRatio; + for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i) + ShuffleVec[i] = NumElems * SizeRatio; NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask, DAG.getConstant(0, dl, WideVecVT), &ShuffleVec[0]); @@ -25837,8 +25855,8 @@ static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG, "WideVecVT should be legal"); SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec, - DAG.getUNDEF(WideVecVT), - &ShuffleVec[0]); + DAG.getUNDEF(WideVecVT), + &ShuffleVec[0]); SDValue NewMask; SDValue Mask = Mst->getMask(); @@ -25870,8 +25888,9 @@ static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG, NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops); } - return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal, Mst->getBasePtr(), - NewMask, StVT, Mst->getMemOperand(), false); + return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal, + Mst->getBasePtr(), NewMask, StVT, + Mst->getMemOperand(), false); } /// PerformSTORECombine - Do target-specific dag combines on STORE nodes. static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, @@ -26299,24 +26318,40 @@ static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG, static SDValue PerformFNEGCombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { EVT VT = N->getValueType(0); + EVT SVT = VT.getScalarType(); SDValue Arg = N->getOperand(0); + SDLoc DL(N); + + // Let legalize expand this if it isn't a legal type yet. + if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) + return SDValue(); + + // If we're negating a FMUL node on a target with FMA, then we can avoid the + // use of a constant by performing (-0 - A*B) instead. + // FIXME: Check rounding control flags as well once it becomes available. + if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) && + Arg->getFlags()->hasNoSignedZeros() && Subtarget->hasAnyFMA()) { + SDValue Zero = DAG.getConstantFP(0.0, DL, VT); + return DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0), + Arg.getOperand(1), Zero); + } // If we're negating a FMA node, then we can adjust the // instruction to include the extra negation. 
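  // The folds below follow the usual x86 FMA identities (a sketch, assuming
  // FMADD = A*B + C, FMSUB = A*B - C, FNMADD = -(A*B) + C, FNMSUB = -(A*B) - C):
  //   -(A*B + C)    = FNMSUB(A, B, C)      -(A*B - C)    = FNMADD(A, B, C)
  //   -(-(A*B) + C) = FMSUB(A, B, C)       -(-(A*B) - C) = FMADD(A, B, C)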
if (Arg.hasOneUse()) { switch (Arg.getOpcode()) { - case X86ISD::FMADD: - return DAG.getNode(X86ISD::FNMSUB, SDLoc(N), VT, Arg.getOperand(0), - Arg.getOperand(1), Arg.getOperand(2)); - case X86ISD::FMSUB: - return DAG.getNode(X86ISD::FNMADD, SDLoc(N), VT, Arg.getOperand(0), - Arg.getOperand(1), Arg.getOperand(2)); - case X86ISD::FNMADD: - return DAG.getNode(X86ISD::FMSUB, SDLoc(N), VT, Arg.getOperand(0), - Arg.getOperand(1), Arg.getOperand(2)); - case X86ISD::FNMSUB: - return DAG.getNode(X86ISD::FMADD, SDLoc(N), VT, Arg.getOperand(0), - Arg.getOperand(1), Arg.getOperand(2)); + case X86ISD::FMADD: + return DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0), + Arg.getOperand(1), Arg.getOperand(2)); + case X86ISD::FMSUB: + return DAG.getNode(X86ISD::FNMADD, DL, VT, Arg.getOperand(0), + Arg.getOperand(1), Arg.getOperand(2)); + case X86ISD::FNMADD: + return DAG.getNode(X86ISD::FMSUB, DL, VT, Arg.getOperand(0), + Arg.getOperand(1), Arg.getOperand(2)); + case X86ISD::FNMSUB: + return DAG.getNode(X86ISD::FMADD, DL, VT, Arg.getOperand(0), + Arg.getOperand(1), Arg.getOperand(2)); } } return SDValue(); @@ -26631,9 +26666,7 @@ static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG, return SDValue(); EVT ScalarVT = VT.getScalarType(); - if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || - (!Subtarget->hasFMA() && !Subtarget->hasFMA4() && - !Subtarget->hasAVX512())) + if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget->hasAnyFMA()) return SDValue(); SDValue A = N->getOperand(0); @@ -26732,18 +26765,18 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG, if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB) if (isNullConstant(LHS.getOperand(0)) && LHS.hasOneUse()) { - SDValue addV = DAG.getNode(ISD::ADD, DL, LHS.getValueType(), RHS, - LHS.getOperand(1)); - return DAG.getSetCC(DL, N->getValueType(0), addV, - DAG.getConstant(0, DL, addV.getValueType()), CC); - } + SDValue addV = DAG.getNode(ISD::ADD, DL, LHS.getValueType(), RHS, + LHS.getOperand(1)); + return DAG.getSetCC(DL, N->getValueType(0), addV, + DAG.getConstant(0, DL, addV.getValueType()), CC); + } if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB) if (isNullConstant(RHS.getOperand(0)) && RHS.hasOneUse()) { - SDValue addV = DAG.getNode(ISD::ADD, DL, RHS.getValueType(), LHS, - RHS.getOperand(1)); - return DAG.getSetCC(DL, N->getValueType(0), addV, - DAG.getConstant(0, DL, addV.getValueType()), CC); - } + SDValue addV = DAG.getNode(ISD::ADD, DL, RHS.getValueType(), LHS, + RHS.getOperand(1)); + return DAG.getSetCC(DL, N->getValueType(0), addV, + DAG.getConstant(0, DL, addV.getValueType()), CC); + } if (VT.getScalarType() == MVT::i1 && (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) { diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index 16b1f3b59b0d..d15d0dc96e6f 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -5605,6 +5605,29 @@ let Predicates = [HasAVX512] in { EVEX, EVEX_V128, EVEX_CD8<32, CD8VH>; } } + +// Unordered/Ordered scalar fp compare with Sea and set EFLAGS +multiclass avx512_ord_cmp_sae opc, X86VectorVTInfo _, SDNode OpNode, + string OpcodeStr> { + def rb: AVX512, EVEX, EVEX_B, VEX_LIG, EVEX_V128, + Sched<[WriteFAdd]>; +} + +let Defs = [EFLAGS], Predicates = [HasAVX512] in { + defm VUCOMISSZ : avx512_ord_cmp_sae<0x2E, v4f32x_info, X86ucomiSae, "vucomiss">, + AVX512PSIi8Base, EVEX_CD8<32, CD8VT1>; + defm VUCOMISDZ : avx512_ord_cmp_sae<0x2E, v2f64x_info, 
X86ucomiSae, "vucomisd">, + AVX512PDIi8Base, VEX_W, EVEX_CD8<64, CD8VT1>; + defm VCOMISSZ : avx512_ord_cmp_sae<0x2F, v4f32x_info, X86comiSae, "vcomiss">, + AVX512PSIi8Base, EVEX_CD8<32, CD8VT1>; + defm VCOMISDZ : avx512_ord_cmp_sae<0x2F, v2f64x_info, X86comiSae, "vcomisd">, + AVX512PDIi8Base, VEX_W, EVEX_CD8<64, CD8VT1>; +} + let Defs = [EFLAGS], Predicates = [HasAVX512] in { defm VUCOMISSZ : sse12_ord_cmp<0x2E, FR32X, X86cmp, f32, f32mem, loadf32, "ucomiss">, PS, EVEX, VEX_LIG, @@ -5911,12 +5934,12 @@ multiclass avx512_sqrt_scalar opc, string OpcodeStr,X86VectorVTInfo _, EVEX_B, EVEX_RC; let isCodeGenOnly = 1 in { - def r : SI; let mayLoad = 1 in - def m : SI; } diff --git a/lib/Target/X86/X86InstrFPStack.td b/lib/Target/X86/X86InstrFPStack.td index 51648c6c567e..03ae21125b0e 100644 --- a/lib/Target/X86/X86InstrFPStack.td +++ b/lib/Target/X86/X86InstrFPStack.td @@ -137,69 +137,99 @@ def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, RFP80:$src2), TwoArgFP, // The FopST0 series are not included here because of the irregularities // in where the 'r' goes in assembly output. // These instructions cannot address 80-bit memory. -multiclass FPBinary { +multiclass FPBinary { // ST(0) = ST(0) + [mem] def _Fp32m : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, f32mem:$src2), OneArgFPRW, - [(set RFP32:$dst, - (OpNode RFP32:$src1, (loadf32 addr:$src2)))]>; + [!if(Forward, + (set RFP32:$dst, + (OpNode RFP32:$src1, (loadf32 addr:$src2))), + (set RFP32:$dst, + (OpNode (loadf32 addr:$src2), RFP32:$src1)))]>; def _Fp64m : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, f64mem:$src2), OneArgFPRW, - [(set RFP64:$dst, - (OpNode RFP64:$src1, (loadf64 addr:$src2)))]>; + [!if(Forward, + (set RFP64:$dst, + (OpNode RFP64:$src1, (loadf64 addr:$src2))), + (set RFP64:$dst, + (OpNode (loadf64 addr:$src2), RFP64:$src1)))]>; def _Fp64m32: FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, f32mem:$src2), OneArgFPRW, - [(set RFP64:$dst, - (OpNode RFP64:$src1, (f64 (extloadf32 addr:$src2))))]>; + [!if(Forward, + (set RFP64:$dst, + (OpNode RFP64:$src1, (f64 (extloadf32 addr:$src2)))), + (set RFP64:$dst, + (OpNode (f64 (extloadf32 addr:$src2)), RFP64:$src1)))]>; def _Fp80m32: FpI_<(outs RFP80:$dst), (ins RFP80:$src1, f32mem:$src2), OneArgFPRW, - [(set RFP80:$dst, - (OpNode RFP80:$src1, (f80 (extloadf32 addr:$src2))))]>; + [!if(Forward, + (set RFP80:$dst, + (OpNode RFP80:$src1, (f80 (extloadf32 addr:$src2)))), + (set RFP80:$dst, + (OpNode (f80 (extloadf32 addr:$src2)), RFP80:$src1)))]>; def _Fp80m64: FpI_<(outs RFP80:$dst), (ins RFP80:$src1, f64mem:$src2), OneArgFPRW, - [(set RFP80:$dst, - (OpNode RFP80:$src1, (f80 (extloadf64 addr:$src2))))]>; + [!if(Forward, + (set RFP80:$dst, + (OpNode RFP80:$src1, (f80 (extloadf64 addr:$src2)))), + (set RFP80:$dst, + (OpNode (f80 (extloadf64 addr:$src2)), RFP80:$src1)))]>; +let mayLoad = 1 in def _F32m : FPI<0xD8, fp, (outs), (ins f32mem:$src), - !strconcat("f", asmstring, "{s}\t$src")> { - let mayLoad = 1; -} + !strconcat("f", asmstring, "{s}\t$src")>; +let mayLoad = 1 in def _F64m : FPI<0xDC, fp, (outs), (ins f64mem:$src), - !strconcat("f", asmstring, "{l}\t$src")> { - let mayLoad = 1; -} + !strconcat("f", asmstring, "{l}\t$src")>; // ST(0) = ST(0) + [memint] def _FpI16m32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i16mem:$src2), OneArgFPRW, - [(set RFP32:$dst, (OpNode RFP32:$src1, - (X86fild addr:$src2, i16)))]>; + [!if(Forward, + (set RFP32:$dst, + (OpNode RFP32:$src1, (X86fild addr:$src2, i16))), + (set RFP32:$dst, + (OpNode (X86fild addr:$src2, i16), RFP32:$src1)))]>; def _FpI32m32 : 
FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i32mem:$src2), OneArgFPRW, - [(set RFP32:$dst, (OpNode RFP32:$src1, - (X86fild addr:$src2, i32)))]>; + [!if(Forward, + (set RFP32:$dst, + (OpNode RFP32:$src1, (X86fild addr:$src2, i32))), + (set RFP32:$dst, + (OpNode (X86fild addr:$src2, i32), RFP32:$src1)))]>; def _FpI16m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i16mem:$src2), OneArgFPRW, - [(set RFP64:$dst, (OpNode RFP64:$src1, - (X86fild addr:$src2, i16)))]>; + [!if(Forward, + (set RFP64:$dst, + (OpNode RFP64:$src1, (X86fild addr:$src2, i16))), + (set RFP64:$dst, + (OpNode (X86fild addr:$src2, i16), RFP64:$src1)))]>; def _FpI32m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i32mem:$src2), OneArgFPRW, - [(set RFP64:$dst, (OpNode RFP64:$src1, - (X86fild addr:$src2, i32)))]>; + [!if(Forward, + (set RFP64:$dst, + (OpNode RFP64:$src1, (X86fild addr:$src2, i32))), + (set RFP64:$dst, + (OpNode (X86fild addr:$src2, i32), RFP64:$src1)))]>; def _FpI16m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, i16mem:$src2), - OneArgFPRW, - [(set RFP80:$dst, (OpNode RFP80:$src1, - (X86fild addr:$src2, i16)))]>; + OneArgFPRW, + [!if(Forward, + (set RFP80:$dst, + (OpNode RFP80:$src1, (X86fild addr:$src2, i16))), + (set RFP80:$dst, + (OpNode (X86fild addr:$src2, i16), RFP80:$src1)))]>; def _FpI32m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, i32mem:$src2), - OneArgFPRW, - [(set RFP80:$dst, (OpNode RFP80:$src1, - (X86fild addr:$src2, i32)))]>; + OneArgFPRW, + [!if(Forward, + (set RFP80:$dst, + (OpNode RFP80:$src1, (X86fild addr:$src2, i32))), + (set RFP80:$dst, + (OpNode (X86fild addr:$src2, i32), RFP80:$src1)))]>; +let mayLoad = 1 in def _FI16m : FPI<0xDE, fp, (outs), (ins i16mem:$src), - !strconcat("fi", asmstring, "{s}\t$src")> { - let mayLoad = 1; -} + !strconcat("fi", asmstring, "{s}\t$src")>; +let mayLoad = 1 in def _FI32m : FPI<0xDA, fp, (outs), (ins i32mem:$src), - !strconcat("fi", asmstring, "{l}\t$src")> { - let mayLoad = 1; -} + !strconcat("fi", asmstring, "{l}\t$src")>; } let Defs = [FPSW] in { @@ -213,14 +243,14 @@ defm DIV : FPBinary_rr; let SchedRW = [WriteFAddLd] in { defm ADD : FPBinary; defm SUB : FPBinary; -defm SUBR: FPBinary; +defm SUBR: FPBinary; } let SchedRW = [WriteFMulLd] in { defm MUL : FPBinary; } let SchedRW = [WriteFDivLd] in { defm DIV : FPBinary; -defm DIVR: FPBinary; +defm DIVR: FPBinary; } } @@ -306,7 +336,7 @@ def FCOMP64m : FPI<0xDC, MRM3m, (outs), (ins f64mem:$src), "fcomp{l}\t$src">; def FRSTORm : FPI<0xDD, MRM4m, (outs f32mem:$dst), (ins), "frstor\t$dst">; def FSAVEm : FPI<0xDD, MRM6m, (outs f32mem:$dst), (ins), "fnsave\t$dst">; -def FNSTSWm : FPI<0xDD, MRM7m, (outs f32mem:$dst), (ins), "fnstsw\t$dst">; +def FNSTSWm : FPI<0xDD, MRM7m, (outs i16mem:$dst), (ins), "fnstsw\t$dst">; def FICOM16m : FPI<0xDE, MRM2m, (outs), (ins i16mem:$src), "ficom{s}\t$src">; def FICOMP16m: FPI<0xDE, MRM3m, (outs), (ins i16mem:$src), "ficomp{s}\t$src">; diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td index 25f247e9d620..b456460a5bb5 100644 --- a/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -38,6 +38,8 @@ def bc_mmx : PatFrag<(ops node:$in), (x86mmx (bitconvert node:$in))>; def SDTX86VFCMP : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<1, 2>, SDTCisFP<1>, SDTCisVT<3, i8>, SDTCisVec<1>]>; +def SDTX86CmpTestSae : SDTypeProfile<1, 3, [SDTCisVT<0, i32>, + SDTCisSameAs<1, 2>, SDTCisInt<3>]>; def X86fmin : SDNode<"X86ISD::FMIN", SDTFPBinOp>; def X86fmax : SDNode<"X86ISD::FMAX", SDTFPBinOp>; @@ -66,7 +68,9 @@ 
def X86fhsub : SDNode<"X86ISD::FHSUB", SDTFPBinOp>; def X86hadd : SDNode<"X86ISD::HADD", SDTIntBinOp>; def X86hsub : SDNode<"X86ISD::HSUB", SDTIntBinOp>; def X86comi : SDNode<"X86ISD::COMI", SDTX86CmpTest>; +def X86comiSae : SDNode<"X86ISD::COMI", SDTX86CmpTestSae>; def X86ucomi : SDNode<"X86ISD::UCOMI", SDTX86CmpTest>; +def X86ucomiSae: SDNode<"X86ISD::UCOMI", SDTX86CmpTestSae>; def X86cmps : SDNode<"X86ISD::FSETCC", SDTX86Cmps>; //def X86cmpsd : SDNode<"X86ISD::FSETCCsd", SDTX86Cmpsd>; def X86cvtdq2pd: SDNode<"X86ISD::CVTDQ2PD", diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index aaeef465bf50..12da3a9319e6 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -3517,23 +3517,23 @@ unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands(MachineInstr *MI, bool IsIntrinOpcode; isFMA3(Opc, &IsIntrinOpcode); - unsigned GroupsNum; + size_t GroupsNum; const unsigned (*OpcodeGroups)[3]; if (IsIntrinOpcode) { - GroupsNum = sizeof(IntrinOpcodeGroups) / sizeof(IntrinOpcodeGroups[0]); + GroupsNum = array_lengthof(IntrinOpcodeGroups); OpcodeGroups = IntrinOpcodeGroups; } else { - GroupsNum = sizeof(RegularOpcodeGroups) / sizeof(RegularOpcodeGroups[0]); + GroupsNum = array_lengthof(RegularOpcodeGroups); OpcodeGroups = RegularOpcodeGroups; } const unsigned *FoundOpcodesGroup = nullptr; - unsigned FormIndex; + size_t FormIndex; // Look for the input opcode in the corresponding opcodes table. - unsigned GroupIndex = 0; - for (; GroupIndex < GroupsNum && !FoundOpcodesGroup; GroupIndex++) { - for (FormIndex = 0; FormIndex < FormsNum; FormIndex++) { + for (size_t GroupIndex = 0; GroupIndex < GroupsNum && !FoundOpcodesGroup; + ++GroupIndex) { + for (FormIndex = 0; FormIndex < FormsNum; ++FormIndex) { if (OpcodeGroups[GroupIndex][FormIndex] == Opc) { FoundOpcodesGroup = OpcodeGroups[GroupIndex]; break; @@ -6715,16 +6715,16 @@ static const uint16_t ReplaceableInstrsAVX2[][3] = { // domains, but they require a bit more work than just switching opcodes. 
static const uint16_t *lookup(unsigned opcode, unsigned domain) { - for (unsigned i = 0, e = array_lengthof(ReplaceableInstrs); i != e; ++i) - if (ReplaceableInstrs[i][domain-1] == opcode) - return ReplaceableInstrs[i]; + for (const uint16_t (&Row)[3] : ReplaceableInstrs) + if (Row[domain-1] == opcode) + return Row; return nullptr; } static const uint16_t *lookupAVX2(unsigned opcode, unsigned domain) { - for (unsigned i = 0, e = array_lengthof(ReplaceableInstrsAVX2); i != e; ++i) - if (ReplaceableInstrsAVX2[i][domain-1] == opcode) - return ReplaceableInstrsAVX2[i]; + for (const uint16_t (&Row)[3] : ReplaceableInstrsAVX2) + if (Row[domain-1] == opcode) + return Row; return nullptr; } diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h index 80e55d04c1f3..bb2f7248b0e9 100644 --- a/lib/Target/X86/X86IntrinsicsInfo.h +++ b/lib/Target/X86/X86IntrinsicsInfo.h @@ -20,7 +20,7 @@ enum IntrinsicType { INTR_NO_TYPE, GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, ADX, FPCLASS, FPCLASSS, INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_2OP_IMM8, INTR_TYPE_3OP, INTR_TYPE_4OP, - CMP_MASK, CMP_MASK_CC,CMP_MASK_SCALAR_CC, VSHIFT, VSHIFT_MASK, COMI, + CMP_MASK, CMP_MASK_CC,CMP_MASK_SCALAR_CC, VSHIFT, VSHIFT_MASK, COMI, COMI_RM, INTR_TYPE_1OP_MASK, INTR_TYPE_1OP_MASK_RM, INTR_TYPE_2OP_MASK, INTR_TYPE_2OP_MASK_RM, INTR_TYPE_2OP_IMM8_MASK, INTR_TYPE_3OP_MASK, INTR_TYPE_3OP_MASK_RM, INTR_TYPE_3OP_IMM8_MASK, @@ -1630,6 +1630,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_rsqrt28_ps, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0), X86_INTRINSIC_DATA(avx512_rsqrt28_sd, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28, 0), X86_INTRINSIC_DATA(avx512_rsqrt28_ss, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28, 0), + X86_INTRINSIC_DATA(avx512_vcomi_sd, COMI_RM, X86ISD::COMI, X86ISD::UCOMI), + X86_INTRINSIC_DATA(avx512_vcomi_ss, COMI_RM, X86ISD::COMI, X86ISD::UCOMI), X86_INTRINSIC_DATA(avx_hadd_pd_256, INTR_TYPE_2OP, X86ISD::FHADD, 0), X86_INTRINSIC_DATA(avx_hadd_ps_256, INTR_TYPE_2OP, X86ISD::FHADD, 0), X86_INTRINSIC_DATA(avx_hsub_pd_256, INTR_TYPE_2OP, X86ISD::FHSUB, 0), @@ -1821,6 +1823,95 @@ static void verifyIntrinsicTables() { "Intrinsic data tables should have unique entries"); } +// X86 specific compare constants. 
+// They must be kept in synch with avxintrin.h +#define _X86_CMP_EQ_OQ 0x00 /* Equal (ordered, non-signaling) */ +#define _X86_CMP_LT_OS 0x01 /* Less-than (ordered, signaling) */ +#define _X86_CMP_LE_OS 0x02 /* Less-than-or-equal (ordered, signaling) */ +#define _X86_CMP_UNORD_Q 0x03 /* Unordered (non-signaling) */ +#define _X86_CMP_NEQ_UQ 0x04 /* Not-equal (unordered, non-signaling) */ +#define _X86_CMP_NLT_US 0x05 /* Not-less-than (unordered, signaling) */ +#define _X86_CMP_NLE_US 0x06 /* Not-less-than-or-equal (unordered, signaling) */ +#define _X86_CMP_ORD_Q 0x07 /* Ordered (nonsignaling) */ +#define _X86_CMP_EQ_UQ 0x08 /* Equal (unordered, non-signaling) */ +#define _X86_CMP_NGE_US 0x09 /* Not-greater-than-or-equal (unord, signaling) */ +#define _X86_CMP_NGT_US 0x0a /* Not-greater-than (unordered, signaling) */ +#define _X86_CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling) */ +#define _X86_CMP_NEQ_OQ 0x0c /* Not-equal (ordered, non-signaling) */ +#define _X86_CMP_GE_OS 0x0d /* Greater-than-or-equal (ordered, signaling) */ +#define _X86_CMP_GT_OS 0x0e /* Greater-than (ordered, signaling) */ +#define _X86_CMP_TRUE_UQ 0x0f /* True (unordered, non-signaling) */ +#define _X86_CMP_EQ_OS 0x10 /* Equal (ordered, signaling) */ +#define _X86_CMP_LT_OQ 0x11 /* Less-than (ordered, non-signaling) */ +#define _X86_CMP_LE_OQ 0x12 /* Less-than-or-equal (ordered, non-signaling) */ +#define _X86_CMP_UNORD_S 0x13 /* Unordered (signaling) */ +#define _X86_CMP_NEQ_US 0x14 /* Not-equal (unordered, signaling) */ +#define _X86_CMP_NLT_UQ 0x15 /* Not-less-than (unordered, non-signaling) */ +#define _X86_CMP_NLE_UQ 0x16 /* Not-less-than-or-equal (unord, non-signaling) */ +#define _X86_CMP_ORD_S 0x17 /* Ordered (signaling) */ +#define _X86_CMP_EQ_US 0x18 /* Equal (unordered, signaling) */ +#define _X86_CMP_NGE_UQ 0x19 /* Not-greater-than-or-equal (unord, non-sign) */ +#define _X86_CMP_NGT_UQ 0x1a /* Not-greater-than (unordered, non-signaling) */ +#define _X86_CMP_FALSE_OS 0x1b /* False (ordered, signaling) */ +#define _X86_CMP_NEQ_OS 0x1c /* Not-equal (ordered, signaling) */ +#define _X86_CMP_GE_OQ 0x1d /* Greater-than-or-equal (ordered, non-signaling) */ +#define _X86_CMP_GT_OQ 0x1e /* Greater-than (ordered, non-signaling) */ +#define _X86_CMP_TRUE_US 0x1f /* True (unordered, signaling) */ + +/* +* Get comparison modifier from _mm_comi_round_sd/ss intrinsic +* Return tuple +*/ +static std::tuple TranslateX86ConstCondToX86CC(SDValue &imm) { + ConstantSDNode *CImm = dyn_cast(imm); + unsigned IntImm = CImm->getZExtValue(); + // On a floating point condition, the flags are set as follows: + // ZF PF CF op + // 0 | 0 | 0 | X > Y + // 0 | 0 | 1 | X < Y + // 1 | 0 | 0 | X == Y + // 1 | 1 | 1 | unordered + switch (IntImm) { + default: llvm_unreachable("Invalid floating point compare value for Comi!"); + case _X86_CMP_EQ_OQ: // 0x00 - Equal (ordered, nonsignaling) + case _X86_CMP_EQ_OS: // 0x10 - Equal (ordered, signaling) + return std::make_tuple(true, X86::COND_E); + case _X86_CMP_EQ_UQ: // 0x08 - Equal (unordered, non-signaling) + case _X86_CMP_EQ_US: // 0x18 - Equal (unordered, signaling) + return std::make_tuple(false , X86::COND_E); + case _X86_CMP_LT_OS: // 0x01 - Less-than (ordered, signaling) + case _X86_CMP_LT_OQ: // 0x11 - Less-than (ordered, nonsignaling) + return std::make_tuple(true, X86::COND_B); + case _X86_CMP_NGE_US: // 0x09 - Not-greater-than-or-equal (unordered, signaling) + case _X86_CMP_NGE_UQ: // 0x19 - Not-greater-than-or-equal (unordered, nonsignaling) + return std::make_tuple(false , 
X86::COND_B); + case _X86_CMP_LE_OS: // 0x02 - Less-than-or-equal (ordered, signaling) + case _X86_CMP_LE_OQ: // 0x12 - Less-than-or-equal (ordered, nonsignaling) + return std::make_tuple(true, X86::COND_BE); + case _X86_CMP_NGT_US: // 0x0A - Not-greater-than (unordered, signaling) + case _X86_CMP_NGT_UQ: // 0x1A - Not-greater-than (unordered, nonsignaling) + return std::make_tuple(false, X86::COND_BE); + case _X86_CMP_GT_OS: // 0x0E - Greater-than (ordered, signaling) + case _X86_CMP_GT_OQ: // 0x1E - Greater-than (ordered, nonsignaling) + return std::make_tuple(true, X86::COND_A); + case _X86_CMP_NLE_US: // 0x06 - Not-less-than-or-equal (unordered,signaling) + case _X86_CMP_NLE_UQ: // 0x16 - Not-less-than-or-equal (unordered, nonsignaling) + return std::make_tuple(false, X86::COND_A); + case _X86_CMP_GE_OS: // 0x0D - Greater-than-or-equal (ordered, signaling) + case _X86_CMP_GE_OQ: // 0x1D - Greater-than-or-equal (ordered, nonsignaling) + return std::make_tuple(true, X86::COND_AE); + case _X86_CMP_NLT_US: // 0x05 - Not-less-than (unordered, signaling) + case _X86_CMP_NLT_UQ: // 0x15 - Not-less-than (unordered, nonsignaling) + return std::make_tuple(false, X86::COND_AE); + case _X86_CMP_NEQ_OQ: // 0x0C - Not-equal (ordered, non-signaling) + case _X86_CMP_NEQ_OS: // 0x1C - Not-equal (ordered, signaling) + return std::make_tuple(true, X86::COND_NE); + case _X86_CMP_NEQ_UQ: // 0x04 - Not-equal (unordered, nonsignaling) + case _X86_CMP_NEQ_US: // 0x14 - Not-equal (unordered, signaling) + return std::make_tuple(false, X86::COND_NE); + } +} + } // End llvm namespace #endif diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td index 12f38c7946a8..ceeb57d0cc4c 100644 --- a/lib/Target/X86/X86RegisterInfo.td +++ b/lib/Target/X86/X86RegisterInfo.td @@ -375,7 +375,7 @@ def GR32_TC : RegisterClass<"X86", [i32], 32, (add EAX, ECX, EDX)>; def GR64_TC : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, RSI, RDI, R8, R9, R11, RIP)>; def GR64_TCW64 : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, - R8, R9, R10, R11)>; + R8, R9, R10, R11, RIP)>; // GR8_NOREX - GR8 registers which do not require a REX prefix. def GR8_NOREX : RegisterClass<"X86", [i8], 8, diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index 5d6088439578..fd896c2857f6 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -354,9 +354,11 @@ class X86Subtarget final : public X86GenSubtargetInfo { bool hasXSAVEC() const { return HasXSAVEC; } bool hasXSAVES() const { return HasXSAVES; } bool hasPCLMUL() const { return HasPCLMUL; } - bool hasFMA() const { return HasFMA; } - // FIXME: Favor FMA when both are enabled. Is this the right thing to do? - bool hasFMA4() const { return HasFMA4 && !HasFMA; } + // Prefer FMA4 to FMA - its better for commutation/memory folding and + // has equal or better performance on all supported targets. 
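  // Illustrative outcome of the accessors below for a subtarget that reports
  // both feature bits (HasFMA == HasFMA4 == true): hasFMA() -> false,
  // hasFMA4() -> true, hasAnyFMA() -> true, so FMA4 patterns are selected
  // while FMA formation in general stays enabled.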
+ bool hasFMA() const { return HasFMA && !HasFMA4; } + bool hasFMA4() const { return HasFMA4; } + bool hasAnyFMA() const { return hasFMA() || hasFMA4() || hasAVX512(); } bool hasXOP() const { return HasXOP; } bool hasTBM() const { return HasTBM; } bool hasMOVBE() const { return HasMOVBE; } diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp index c648750b202e..cf7a826ea85d 100644 --- a/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/lib/Target/X86/X86TargetTransformInfo.cpp @@ -528,17 +528,31 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); - static const TypeConversionCostTblEntry AVX512ConversionTbl[] = { + static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = { + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, + { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 }, + { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 }, + { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 }, + { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 }, + + { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 }, + { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, 1 }, + { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, 1 }, + { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 1 }, + { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, 1 }, + { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, 1 }, + }; + + static const TypeConversionCostTblEntry AVX512FConversionTbl[] = { { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 1 }, { ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, 3 }, { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 1 }, - { ISD::FP_ROUND, MVT::v16f32, MVT::v8f64, 3 }, { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 1 }, { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 1 }, { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 1 }, { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 1 }, - { ISD::TRUNCATE, MVT::v16i32, MVT::v8i64, 4 }, // v16i1 -> v16i32 - load + broadcast { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, @@ -548,16 +562,46 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 1 }, { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 }, { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 }, - { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v16i32, 3 }, - { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v16i32, 3 }, + { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 1 }, + { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 1 }, + { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 1 }, { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 }, { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 2 }, { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 2 }, { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 }, { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 }, + { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i8, 2 }, { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 }, { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 }, + + { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 }, + { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 2 }, + { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 2 }, + { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 }, + { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, + { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 }, + { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 }, + { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 }, + { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i8, 2 }, + { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 2 }, + { ISD::UINT_TO_FP, 
MVT::v8f32, MVT::v8i16, 2 }, + { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i8, 2 }, + { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i16, 2 }, + { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 }, + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 2 }, + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 5 }, + { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 2 }, + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 }, + { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 12 }, + { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 26 }, + + { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 }, + { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 }, + { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 1 }, + { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, 1 }, }; static const TypeConversionCostTblEntry AVX2ConversionTbl[] = { @@ -693,12 +737,6 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { return LTSrc.first * Entry->Cost; } - if (ST->hasAVX512()) { - if (const auto *Entry = ConvertCostTableLookup(AVX512ConversionTbl, ISD, - LTDest.second, LTSrc.second)) - return Entry->Cost; - } - EVT SrcTy = TLI->getValueType(DL, Src); EVT DstTy = TLI->getValueType(DL, Dst); @@ -706,6 +744,18 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { if (!SrcTy.isSimple() || !DstTy.isSimple()) return BaseT::getCastInstrCost(Opcode, Dst, Src); + if (ST->hasDQI()) + if (const auto *Entry = ConvertCostTableLookup(AVX512DQConversionTbl, ISD, + DstTy.getSimpleVT(), + SrcTy.getSimpleVT())) + return Entry->Cost; + + if (ST->hasAVX512()) + if (const auto *Entry = ConvertCostTableLookup(AVX512FConversionTbl, ISD, + DstTy.getSimpleVT(), + SrcTy.getSimpleVT())) + return Entry->Cost; + if (ST->hasAVX2()) { if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD, DstTy.getSimpleVT(), diff --git a/lib/Target/X86/X86WinEHState.cpp b/lib/Target/X86/X86WinEHState.cpp index 8a5aa40bc7f2..0276f3969b48 100644 --- a/lib/Target/X86/X86WinEHState.cpp +++ b/lib/Target/X86/X86WinEHState.cpp @@ -15,7 +15,7 @@ //===----------------------------------------------------------------------===// #include "X86.h" -#include "llvm/Analysis/LibCallSemantics.h" +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/WinEHFuncInfo.h" diff --git a/lib/Transforms/IPO/FunctionImport.cpp b/lib/Transforms/IPO/FunctionImport.cpp index ab0f7114957b..c2359a8a172e 100644 --- a/lib/Transforms/IPO/FunctionImport.cpp +++ b/lib/Transforms/IPO/FunctionImport.cpp @@ -51,7 +51,7 @@ static std::unique_ptr loadFile(const std::string &FileName, } // Get a Module for \p FileName from the cache, or load it lazily. -Module &FunctionImporter::getOrLoadModule(StringRef FileName) { +Module &ModuleLazyLoaderCache::operator()(StringRef FileName) { auto &Module = ModuleMap[FileName]; if (!Module) Module = loadFile(FileName, Context); @@ -66,7 +66,6 @@ static void findExternalCalls(const Function &F, StringSet<> &CalledFunctions, for (auto &BB : F) { for (auto &I : BB) { if (isa(I)) { - DEBUG(dbgs() << "Found a call: '" << I << "'\n"); auto CalledFunction = cast(I).getCalledFunction(); // Insert any new external calls that have not already been // added to set/worklist. @@ -81,29 +80,14 @@ static void findExternalCalls(const Function &F, StringSet<> &CalledFunctions, } } -// Automatically import functions in Module \p M based on the summaries index. -// -// The current implementation imports every called functions that exists in the -// summaries index. 
-bool FunctionImporter::importFunctions(Module &M) { - assert(&Context == &M.getContext()); - - bool Changed = false; - - /// First step is collecting the called external functions. - StringSet<> CalledFunctions; - SmallVector Worklist; - for (auto &F : M) { - if (F.isDeclaration() || F.hasFnAttribute(Attribute::OptimizeNone)) - continue; - findExternalCalls(F, CalledFunctions, Worklist); - } - - /// Second step: for every call to an external function, try to import it. - - // Linker that will be used for importing function - Linker L(&M, DiagnosticHandler); - +// Helper function: given a worklist and an index, will process all the worklist +// and import them based on the summary information +static unsigned ProcessImportWorklist( + Module &DestModule, SmallVector &Worklist, + StringSet<> &CalledFunctions, Linker &TheLinker, + const FunctionInfoIndex &Index, + std::function &LazyModuleLoader) { + unsigned ImportCount = 0; while (!Worklist.empty()) { auto CalledFunctionName = Worklist.pop_back_val(); DEBUG(dbgs() << "Process import for " << CalledFunctionName << "\n"); @@ -124,35 +108,32 @@ bool FunctionImporter::importFunctions(Module &M) { auto *Summary = Info->functionSummary(); if (!Summary) { // FIXME: in case we are lazyloading summaries, we can do it now. - dbgs() << "Missing summary for " << CalledFunctionName - << ", error at import?\n"; + DEBUG(dbgs() << "Missing summary for " << CalledFunctionName + << ", error at import?\n"); llvm_unreachable("Missing summary"); } if (Summary->instCount() > ImportInstrLimit) { - dbgs() << "Skip import of " << CalledFunctionName << " with " - << Summary->instCount() << " instructions (limit " - << ImportInstrLimit << ")\n"; + DEBUG(dbgs() << "Skip import of " << CalledFunctionName << " with " + << Summary->instCount() << " instructions (limit " + << ImportInstrLimit << ")\n"); continue; } - // - // No profitability notion right now, just import all the time... - // - // Get the module path from the summary. auto FileName = Summary->modulePath(); DEBUG(dbgs() << "Importing " << CalledFunctionName << " from " << FileName << "\n"); // Get the module for the import (potentially from the cache). - auto &Module = getOrLoadModule(FileName); + auto &Module = LazyModuleLoader(FileName); + assert(&Module.getContext() == &DestModule.getContext()); // The function that we will import! GlobalValue *SGV = Module.getNamedValue(CalledFunctionName); StringRef ImportFunctionName = CalledFunctionName; if (!SGV) { - // Might be local in source Module, promoted/renamed in dest Module M. + // Might be local in source Module, promoted/renamed in DestModule. std::pair Split = CalledFunctionName.split(".llvm."); SGV = Module.getNamedValue(Split.first); @@ -186,19 +167,55 @@ bool FunctionImporter::importFunctions(Module &M) { } // Link in the specified function. - if (L.linkInModule(&Module, Linker::Flags::None, &Index, F)) + DenseSet FunctionsToImport; + FunctionsToImport.insert(F); + if (TheLinker.linkInModule(Module, Linker::Flags::None, &Index, + &FunctionsToImport)) report_fatal_error("Function Import: link error"); // Process the newly imported function and add callees to the worklist. - GlobalValue *NewGV = M.getNamedValue(ImportFunctionName); + GlobalValue *NewGV = DestModule.getNamedValue(ImportFunctionName); assert(NewGV); Function *NewF = dyn_cast(NewGV); assert(NewF); findExternalCalls(*NewF, CalledFunctions, Worklist); + ++ImportCount; + } + return ImportCount; +} + +// Automatically import functions in Module \p DestModule based on the summaries +// index. 
+// +// The current implementation imports every called functions that exists in the +// summaries index. +bool FunctionImporter::importFunctions(Module &DestModule) { + DEBUG(errs() << "Starting import for Module " + << DestModule.getModuleIdentifier() << "\n"); + unsigned ImportedCount = 0; - Changed = true; + /// First step is collecting the called external functions. + StringSet<> CalledFunctions; + SmallVector Worklist; + for (auto &F : DestModule) { + if (F.isDeclaration() || F.hasFnAttribute(Attribute::OptimizeNone)) + continue; + findExternalCalls(F, CalledFunctions, Worklist); } - return Changed; + if (Worklist.empty()) + return false; + + /// Second step: for every call to an external function, try to import it. + + // Linker that will be used for importing function + Linker TheLinker(DestModule, DiagnosticHandler); + + ImportedCount += ProcessImportWorklist(DestModule, Worklist, CalledFunctions, + TheLinker, Index, getLazyModule); + + DEBUG(errs() << "Imported " << ImportedCount << " functions for Module " + << DestModule.getModuleIdentifier() << "\n"); + return ImportedCount; } /// Summary file to use for function importing when using -function-import from @@ -259,7 +276,10 @@ class FunctionImportPass : public ModulePass { } // Perform the import now. - FunctionImporter Importer(M.getContext(), *Index, diagnosticHandler); + ModuleLazyLoaderCache Loader(M.getContext()); + FunctionImporter Importer(*Index, diagnosticHandler, + [&](StringRef Name) + -> Module &{ return Loader(Name); }); return Importer.importFunctions(M); return false; diff --git a/lib/Transforms/IPO/PruneEH.cpp b/lib/Transforms/IPO/PruneEH.cpp index a0b740563693..714e1d6e42d2 100644 --- a/lib/Transforms/IPO/PruneEH.cpp +++ b/lib/Transforms/IPO/PruneEH.cpp @@ -21,7 +21,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/CallGraphSCCPass.h" -#include "llvm/Analysis/LibCallSemantics.h" +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index f72089e6c8ef..2bf6faa47b93 100644 --- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -1962,14 +1962,14 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, case ICmpInst::ICMP_EQ: if (LHS->getOperand(0) == RHS->getOperand(0)) { // if LHSCst and RHSCst differ only by one bit: - // (A == C1 || A == C2) -> (A & ~(C1 ^ C2)) == C1 + // (A == C1 || A == C2) -> (A | (C1 ^ C2)) == C2 assert(LHSCst->getValue().ule(LHSCst->getValue())); APInt Xor = LHSCst->getValue() ^ RHSCst->getValue(); if (Xor.isPowerOf2()) { - Value *NegCst = Builder->getInt(~Xor); - Value *And = Builder->CreateAnd(LHS->getOperand(0), NegCst); - return Builder->CreateICmp(ICmpInst::ICMP_EQ, And, LHSCst); + Value *Cst = Builder->getInt(Xor); + Value *Or = Builder->CreateOr(LHS->getOperand(0), Cst); + return Builder->CreateICmp(ICmpInst::ICMP_EQ, Or, RHSCst); } } diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp index 26088bbe018a..d2341c8c05db 100644 --- a/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -1942,20 +1942,27 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) { // Mark any parameters that are known to be non-null with the nonnull // attribute. 
This is helpful for inlining calls to functions with null // checks on their arguments. + SmallVector Indices; unsigned ArgNo = 0; + for (Value *V : CS.args()) { if (V->getType()->isPointerTy() && !CS.paramHasAttr(ArgNo+1, Attribute::NonNull) && - isKnownNonNullAt(V, CS.getInstruction(), DT, TLI)) { - AttributeSet AS = CS.getAttributes(); - AS = AS.addAttribute(CS.getInstruction()->getContext(), ArgNo+1, - Attribute::NonNull); - CS.setAttributes(AS); - Changed = true; - } + isKnownNonNullAt(V, CS.getInstruction(), DT, TLI)) + Indices.push_back(ArgNo + 1); ArgNo++; } + assert(ArgNo == CS.arg_size() && "sanity check"); + if (!Indices.empty()) { + AttributeSet AS = CS.getAttributes(); + LLVMContext &Ctx = CS.getInstruction()->getContext(); + AS = AS.addAttribute(Ctx, Indices, + Attribute::get(Ctx, Attribute::NonNull)); + CS.setAttributes(AS); + Changed = true; + } + // If the callee is a pointer to a function, attempt to move any casts to the // arguments of the call/invoke. Value *Callee = CS.getCalledValue(); diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp index e95a65510ecc..74c5148f9f89 100644 --- a/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -42,9 +42,9 @@ #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/InstructionSimplify.h" -#include "llvm/Analysis/LibCallSemantics.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/TargetLibraryInfo.h" diff --git a/lib/Transforms/Instrumentation/SafeStack.cpp b/lib/Transforms/Instrumentation/SafeStack.cpp index 6071ca5a8754..4441663fc6de 100644 --- a/lib/Transforms/Instrumentation/SafeStack.cpp +++ b/lib/Transforms/Instrumentation/SafeStack.cpp @@ -57,6 +57,7 @@ STATISTIC(NumUnsafeStackRestorePointsFunctions, STATISTIC(NumAllocas, "Total number of allocas"); STATISTIC(NumUnsafeStaticAllocas, "Number of unsafe static allocas"); STATISTIC(NumUnsafeDynamicAllocas, "Number of unsafe dynamic allocas"); +STATISTIC(NumUnsafeByValArguments, "Number of unsafe byval arguments"); STATISTIC(NumUnsafeStackRestorePoints, "Number of setjmps and landingpads"); } // namespace llvm @@ -68,14 +69,14 @@ namespace { /// /// The implementation simply replaces all mentions of the alloca with zero. class AllocaOffsetRewriter : public SCEVRewriteVisitor { - const AllocaInst *AI; + const Value *AllocaPtr; public: - AllocaOffsetRewriter(ScalarEvolution &SE, const AllocaInst *AI) - : SCEVRewriteVisitor(SE), AI(AI) {} + AllocaOffsetRewriter(ScalarEvolution &SE, const Value *AllocaPtr) + : SCEVRewriteVisitor(SE), AllocaPtr(AllocaPtr) {} const SCEV *visitUnknown(const SCEVUnknown *Expr) { - if (Expr->getValue() == AI) + if (Expr->getValue() == AllocaPtr) return SE.getZero(Expr->getType()); return Expr; } @@ -115,9 +116,14 @@ class SafeStack : public FunctionPass { /// given function and append them to the respective vectors. void findInsts(Function &F, SmallVectorImpl &StaticAllocas, SmallVectorImpl &DynamicAllocas, + SmallVectorImpl &ByValArguments, SmallVectorImpl &Returns, SmallVectorImpl &StackRestorePoints); + /// \brief Calculate the allocation size of a given alloca. Returns 0 if the + /// size can not be statically determined. 
+ uint64_t getStaticAllocaAllocationSize(const AllocaInst* AI); + /// \brief Allocate space for all static allocas in \p StaticAllocas, /// replace allocas with pointers into the unsafe stack and generate code to /// restore the stack pointer before all return instructions in \p Returns. @@ -126,6 +132,7 @@ class SafeStack : public FunctionPass { /// allocas are allocated. Value *moveStaticAllocasToUnsafeStack(IRBuilder<> &IRB, Function &F, ArrayRef StaticAllocas, + ArrayRef ByValArguments, ArrayRef Returns); /// \brief Generate code to restore the stack after all stack restore points @@ -145,11 +152,12 @@ class SafeStack : public FunctionPass { AllocaInst *DynamicTop, ArrayRef DynamicAllocas); - bool IsSafeStackAlloca(const AllocaInst *AI); + bool IsSafeStackAlloca(const Value *AllocaPtr, uint64_t AllocaSize); bool IsMemIntrinsicSafe(const MemIntrinsic *MI, const Use &U, - const AllocaInst *AI); - bool IsAccessSafe(Value *Addr, uint64_t Size, const AllocaInst *AI); + const Value *AllocaPtr, uint64_t AllocaSize); + bool IsAccessSafe(Value *Addr, uint64_t Size, const Value *AllocaPtr, + uint64_t AllocaSize); public: static char ID; // Pass identification, replacement for typeid. @@ -177,21 +185,34 @@ class SafeStack : public FunctionPass { bool runOnFunction(Function &F) override; }; // class SafeStack -bool SafeStack::IsAccessSafe(Value *Addr, uint64_t Size, const AllocaInst *AI) { - AllocaOffsetRewriter Rewriter(*SE, AI); +uint64_t SafeStack::getStaticAllocaAllocationSize(const AllocaInst* AI) { + uint64_t Size = DL->getTypeAllocSize(AI->getAllocatedType()); + if (AI->isArrayAllocation()) { + auto C = dyn_cast(AI->getArraySize()); + if (!C) + return 0; + Size *= C->getZExtValue(); + } + return Size; +} + +bool SafeStack::IsAccessSafe(Value *Addr, uint64_t AccessSize, + const Value *AllocaPtr, uint64_t AllocaSize) { + AllocaOffsetRewriter Rewriter(*SE, AllocaPtr); const SCEV *Expr = Rewriter.visit(SE->getSCEV(Addr)); uint64_t BitWidth = SE->getTypeSizeInBits(Expr->getType()); ConstantRange AccessStartRange = SE->getUnsignedRange(Expr); ConstantRange SizeRange = - ConstantRange(APInt(BitWidth, 0), APInt(BitWidth, Size)); + ConstantRange(APInt(BitWidth, 0), APInt(BitWidth, AccessSize)); ConstantRange AccessRange = AccessStartRange.add(SizeRange); - ConstantRange AllocaRange = ConstantRange( - APInt(BitWidth, 0), - APInt(BitWidth, DL->getTypeStoreSize(AI->getAllocatedType()))); + ConstantRange AllocaRange = + ConstantRange(APInt(BitWidth, 0), APInt(BitWidth, AllocaSize)); bool Safe = AllocaRange.contains(AccessRange); - DEBUG(dbgs() << "[SafeStack] Alloca " << *AI << "\n" + DEBUG(dbgs() << "[SafeStack] " + << (isa(AllocaPtr) ? "Alloca " : "ByValArgument ") + << *AllocaPtr << "\n" << " Access " << *Addr << "\n" << " SCEV " << *Expr << " U: " << SE->getUnsignedRange(Expr) @@ -204,36 +225,38 @@ bool SafeStack::IsAccessSafe(Value *Addr, uint64_t Size, const AllocaInst *AI) { } bool SafeStack::IsMemIntrinsicSafe(const MemIntrinsic *MI, const Use &U, - const AllocaInst *AI) { + const Value *AllocaPtr, + uint64_t AllocaSize) { // All MemIntrinsics have destination address in Arg0 and size in Arg2. if (MI->getRawDest() != U) return true; const auto *Len = dyn_cast(MI->getLength()); // Non-constant size => unsafe. FIXME: try SCEV getRange. 
if (!Len) return false; - return IsAccessSafe(U, Len->getZExtValue(), AI); + return IsAccessSafe(U, Len->getZExtValue(), AllocaPtr, AllocaSize); } -/// Check whether a given alloca instruction (AI) should be put on the safe +/// Check whether a given allocation must be put on the safe /// stack or not. The function analyzes all uses of AI and checks whether it is /// only accessed in a memory safe way (as decided statically). -bool SafeStack::IsSafeStackAlloca(const AllocaInst *AI) { +bool SafeStack::IsSafeStackAlloca(const Value *AllocaPtr, uint64_t AllocaSize) { // Go through all uses of this alloca and check whether all accesses to the // allocated object are statically known to be memory safe and, hence, the // object can be placed on the safe stack. SmallPtrSet Visited; - SmallVector WorkList; - WorkList.push_back(AI); + SmallVector WorkList; + WorkList.push_back(AllocaPtr); // A DFS search through all uses of the alloca in bitcasts/PHI/GEPs/etc. while (!WorkList.empty()) { - const Instruction *V = WorkList.pop_back_val(); + const Value *V = WorkList.pop_back_val(); for (const Use &UI : V->uses()) { auto I = cast(UI.getUser()); assert(V == UI.get()); switch (I->getOpcode()) { case Instruction::Load: { - if (!IsAccessSafe(UI, DL->getTypeStoreSize(I->getType()), AI)) + if (!IsAccessSafe(UI, DL->getTypeStoreSize(I->getType()), AllocaPtr, + AllocaSize)) return false; break; } @@ -243,13 +266,13 @@ bool SafeStack::IsSafeStackAlloca(const AllocaInst *AI) { case Instruction::Store: { if (V == I->getOperand(0)) { // Stored the pointer - conservatively assume it may be unsafe. - DEBUG(dbgs() << "[SafeStack] Unsafe alloca: " << *AI + DEBUG(dbgs() << "[SafeStack] Unsafe alloca: " << *AllocaPtr << "\n store of address: " << *I << "\n"); return false; } - if (!IsAccessSafe( - UI, DL->getTypeStoreSize(I->getOperand(0)->getType()), AI)) + if (!IsAccessSafe(UI, DL->getTypeStoreSize(I->getOperand(0)->getType()), + AllocaPtr, AllocaSize)) return false; break; } @@ -269,8 +292,8 @@ bool SafeStack::IsSafeStackAlloca(const AllocaInst *AI) { } if (const MemIntrinsic *MI = dyn_cast(I)) { - if (!IsMemIntrinsicSafe(MI, UI, AI)) { - DEBUG(dbgs() << "[SafeStack] Unsafe alloca: " << *AI + if (!IsMemIntrinsicSafe(MI, UI, AllocaPtr, AllocaSize)) { + DEBUG(dbgs() << "[SafeStack] Unsafe alloca: " << *AllocaPtr << "\n unsafe memintrinsic: " << *I << "\n"); return false; @@ -288,9 +311,9 @@ bool SafeStack::IsSafeStackAlloca(const AllocaInst *AI) { ImmutableCallSite::arg_iterator B = CS.arg_begin(), E = CS.arg_end(); for (ImmutableCallSite::arg_iterator A = B; A != E; ++A) if (A->get() == V) - if (!(CS.doesNotCapture(A - B) && - (CS.doesNotAccessMemory(A - B) || CS.doesNotAccessMemory()))) { - DEBUG(dbgs() << "[SafeStack] Unsafe alloca: " << *AI + if (!(CS.doesNotCapture(A - B) && (CS.doesNotAccessMemory(A - B) || + CS.doesNotAccessMemory()))) { + DEBUG(dbgs() << "[SafeStack] Unsafe alloca: " << *AllocaPtr << "\n unsafe call: " << *I << "\n"); return false; } @@ -341,13 +364,15 @@ Value *SafeStack::getOrCreateUnsafeStackPtr(IRBuilder<> &IRB, Function &F) { void SafeStack::findInsts(Function &F, SmallVectorImpl &StaticAllocas, SmallVectorImpl &DynamicAllocas, + SmallVectorImpl &ByValArguments, SmallVectorImpl &Returns, SmallVectorImpl &StackRestorePoints) { for (Instruction &I : instructions(&F)) { if (auto AI = dyn_cast(&I)) { ++NumAllocas; - if (IsSafeStackAlloca(AI)) + uint64_t Size = getStaticAllocaAllocationSize(AI); + if (IsSafeStackAlloca(AI, Size)) continue; if (AI->isStaticAlloca()) { @@ -372,6 +397,17 @@ void 
SafeStack::findInsts(Function &F, "gcroot intrinsic not compatible with safestack attribute"); } } + for (Argument &Arg : F.args()) { + if (!Arg.hasByValAttr()) + continue; + uint64_t Size = + DL->getTypeStoreSize(Arg.getType()->getPointerElementType()); + if (IsSafeStackAlloca(&Arg, Size)) + continue; + + ++NumUnsafeByValArguments; + ByValArguments.push_back(&Arg); + } } AllocaInst * @@ -406,7 +442,7 @@ SafeStack::createStackRestorePoints(IRBuilder<> &IRB, Function &F, for (Instruction *I : StackRestorePoints) { ++NumUnsafeStackRestorePoints; - IRB.SetInsertPoint(cast(I->getNextNode())); + IRB.SetInsertPoint(I->getNextNode()); Value *CurrentTop = DynamicTop ? IRB.CreateLoad(DynamicTop) : StaticTop; IRB.CreateStore(CurrentTop, UnsafeStackPtr); } @@ -414,11 +450,10 @@ SafeStack::createStackRestorePoints(IRBuilder<> &IRB, Function &F, return DynamicTop; } -Value * -SafeStack::moveStaticAllocasToUnsafeStack(IRBuilder<> &IRB, Function &F, - ArrayRef StaticAllocas, - ArrayRef Returns) { - if (StaticAllocas.empty()) +Value *SafeStack::moveStaticAllocasToUnsafeStack( + IRBuilder<> &IRB, Function &F, ArrayRef StaticAllocas, + ArrayRef ByValArguments, ArrayRef Returns) { + if (StaticAllocas.empty() && ByValArguments.empty()) return nullptr; DIBuilder DIB(*F.getParent()); @@ -440,6 +475,13 @@ SafeStack::moveStaticAllocasToUnsafeStack(IRBuilder<> &IRB, Function &F, // Compute maximum alignment among static objects on the unsafe stack. unsigned MaxAlignment = 0; + for (Argument *Arg : ByValArguments) { + Type *Ty = Arg->getType()->getPointerElementType(); + unsigned Align = std::max((unsigned)DL->getPrefTypeAlignment(Ty), + Arg->getParamAlignment()); + if (Align > MaxAlignment) + MaxAlignment = Align; + } for (AllocaInst *AI : StaticAllocas) { Type *Ty = AI->getAllocatedType(); unsigned Align = @@ -451,22 +493,51 @@ SafeStack::moveStaticAllocasToUnsafeStack(IRBuilder<> &IRB, Function &F, if (MaxAlignment > StackAlignment) { // Re-align the base pointer according to the max requested alignment. assert(isPowerOf2_32(MaxAlignment)); - IRB.SetInsertPoint(cast(BasePointer->getNextNode())); + IRB.SetInsertPoint(BasePointer->getNextNode()); BasePointer = cast(IRB.CreateIntToPtr( IRB.CreateAnd(IRB.CreatePtrToInt(BasePointer, IntPtrTy), ConstantInt::get(IntPtrTy, ~uint64_t(MaxAlignment - 1))), StackPtrTy)); } - // Allocate space for every unsafe static AllocaInst on the unsafe stack. int64_t StaticOffset = 0; // Current stack top. + IRB.SetInsertPoint(BasePointer->getNextNode()); + + for (Argument *Arg : ByValArguments) { + Type *Ty = Arg->getType()->getPointerElementType(); + + uint64_t Size = DL->getTypeStoreSize(Ty); + if (Size == 0) + Size = 1; // Don't create zero-sized stack objects. + + // Ensure the object is properly aligned. + unsigned Align = std::max((unsigned)DL->getPrefTypeAlignment(Ty), + Arg->getParamAlignment()); + + // Add alignment. + // NOTE: we ensure that BasePointer itself is aligned to >= Align. + StaticOffset += Size; + StaticOffset = RoundUpToAlignment(StaticOffset, Align); + + Value *Off = IRB.CreateGEP(BasePointer, // BasePointer is i8* + ConstantInt::get(Int32Ty, -StaticOffset)); + Value *NewArg = IRB.CreateBitCast(Off, Arg->getType(), + Arg->getName() + ".unsafe-byval"); + + // Replace alloc with the new location. 
+ replaceDbgDeclare(Arg, BasePointer, BasePointer->getNextNode(), DIB, + /*Deref=*/true, -StaticOffset); + Arg->replaceAllUsesWith(NewArg); + IRB.SetInsertPoint(cast(NewArg)->getNextNode()); + IRB.CreateMemCpy(Off, Arg, Size, Arg->getParamAlignment()); + } + + // Allocate space for every unsafe static AllocaInst on the unsafe stack. for (AllocaInst *AI : StaticAllocas) { IRB.SetInsertPoint(AI); - auto CArraySize = cast(AI->getArraySize()); Type *Ty = AI->getAllocatedType(); - - uint64_t Size = DL->getTypeAllocSize(Ty) * CArraySize->getZExtValue(); + uint64_t Size = getStaticAllocaAllocationSize(AI); if (Size == 0) Size = 1; // Don't create zero-sized stack objects. @@ -497,7 +568,7 @@ SafeStack::moveStaticAllocasToUnsafeStack(IRBuilder<> &IRB, Function &F, StaticOffset = RoundUpToAlignment(StaticOffset, StackAlignment); // Update shadow stack pointer in the function epilogue. - IRB.SetInsertPoint(cast(BasePointer->getNextNode())); + IRB.SetInsertPoint(BasePointer->getNextNode()); Value *StaticTop = IRB.CreateGEP(BasePointer, ConstantInt::get(Int32Ty, -StaticOffset), @@ -609,6 +680,7 @@ bool SafeStack::runOnFunction(Function &F) { SmallVector StaticAllocas; SmallVector DynamicAllocas; + SmallVector ByValArguments; SmallVector Returns; // Collect all points where stack gets unwound and needs to be restored @@ -620,13 +692,15 @@ bool SafeStack::runOnFunction(Function &F) { // Find all static and dynamic alloca instructions that must be moved to the // unsafe stack, all return instructions and stack restore points. - findInsts(F, StaticAllocas, DynamicAllocas, Returns, StackRestorePoints); + findInsts(F, StaticAllocas, DynamicAllocas, ByValArguments, Returns, + StackRestorePoints); if (StaticAllocas.empty() && DynamicAllocas.empty() && - StackRestorePoints.empty()) + ByValArguments.empty() && StackRestorePoints.empty()) return false; // Nothing to do in this function. - if (!StaticAllocas.empty() || !DynamicAllocas.empty()) + if (!StaticAllocas.empty() || !DynamicAllocas.empty() || + !ByValArguments.empty()) ++NumUnsafeStackFunctions; // This function has the unsafe stack. if (!StackRestorePoints.empty()) @@ -636,7 +710,8 @@ bool SafeStack::runOnFunction(Function &F) { UnsafeStackPtr = getOrCreateUnsafeStackPtr(IRB, F); // The top of the unsafe stack after all unsafe static allocas are allocated. - Value *StaticTop = moveStaticAllocasToUnsafeStack(IRB, F, StaticAllocas, Returns); + Value *StaticTop = moveStaticAllocasToUnsafeStack(IRB, F, StaticAllocas, + ByValArguments, Returns); // Safe stack object that stores the current unsafe stack top. It is updated // as unsafe dynamic (non-constant-sized) allocas are allocated and freed. 
diff --git a/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/lib/Transforms/Instrumentation/SanitizerCoverage.cpp index 3430ee01035e..cbdcc5e6cfe1 100644 --- a/lib/Transforms/Instrumentation/SanitizerCoverage.cpp +++ b/lib/Transforms/Instrumentation/SanitizerCoverage.cpp @@ -31,7 +31,7 @@ #include "llvm/Transforms/Instrumentation.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/Analysis/LibCallSemantics.h" +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugInfo.h" @@ -156,7 +156,9 @@ class SanitizerCoverageModule : public ModulePass { void SetNoSanitizeMetadata(Instruction *I); void InjectCoverageAtBlock(Function &F, BasicBlock &BB, bool UseCalls); unsigned NumberOfInstrumentedBlocks() { - return SanCovFunction->getNumUses() + SanCovWithCheckFunction->getNumUses(); + return SanCovFunction->getNumUses() + + SanCovWithCheckFunction->getNumUses() + SanCovTraceBB->getNumUses() + + SanCovTraceEnter->getNumUses(); } Function *SanCovFunction; Function *SanCovWithCheckFunction; @@ -211,12 +213,10 @@ bool SanitizerCoverageModule::runOnModule(Module &M) { StringRef(""), StringRef(""), /*hasSideEffects=*/true); - if (Options.TraceBB) { - SanCovTraceEnter = checkSanitizerInterfaceFunction( - M.getOrInsertFunction(kSanCovTraceEnter, VoidTy, Int32PtrTy, nullptr)); - SanCovTraceBB = checkSanitizerInterfaceFunction( - M.getOrInsertFunction(kSanCovTraceBB, VoidTy, Int32PtrTy, nullptr)); - } + SanCovTraceEnter = checkSanitizerInterfaceFunction( + M.getOrInsertFunction(kSanCovTraceEnter, VoidTy, Int32PtrTy, nullptr)); + SanCovTraceBB = checkSanitizerInterfaceFunction( + M.getOrInsertFunction(kSanCovTraceBB, VoidTy, Int32PtrTy, nullptr)); // At this point we create a dummy array of guards because we don't // know how many elements we will need. @@ -466,7 +466,9 @@ void SanitizerCoverageModule::InjectCoverageAtBlock(Function &F, BasicBlock &BB, ConstantInt::get(IntptrTy, (1 + NumberOfInstrumentedBlocks()) * 4)); Type *Int32PtrTy = PointerType::getUnqual(IRB.getInt32Ty()); GuardP = IRB.CreateIntToPtr(GuardP, Int32PtrTy); - if (UseCalls) { + if (Options.TraceBB) { + IRB.CreateCall(IsEntryBB ? SanCovTraceEnter : SanCovTraceBB, GuardP); + } else if (UseCalls) { IRB.CreateCall(SanCovWithCheckFunction, GuardP); } else { LoadInst *Load = IRB.CreateLoad(GuardP); @@ -495,13 +497,6 @@ void SanitizerCoverageModule::InjectCoverageAtBlock(Function &F, BasicBlock &BB, SetNoSanitizeMetadata(LI); SetNoSanitizeMetadata(SI); } - - if (Options.TraceBB) { - // Experimental support for tracing. - // Insert a callback with the same guard variable as used for coverage. - IRB.SetInsertPoint(&*IP); - IRB.CreateCall(IsEntryBB ? SanCovTraceEnter : SanCovTraceBB, GuardP); - } } char SanitizerCoverageModule::ID = 0; diff --git a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp index d7e02b16a28e..686bd4071104 100644 --- a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp +++ b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @@ -307,27 +307,31 @@ bool CorrelatedValuePropagation::processSwitch(SwitchInst *SI) { /// processCallSite - Infer nonnull attributes for the arguments at the /// specified callsite. 
bool CorrelatedValuePropagation::processCallSite(CallSite CS) { - bool Changed = false; - + SmallVector Indices; unsigned ArgNo = 0; + for (Value *V : CS.args()) { PointerType *Type = dyn_cast(V->getType()); if (Type && !CS.paramHasAttr(ArgNo + 1, Attribute::NonNull) && LVI->getPredicateAt(ICmpInst::ICMP_EQ, V, ConstantPointerNull::get(Type), - CS.getInstruction()) == LazyValueInfo::False) { - AttributeSet AS = CS.getAttributes(); - AS = AS.addAttribute(CS.getInstruction()->getContext(), ArgNo + 1, - Attribute::NonNull); - CS.setAttributes(AS); - Changed = true; - } + CS.getInstruction()) == LazyValueInfo::False) + Indices.push_back(ArgNo + 1); ArgNo++; } + assert(ArgNo == CS.arg_size() && "sanity check"); - return Changed; + if (Indices.empty()) + return false; + + AttributeSet AS = CS.getAttributes(); + LLVMContext &Ctx = CS.getInstruction()->getContext(); + AS = AS.addAttribute(Ctx, Indices, Attribute::get(Ctx, Attribute::NonNull)); + CS.setAttributes(AS); + + return true; } Constant *CorrelatedValuePropagation::getConstantAt(Value *V, Instruction *At) { diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index a3658ed64976..9dc41ba2f328 100644 --- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -257,6 +257,10 @@ static unsigned getStoreStride(const SCEVAddRecExpr *StoreEv) { } bool LoopIdiomRecognize::isLegalStore(StoreInst *SI) { + // Don't touch volatile stores. + if (!SI->isSimple()) + return false; + Value *StoredVal = SI->getValueOperand(); Value *StorePtr = SI->getPointerOperand(); @@ -287,10 +291,6 @@ void LoopIdiomRecognize::collectStores(BasicBlock *BB) { if (!SI) continue; - // Don't touch volatile stores. - if (!SI->isSimple()) - continue; - // Make sure this is a strided store with a constant stride. if (!isLegalStore(SI)) continue; diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp index 0bd5fa9f8777..ba79b32ac3d5 100644 --- a/lib/Transforms/Utils/Local.cpp +++ b/lib/Transforms/Utils/Local.cpp @@ -20,8 +20,8 @@ #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/Analysis/InstructionSimplify.h" -#include "llvm/Analysis/LibCallSemantics.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/CFG.h" @@ -1136,9 +1136,10 @@ DbgDeclareInst *llvm::FindAllocaDbgDeclare(Value *V) { return nullptr; } -bool llvm::replaceDbgDeclareForAlloca(AllocaInst *AI, Value *NewAllocaAddress, - DIBuilder &Builder, bool Deref, int Offset) { - DbgDeclareInst *DDI = FindAllocaDbgDeclare(AI); +bool llvm::replaceDbgDeclare(Value *Address, Value *NewAddress, + Instruction *InsertBefore, DIBuilder &Builder, + bool Deref, int Offset) { + DbgDeclareInst *DDI = FindAllocaDbgDeclare(Address); if (!DDI) return false; DebugLoc Loc = DDI->getDebugLoc(); @@ -1168,12 +1169,17 @@ bool llvm::replaceDbgDeclareForAlloca(AllocaInst *AI, Value *NewAllocaAddress, // Insert llvm.dbg.declare immediately after the original alloca, and remove // old llvm.dbg.declare. 
- Builder.insertDeclare(NewAllocaAddress, DIVar, DIExpr, Loc, - AI->getNextNode()); + Builder.insertDeclare(NewAddress, DIVar, DIExpr, Loc, InsertBefore); DDI->eraseFromParent(); return true; } +bool llvm::replaceDbgDeclareForAlloca(AllocaInst *AI, Value *NewAllocaAddress, + DIBuilder &Builder, bool Deref, int Offset) { + return replaceDbgDeclare(AI, NewAllocaAddress, AI->getNextNode(), Builder, + Deref, Offset); +} + /// changeToUnreachable - Insert an unreachable instruction before the specified /// instruction, making it and the rest of the code in the block dead. static void changeToUnreachable(Instruction *I, bool UseLLVMTrap) { diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp index 47e587fab7b6..83afb1a65ac0 100644 --- a/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -1322,6 +1322,15 @@ Value *LibCallSimplifier::optimizeLog(CallInst *CI, IRBuilder<> &B) { return B.CreateFMul(OpC->getArgOperand(1), EmitUnaryFloatFnCall(OpC->getOperand(0), Callee->getName(), B, Callee->getAttributes()), "mul"); + + // log(exp2(y)) -> y*log(2) + if (F && Name == "log" && TLI->getLibFunc(F->getName(), Func) && + TLI->has(Func) && Func == LibFunc::exp2) + return B.CreateFMul( + OpC->getArgOperand(0), + EmitUnaryFloatFnCall(ConstantFP::get(CI->getType(), 2.0), + Callee->getName(), B, Callee->getAttributes()), + "logmul"); return Ret; } @@ -2301,7 +2310,6 @@ void LibCallSimplifier::replaceAllUsesWith(Instruction *I, Value *With) { // log, logf, logl: // * log(exp(x)) -> x // * log(exp(y)) -> y*log(e) -// * log(exp2(y)) -> y*log(2) // * log(exp10(y)) -> y*log(10) // * log(sqrt(x)) -> 0.5*log(x) // diff --git a/lib/Transforms/Utils/ValueMapper.cpp b/lib/Transforms/Utils/ValueMapper.cpp index 0a63c1d5153c..00a8984845dd 100644 --- a/lib/Transforms/Utils/ValueMapper.cpp +++ b/lib/Transforms/Utils/ValueMapper.cpp @@ -41,9 +41,9 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags, if (Value *NewV = Materializer->materializeDeclFor(const_cast(V))) { VM[V] = NewV; - if (auto *GV = dyn_cast(V)) - Materializer->materializeInitFor(cast(NewV), - const_cast(GV)); + if (auto *NewGV = dyn_cast(NewV)) + Materializer->materializeInitFor( + NewGV, const_cast(cast(V))); return NewV; } } diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index c956a55a1009..c5b8b5b073d6 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -5188,7 +5188,7 @@ LoopVectorizationCostModel::calculateRegisterUsage( continue; } - // Count the number of live interals. + // Count the number of live intervals.
unsigned RegUsage = 0; for (auto Inst : OpenIntervals) RegUsage += GetRegUsage(Inst->getType(), VFs[j]); diff --git a/test/Analysis/CostModel/AMDGPU/extractelement.ll b/test/Analysis/CostModel/AMDGPU/extractelement.ll new file mode 100644 index 000000000000..c328d7686466 --- /dev/null +++ b/test/Analysis/CostModel/AMDGPU/extractelement.ll @@ -0,0 +1,110 @@ +; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck %s + +; CHECK: 'extractelement_v2i32' +; CHECK: estimated cost of 0 for {{.*}} extractelement <2 x i32> +define void @extractelement_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr) { + %vec = load <2 x i32>, <2 x i32> addrspace(1)* %vaddr + %elt = extractelement <2 x i32> %vec, i32 1 + store i32 %elt, i32 addrspace(1)* %out + ret void +} + +; CHECK: 'extractelement_v2f32' +; CHECK: estimated cost of 0 for {{.*}} extractelement <2 x float> +define void @extractelement_v2f32(float addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) { + %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr + %elt = extractelement <2 x float> %vec, i32 1 + store float %elt, float addrspace(1)* %out + ret void +} + +; CHECK: 'extractelement_v3i32' +; CHECK: estimated cost of 0 for {{.*}} extractelement <3 x i32> +define void @extractelement_v3i32(i32 addrspace(1)* %out, <3 x i32> addrspace(1)* %vaddr) { + %vec = load <3 x i32>, <3 x i32> addrspace(1)* %vaddr + %elt = extractelement <3 x i32> %vec, i32 1 + store i32 %elt, i32 addrspace(1)* %out + ret void +} + +; CHECK: 'extractelement_v4i32' +; CHECK: estimated cost of 0 for {{.*}} extractelement <4 x i32> +define void @extractelement_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %vaddr) { + %vec = load <4 x i32>, <4 x i32> addrspace(1)* %vaddr + %elt = extractelement <4 x i32> %vec, i32 1 + store i32 %elt, i32 addrspace(1)* %out + ret void +} + +; CHECK: 'extractelement_v8i32' +; CHECK: estimated cost of 0 for {{.*}} extractelement <8 x i32> +define void @extractelement_v8i32(i32 addrspace(1)* %out, <8 x i32> addrspace(1)* %vaddr) { + %vec = load <8 x i32>, <8 x i32> addrspace(1)* %vaddr + %elt = extractelement <8 x i32> %vec, i32 1 + store i32 %elt, i32 addrspace(1)* %out + ret void +} + +; FIXME: Should be non-0 +; CHECK: 'extractelement_v8i32_dynindex' +; CHECK: estimated cost of 2 for {{.*}} extractelement <8 x i32> +define void @extractelement_v8i32_dynindex(i32 addrspace(1)* %out, <8 x i32> addrspace(1)* %vaddr, i32 %idx) { + %vec = load <8 x i32>, <8 x i32> addrspace(1)* %vaddr + %elt = extractelement <8 x i32> %vec, i32 %idx + store i32 %elt, i32 addrspace(1)* %out + ret void +} + +; CHECK: 'extractelement_v2i64' +; CHECK: estimated cost of 0 for {{.*}} extractelement <2 x i64> +define void @extractelement_v2i64(i64 addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr) { + %vec = load <2 x i64>, <2 x i64> addrspace(1)* %vaddr + %elt = extractelement <2 x i64> %vec, i64 1 + store i64 %elt, i64 addrspace(1)* %out + ret void +} + +; CHECK: 'extractelement_v3i64' +; CHECK: estimated cost of 0 for {{.*}} extractelement <3 x i64> +define void @extractelement_v3i64(i64 addrspace(1)* %out, <3 x i64> addrspace(1)* %vaddr) { + %vec = load <3 x i64>, <3 x i64> addrspace(1)* %vaddr + %elt = extractelement <3 x i64> %vec, i64 1 + store i64 %elt, i64 addrspace(1)* %out + ret void +} + +; CHECK: 'extractelement_v4i64' +; CHECK: estimated cost of 0 for {{.*}} extractelement <4 x i64> +define void @extractelement_v4i64(i64 addrspace(1)* %out, <4 x i64> addrspace(1)* %vaddr) { + %vec = load <4 x i64>, <4 x i64> 
addrspace(1)* %vaddr + %elt = extractelement <4 x i64> %vec, i64 1 + store i64 %elt, i64 addrspace(1)* %out + ret void +} + +; CHECK: 'extractelement_v8i64' +; CHECK: estimated cost of 0 for {{.*}} extractelement <8 x i64> +define void @extractelement_v8i64(i64 addrspace(1)* %out, <8 x i64> addrspace(1)* %vaddr) { + %vec = load <8 x i64>, <8 x i64> addrspace(1)* %vaddr + %elt = extractelement <8 x i64> %vec, i64 1 + store i64 %elt, i64 addrspace(1)* %out + ret void +} + +; CHECK: 'extractelement_v4i8' +; CHECK: estimated cost of 0 for {{.*}} extractelement <4 x i8> +define void @extractelement_v4i8(i8 addrspace(1)* %out, <4 x i8> addrspace(1)* %vaddr) { + %vec = load <4 x i8>, <4 x i8> addrspace(1)* %vaddr + %elt = extractelement <4 x i8> %vec, i8 1 + store i8 %elt, i8 addrspace(1)* %out + ret void +} + +; CHECK: 'extractelement_v2i16' +; CHECK: estimated cost of 0 for {{.*}} extractelement <2 x i16> +define void @extractelement_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) { + %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr + %elt = extractelement <2 x i16> %vec, i16 1 + store i16 %elt, i16 addrspace(1)* %out + ret void +} diff --git a/test/Analysis/CostModel/AMDGPU/lit.local.cfg b/test/Analysis/CostModel/AMDGPU/lit.local.cfg new file mode 100644 index 000000000000..2a665f06be72 --- /dev/null +++ b/test/Analysis/CostModel/AMDGPU/lit.local.cfg @@ -0,0 +1,2 @@ +if not 'AMDGPU' in config.root.targets: + config.unsupported = True diff --git a/test/Analysis/CostModel/X86/cast.ll b/test/Analysis/CostModel/X86/cast.ll index fb16af635f07..c518587c0e1a 100644 --- a/test/Analysis/CostModel/X86/cast.ll +++ b/test/Analysis/CostModel/X86/cast.ll @@ -84,11 +84,11 @@ define i32 @zext_sext(<8 x i1> %in) { ;CHECK-AVX: cost of 4 {{.*}} zext %D = zext <4 x i32> undef to <4 x i64> - ;CHECK-AVX512: cost of 3 {{.*}} %D1 = zext - %D1 = zext <16 x i32> undef to <16 x i64> + ;CHECK-AVX512: cost of 1 {{.*}} %D1 = zext + %D1 = zext <8 x i32> undef to <8 x i64> - ;CHECK-AVX512: cost of 3 {{.*}} %D2 = sext - %D2 = sext <16 x i32> undef to <16 x i64> + ;CHECK-AVX512: cost of 1 {{.*}} %D2 = sext + %D2 = sext <8 x i32> undef to <8 x i64> ;CHECK-AVX512: cost of 1 {{.*}} %D3 = zext %D3 = zext <16 x i16> undef to <16 x i32> @@ -118,9 +118,11 @@ define i32 @zext_sext(<8 x i1> %in) { ;CHECK_AVX512: cost of 1 {{.*}} G = trunc %G = trunc <8 x i64> undef to <8 x i32> - ;CHECK-AVX512: cost of 4 {{.*}} %G1 = trunc - %G1 = trunc <16 x i64> undef to <16 x i32> + ;CHECK-AVX512: cost of 1 {{.*}} %G1 = trunc + %G1 = trunc <16 x i32> undef to <16 x i16> + ;CHECK-AVX512: cost of 1 {{.*}} %G2 = trunc + %G2 = trunc <16 x i32> undef to <16 x i8> ret i32 undef } @@ -207,38 +209,40 @@ define void @uitofp4(<4 x i1> %a, <4 x i8> %b, <4 x i16> %c, <4 x i32> %d) { ; CHECK: cost of 2 {{.*}} uitofp %C2 = uitofp <4 x i16> %c to <4 x double> - ; CHECK: cost of 6 {{.*}} uitofp + ; CHECK-AVX2: cost of 6 {{.*}} uitofp %D1 = uitofp <4 x i32> %d to <4 x float> - ; CHECK: cost of 6 {{.*}} uitofp + ; CHECK-AVX2: cost of 6 {{.*}} uitofp %D2 = uitofp <4 x i32> %d to <4 x double> ret void } define void @uitofp8(<8 x i1> %a, <8 x i8> %b, <8 x i16> %c, <8 x i32> %d) { ; CHECK-LABEL: for function 'uitofp8' - ; CHECK: cost of 6 {{.*}} uitofp + ; CHECK-AVX2: cost of 6 {{.*}} uitofp %A1 = uitofp <8 x i1> %a to <8 x float> - ; CHECK: cost of 5 {{.*}} uitofp + ; CHECK-AVX2: cost of 5 {{.*}} uitofp + ; CHECK-AVX512: cost of 2 {{.*}} uitofp %B1 = uitofp <8 x i8> %b to <8 x float> - ; CHECK: cost of 5 {{.*}} uitofp + ; CHECK-AVX2: cost of 5 {{.*}} 
uitofp + ; CHECK-AVX512: cost of 2 {{.*}} uitofp %C1 = uitofp <8 x i16> %c to <8 x float> ; CHECK-AVX2: cost of 8 {{.*}} uitofp - ; CHECK-AVX512: cost of 8 {{.*}} uitofp + ; CHECK-AVX512: cost of 1 {{.*}} uitofp ; CHECK-AVX: cost of 9 {{.*}} uitofp %D1 = uitofp <8 x i32> %d to <8 x float> ret void } -define void @fp_conv(<8 x float> %a, <16 x float>%b) { +define void @fp_conv(<8 x float> %a, <16 x float>%b, <4 x float> %c) { ;CHECK-LABEL: for function 'fp_conv' ; CHECK-AVX512: cost of 1 {{.*}} fpext %A1 = fpext <8 x float> %a to <8 x double> - ; CHECK-AVX512: cost of 3 {{.*}} fpext - %A2 = fpext <16 x float> %b to <16 x double> + ; CHECK-AVX512: cost of 1 {{.*}} fpext + %A2 = fpext <4 x float> %c to <4 x double> ; CHECK-AVX2: cost of 3 {{.*}} %A3 = fpext ; CHECK-AVX512: cost of 1 {{.*}} %A3 = fpext @@ -248,7 +252,7 @@ define void @fp_conv(<8 x float> %a, <16 x float>%b) { ; CHECK-AVX512: cost of 1 {{.*}} %A4 = fptrunc %A4 = fptrunc <8 x double> undef to <8 x float> - ; CHECK-AVX512: cost of 3 {{.*}} %A5 = fptrunc - %A5 = fptrunc <16 x double> undef to <16 x float> + ; CHECK-AVX512: cost of 1 {{.*}} %A5 = fptrunc + %A5 = fptrunc <4 x double> undef to <4 x float> ret void } diff --git a/test/Analysis/CostModel/X86/sitofp.ll b/test/Analysis/CostModel/X86/sitofp.ll index dcd0088d0df7..9913a4896912 100644 --- a/test/Analysis/CostModel/X86/sitofp.ll +++ b/test/Analysis/CostModel/X86/sitofp.ll @@ -4,656 +4,656 @@ ; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+avx512f -cost-model -analyze < %s | FileCheck --check-prefix=AVX512F %s define <2 x double> @sitofpv2i8v2double(<2 x i8> %a) { - ; SSE2: sitofpv2i8v2double + ; SSE2-LABEL: sitofpv2i8v2double ; SSE2: cost of 20 {{.*}} sitofp ; - ; AVX1: sitofpv2i8v2double + ; AVX1-LABEL: sitofpv2i8v2double ; AVX1: cost of 4 {{.*}} sitofp ; - ; AVX2: sitofpv2i8v2double + ; AVX2-LABEL: sitofpv2i8v2double ; AVX2: cost of 4 {{.*}} sitofp ; - ; AVX512F: sitofpv2i8v2double + ; AVX512F-LABEL: sitofpv2i8v2double ; AVX512F: cost of 4 {{.*}} sitofp %1 = sitofp <2 x i8> %a to <2 x double> ret <2 x double> %1 } define <4 x double> @sitofpv4i8v4double(<4 x i8> %a) { - ; SSE2: sitofpv4i8v4double + ; SSE2-LABEL: sitofpv4i8v4double ; SSE2: cost of 40 {{.*}} sitofp ; - ; AVX1: sitofpv4i8v4double + ; AVX1-LABEL: sitofpv4i8v4double ; AVX1: cost of 3 {{.*}} sitofp ; - ; AVX2: sitofpv4i8v4double + ; AVX2-LABEL: sitofpv4i8v4double ; AVX2: cost of 3 {{.*}} sitofp ; - ; AVX512F: sitofpv4i8v4double + ; AVX512F-LABEL: sitofpv4i8v4double ; AVX512F: cost of 3 {{.*}} sitofp %1 = sitofp <4 x i8> %a to <4 x double> ret <4 x double> %1 } define <8 x double> @sitofpv8i8v8double(<8 x i8> %a) { - ; SSE2: sitofpv8i8v8double + ; SSE2-LABEL: sitofpv8i8v8double ; SSE2: cost of 80 {{.*}} sitofp ; - ; AVX1: sitofpv8i8v8double + ; AVX1-LABEL: sitofpv8i8v8double ; AVX1: cost of 20 {{.*}} sitofp ; - ; AVX2: sitofpv8i8v8double + ; AVX2-LABEL: sitofpv8i8v8double ; AVX2: cost of 20 {{.*}} sitofp ; - ; AVX512F: sitofpv8i8v8double + ; AVX512F-LABEL: sitofpv8i8v8double ; AVX512F: cost of 2 {{.*}} sitofp %1 = sitofp <8 x i8> %a to <8 x double> ret <8 x double> %1 } define <16 x double> @sitofpv16i8v16double(<16 x i8> %a) { - ; SSE2: sitofpv16i8v16double + ; SSE2-LABEL: sitofpv16i8v16double ; SSE2: cost of 160 {{.*}} sitofp ; - ; AVX1: sitofpv16i8v16double + ; AVX1-LABEL: sitofpv16i8v16double ; AVX1: cost of 40 {{.*}} sitofp ; - ; AVX2: sitofpv16i8v16double + ; AVX2-LABEL: sitofpv16i8v16double ; AVX2: cost of 40 {{.*}} sitofp ; - ; AVX512F: sitofpv16i8v16double + ; AVX512F-LABEL: sitofpv16i8v16double ; 
AVX512F: cost of 44 {{.*}} sitofp %1 = sitofp <16 x i8> %a to <16 x double> ret <16 x double> %1 } define <32 x double> @sitofpv32i8v32double(<32 x i8> %a) { - ; SSE2: sitofpv32i8v32double + ; SSE2-LABEL: sitofpv32i8v32double ; SSE2: cost of 320 {{.*}} sitofp ; - ; AVX1: sitofpv32i8v32double + ; AVX1-LABEL: sitofpv32i8v32double ; AVX1: cost of 80 {{.*}} sitofp ; - ; AVX2: sitofpv32i8v32double + ; AVX2-LABEL: sitofpv32i8v32double ; AVX2: cost of 80 {{.*}} sitofp ; - ; AVX512F: sitofpv32i8v32double + ; AVX512F-LABEL: sitofpv32i8v32double ; AVX512F: cost of 88 {{.*}} sitofp %1 = sitofp <32 x i8> %a to <32 x double> ret <32 x double> %1 } define <2 x double> @sitofpv2i16v2double(<2 x i16> %a) { - ; SSE2: sitofpv2i16v2double + ; SSE2-LABEL: sitofpv2i16v2double ; SSE2: cost of 20 {{.*}} sitofp ; - ; AVX1: sitofpv2i16v2double + ; AVX1-LABEL: sitofpv2i16v2double ; AVX1: cost of 4 {{.*}} sitofp ; - ; AVX2: sitofpv2i16v2double + ; AVX2-LABEL: sitofpv2i16v2double ; AVX2: cost of 4 {{.*}} sitofp ; - ; AVX512F: sitofpv2i16v2double + ; AVX512F-LABEL: sitofpv2i16v2double ; AVX512F: cost of 4 {{.*}} sitofp %1 = sitofp <2 x i16> %a to <2 x double> ret <2 x double> %1 } define <4 x double> @sitofpv4i16v4double(<4 x i16> %a) { - ; SSE2: sitofpv4i16v4double + ; SSE2-LABEL: sitofpv4i16v4double ; SSE2: cost of 40 {{.*}} sitofp ; - ; AVX1: sitofpv4i16v4double + ; AVX1-LABEL: sitofpv4i16v4double ; AVX1: cost of 3 {{.*}} sitofp ; - ; AVX2: sitofpv4i16v4double + ; AVX2-LABEL: sitofpv4i16v4double ; AVX2: cost of 3 {{.*}} sitofp ; - ; AVX512F: sitofpv4i16v4double + ; AVX512F-LABEL: sitofpv4i16v4double ; AVX512F: cost of 3 {{.*}} sitofp %1 = sitofp <4 x i16> %a to <4 x double> ret <4 x double> %1 } define <8 x double> @sitofpv8i16v8double(<8 x i16> %a) { - ; SSE2: sitofpv8i16v8double + ; SSE2-LABEL: sitofpv8i16v8double ; SSE2: cost of 80 {{.*}} sitofp ; - ; AVX1: sitofpv8i16v8double + ; AVX1-LABEL: sitofpv8i16v8double ; AVX1: cost of 20 {{.*}} sitofp ; - ; AVX2: sitofpv8i16v8double + ; AVX2-LABEL: sitofpv8i16v8double ; AVX2: cost of 20 {{.*}} sitofp ; - ; AVX512F: sitofpv8i16v8double + ; AVX512F-LABEL: sitofpv8i16v8double ; AVX512F: cost of 2 {{.*}} sitofp %1 = sitofp <8 x i16> %a to <8 x double> ret <8 x double> %1 } define <16 x double> @sitofpv16i16v16double(<16 x i16> %a) { - ; SSE2: sitofpv16i16v16double + ; SSE2-LABEL: sitofpv16i16v16double ; SSE2: cost of 160 {{.*}} sitofp ; - ; AVX1: sitofpv16i16v16double + ; AVX1-LABEL: sitofpv16i16v16double ; AVX1: cost of 40 {{.*}} sitofp ; - ; AVX2: sitofpv16i16v16double + ; AVX2-LABEL: sitofpv16i16v16double ; AVX2: cost of 40 {{.*}} sitofp ; - ; AVX512F: sitofpv16i16v16double + ; AVX512F-LABEL: sitofpv16i16v16double ; AVX512F: cost of 44 {{.*}} sitofp %1 = sitofp <16 x i16> %a to <16 x double> ret <16 x double> %1 } define <32 x double> @sitofpv32i16v32double(<32 x i16> %a) { - ; SSE2: sitofpv32i16v32double + ; SSE2-LABEL: sitofpv32i16v32double ; SSE2: cost of 320 {{.*}} sitofp ; - ; AVX1: sitofpv32i16v32double + ; AVX1-LABEL: sitofpv32i16v32double ; AVX1: cost of 80 {{.*}} sitofp ; - ; AVX2: sitofpv32i16v32double + ; AVX2-LABEL: sitofpv32i16v32double ; AVX2: cost of 80 {{.*}} sitofp ; - ; AVX512F: sitofpv32i16v32double + ; AVX512F-LABEL: sitofpv32i16v32double ; AVX512F: cost of 88 {{.*}} sitofp %1 = sitofp <32 x i16> %a to <32 x double> ret <32 x double> %1 } define <2 x double> @sitofpv2i32v2double(<2 x i32> %a) { - ; SSE2: sitofpv2i32v2double + ; SSE2-LABEL: sitofpv2i32v2double ; SSE2: cost of 20 {{.*}} sitofp ; - ; AVX1: sitofpv2i32v2double + ; AVX1-LABEL: 
sitofpv2i32v2double ; AVX1: cost of 4 {{.*}} sitofp ; - ; AVX2: sitofpv2i32v2double + ; AVX2-LABEL: sitofpv2i32v2double ; AVX2: cost of 4 {{.*}} sitofp ; - ; AVX512F: sitofpv2i32v2double + ; AVX512F-LABEL: sitofpv2i32v2double ; AVX512F: cost of 4 {{.*}} sitofp %1 = sitofp <2 x i32> %a to <2 x double> ret <2 x double> %1 } define <4 x double> @sitofpv4i32v4double(<4 x i32> %a) { - ; SSE2: sitofpv4i32v4double + ; SSE2-LABEL: sitofpv4i32v4double ; SSE2: cost of 40 {{.*}} sitofp ; - ; AVX1: sitofpv4i32v4double + ; AVX1-LABEL: sitofpv4i32v4double ; AVX1: cost of 1 {{.*}} sitofp ; - ; AVX2: sitofpv4i32v4double + ; AVX2-LABEL: sitofpv4i32v4double ; AVX2: cost of 1 {{.*}} sitofp ; - ; AVX512F: sitofpv4i32v4double + ; AVX512F-LABEL: sitofpv4i32v4double ; AVX512F: cost of 1 {{.*}} sitofp %1 = sitofp <4 x i32> %a to <4 x double> ret <4 x double> %1 } define <8 x double> @sitofpv8i32v8double(<8 x i32> %a) { - ; SSE2: sitofpv8i32v8double + ; SSE2-LABEL: sitofpv8i32v8double ; SSE2: cost of 80 {{.*}} sitofp ; - ; AVX1: sitofpv8i32v8double + ; AVX1-LABEL: sitofpv8i32v8double ; AVX1: cost of 20 {{.*}} sitofp ; - ; AVX2: sitofpv8i32v8double + ; AVX2-LABEL: sitofpv8i32v8double ; AVX2: cost of 20 {{.*}} sitofp ; - ; AVX512F: sitofpv8i32v8double + ; AVX512F-LABEL: sitofpv8i32v8double ; AVX512F: cost of 1 {{.*}} sitofp %1 = sitofp <8 x i32> %a to <8 x double> ret <8 x double> %1 } define <16 x double> @sitofpv16i32v16double(<16 x i32> %a) { - ; SSE2: sitofpv16i32v16double + ; SSE2-LABEL: sitofpv16i32v16double ; SSE2: cost of 160 {{.*}} sitofp ; - ; AVX1: sitofpv16i32v16double + ; AVX1-LABEL: sitofpv16i32v16double ; AVX1: cost of 40 {{.*}} sitofp ; - ; AVX2: sitofpv16i32v16double + ; AVX2-LABEL: sitofpv16i32v16double ; AVX2: cost of 40 {{.*}} sitofp ; - ; AVX512F: sitofpv16i32v16double + ; AVX512F-LABEL: sitofpv16i32v16double ; AVX512F: cost of 44 {{.*}} sitofp %1 = sitofp <16 x i32> %a to <16 x double> ret <16 x double> %1 } define <32 x double> @sitofpv32i32v32double(<32 x i32> %a) { - ; SSE2: sitofpv32i32v32double + ; SSE2-LABEL: sitofpv32i32v32double ; SSE2: cost of 320 {{.*}} sitofp ; - ; AVX1: sitofpv32i32v32double + ; AVX1-LABEL: sitofpv32i32v32double ; AVX1: cost of 80 {{.*}} sitofp ; - ; AVX2: sitofpv32i32v32double + ; AVX2-LABEL: sitofpv32i32v32double ; AVX2: cost of 80 {{.*}} sitofp ; - ; AVX512F: sitofpv32i32v32double + ; AVX512F-LABEL: sitofpv32i32v32double ; AVX512F: cost of 88 {{.*}} sitofp %1 = sitofp <32 x i32> %a to <32 x double> ret <32 x double> %1 } define <2 x double> @sitofpv2i64v2double(<2 x i64> %a) { - ; SSE2: sitofpv2i64v2double + ; SSE2-LABEL: sitofpv2i64v2double ; SSE2: cost of 20 {{.*}} sitofp ; - ; AVX1: sitofpv2i64v2double + ; AVX1-LABEL: sitofpv2i64v2double ; AVX1: cost of 4 {{.*}} sitofp ; - ; AVX2: sitofpv2i64v2double + ; AVX2-LABEL: sitofpv2i64v2double ; AVX2: cost of 4 {{.*}} sitofp ; - ; AVX512F: sitofpv2i64v2double + ; AVX512F-LABEL: sitofpv2i64v2double ; AVX512F: cost of 4 {{.*}} sitofp %1 = sitofp <2 x i64> %a to <2 x double> ret <2 x double> %1 } define <4 x double> @sitofpv4i64v4double(<4 x i64> %a) { - ; SSE2: sitofpv4i64v4double + ; SSE2-LABEL: sitofpv4i64v4double ; SSE2: cost of 40 {{.*}} sitofp ; - ; AVX1: sitofpv4i64v4double + ; AVX1-LABEL: sitofpv4i64v4double ; AVX1: cost of 10 {{.*}} sitofp ; - ; AVX2: sitofpv4i64v4double + ; AVX2-LABEL: sitofpv4i64v4double ; AVX2: cost of 10 {{.*}} sitofp ; - ; AVX512F: sitofpv4i64v4double + ; AVX512F-LABEL: sitofpv4i64v4double ; AVX512F: cost of 10 {{.*}} sitofp %1 = sitofp <4 x i64> %a to <4 x double> ret <4 x double> %1 } 
define <8 x double> @sitofpv8i64v8double(<8 x i64> %a) { - ; SSE2: sitofpv8i64v8double + ; SSE2-LABEL: sitofpv8i64v8double ; SSE2: cost of 80 {{.*}} sitofp ; - ; AVX1: sitofpv8i64v8double + ; AVX1-LABEL: sitofpv8i64v8double ; AVX1: cost of 20 {{.*}} sitofp ; - ; AVX2: sitofpv8i64v8double + ; AVX2-LABEL: sitofpv8i64v8double ; AVX2: cost of 20 {{.*}} sitofp ; - ; AVX512F: sitofpv8i64v8double + ; AVX512F-LABEL: sitofpv8i64v8double ; AVX512F: cost of 22 {{.*}} sitofp %1 = sitofp <8 x i64> %a to <8 x double> ret <8 x double> %1 } define <16 x double> @sitofpv16i64v16double(<16 x i64> %a) { - ; SSE2: sitofpv16i64v16double + ; SSE2-LABEL: sitofpv16i64v16double ; SSE2: cost of 160 {{.*}} sitofp ; - ; AVX1: sitofpv16i64v16double + ; AVX1-LABEL: sitofpv16i64v16double ; AVX1: cost of 40 {{.*}} sitofp ; - ; AVX2: sitofpv16i64v16double + ; AVX2-LABEL: sitofpv16i64v16double ; AVX2: cost of 40 {{.*}} sitofp ; - ; AVX512F: sitofpv16i64v16double + ; AVX512F-LABEL: sitofpv16i64v16double ; AVX512F: cost of 44 {{.*}} sitofp %1 = sitofp <16 x i64> %a to <16 x double> ret <16 x double> %1 } define <32 x double> @sitofpv32i64v32double(<32 x i64> %a) { - ; SSE2: sitofpv32i64v32double + ; SSE2-LABEL: sitofpv32i64v32double ; SSE2: cost of 320 {{.*}} sitofp ; - ; AVX1: sitofpv32i64v32double + ; AVX1-LABEL: sitofpv32i64v32double ; AVX1: cost of 80 {{.*}} sitofp ; - ; AVX2: sitofpv32i64v32double + ; AVX2-LABEL: sitofpv32i64v32double ; AVX2: cost of 80 {{.*}} sitofp ; - ; AVX512F: sitofpv32i64v32double + ; AVX512F-LABEL: sitofpv32i64v32double ; AVX512F: cost of 88 {{.*}} sitofp %1 = sitofp <32 x i64> %a to <32 x double> ret <32 x double> %1 } define <2 x float> @sitofpv2i8v2float(<2 x i8> %a) { - ; SSE2: sitofpv2i8v2float + ; SSE2-LABEL: sitofpv2i8v2float ; SSE2: cost of 15 {{.*}} sitofp ; - ; AVX1: sitofpv2i8v2float + ; AVX1-LABEL: sitofpv2i8v2float ; AVX1: cost of 4 {{.*}} sitofp ; - ; AVX2: sitofpv2i8v2float + ; AVX2-LABEL: sitofpv2i8v2float ; AVX2: cost of 4 {{.*}} sitofp ; - ; AVX512F: sitofpv2i8v2float + ; AVX512F-LABEL: sitofpv2i8v2float ; AVX512F: cost of 4 {{.*}} sitofp %1 = sitofp <2 x i8> %a to <2 x float> ret <2 x float> %1 } define <4 x float> @sitofpv4i8v4float(<4 x i8> %a) { - ; SSE2: sitofpv4i8v4float + ; SSE2-LABEL: sitofpv4i8v4float ; SSE2: cost of 15 {{.*}} sitofp ; - ; AVX1: sitofpv4i8v4float + ; AVX1-LABEL: sitofpv4i8v4float ; AVX1: cost of 3 {{.*}} sitofp ; - ; AVX2: sitofpv4i8v4float + ; AVX2-LABEL: sitofpv4i8v4float ; AVX2: cost of 3 {{.*}} sitofp ; - ; AVX512F: sitofpv4i8v4float + ; AVX512F-LABEL: sitofpv4i8v4float ; AVX512F: cost of 3 {{.*}} sitofp %1 = sitofp <4 x i8> %a to <4 x float> ret <4 x float> %1 } define <8 x float> @sitofpv8i8v8float(<8 x i8> %a) { - ; SSE2: sitofpv8i8v8float + ; SSE2-LABEL: sitofpv8i8v8float ; SSE2: cost of 15 {{.*}} sitofp ; - ; AVX1: sitofpv8i8v8float + ; AVX1-LABEL: sitofpv8i8v8float ; AVX1: cost of 8 {{.*}} sitofp ; - ; AVX2: sitofpv8i8v8float + ; AVX2-LABEL: sitofpv8i8v8float ; AVX2: cost of 8 {{.*}} sitofp ; - ; AVX512F: sitofpv8i8v8float + ; AVX512F-LABEL: sitofpv8i8v8float ; AVX512F: cost of 8 {{.*}} sitofp %1 = sitofp <8 x i8> %a to <8 x float> ret <8 x float> %1 } define <16 x float> @sitofpv16i8v16float(<16 x i8> %a) { - ; SSE2: sitofpv16i8v16float + ; SSE2-LABEL: sitofpv16i8v16float ; SSE2: cost of 8 {{.*}} sitofp ; - ; AVX1: sitofpv16i8v16float + ; AVX1-LABEL: sitofpv16i8v16float ; AVX1: cost of 44 {{.*}} sitofp ; - ; AVX2: sitofpv16i8v16float + ; AVX2-LABEL: sitofpv16i8v16float ; AVX2: cost of 44 {{.*}} sitofp ; - ; AVX512F: sitofpv16i8v16float + ; 
AVX512F-LABEL: sitofpv16i8v16float ; AVX512F: cost of 2 {{.*}} sitofp %1 = sitofp <16 x i8> %a to <16 x float> ret <16 x float> %1 } define <32 x float> @sitofpv32i8v32float(<32 x i8> %a) { - ; SSE2: sitofpv32i8v32float + ; SSE2-LABEL: sitofpv32i8v32float ; SSE2: cost of 16 {{.*}} sitofp ; - ; AVX1: sitofpv32i8v32float + ; AVX1-LABEL: sitofpv32i8v32float ; AVX1: cost of 88 {{.*}} sitofp ; - ; AVX2: sitofpv32i8v32float + ; AVX2-LABEL: sitofpv32i8v32float ; AVX2: cost of 88 {{.*}} sitofp ; - ; AVX512F: sitofpv32i8v32float + ; AVX512F-LABEL: sitofpv32i8v32float ; AVX512F: cost of 92 {{.*}} sitofp %1 = sitofp <32 x i8> %a to <32 x float> ret <32 x float> %1 } define <2 x float> @sitofpv2i16v2float(<2 x i16> %a) { - ; SSE2: sitofpv2i16v2float + ; SSE2-LABEL: sitofpv2i16v2float ; SSE2: cost of 15 {{.*}} sitofp ; - ; AVX1: sitofpv2i16v2float + ; AVX1-LABEL: sitofpv2i16v2float ; AVX1: cost of 4 {{.*}} sitofp ; - ; AVX2: sitofpv2i16v2float + ; AVX2-LABEL: sitofpv2i16v2float ; AVX2: cost of 4 {{.*}} sitofp ; - ; AVX512F: sitofpv2i16v2float + ; AVX512F-LABEL: sitofpv2i16v2float ; AVX512F: cost of 4 {{.*}} sitofp %1 = sitofp <2 x i16> %a to <2 x float> ret <2 x float> %1 } define <4 x float> @sitofpv4i16v4float(<4 x i16> %a) { - ; SSE2: sitofpv4i16v4float + ; SSE2-LABEL: sitofpv4i16v4float ; SSE2: cost of 15 {{.*}} sitofp ; - ; AVX1: sitofpv4i16v4float + ; AVX1-LABEL: sitofpv4i16v4float ; AVX1: cost of 3 {{.*}} sitofp ; - ; AVX2: sitofpv4i16v4float + ; AVX2-LABEL: sitofpv4i16v4float ; AVX2: cost of 3 {{.*}} sitofp ; - ; AVX512F: sitofpv4i16v4float + ; AVX512F-LABEL: sitofpv4i16v4float ; AVX512F: cost of 3 {{.*}} sitofp %1 = sitofp <4 x i16> %a to <4 x float> ret <4 x float> %1 } define <8 x float> @sitofpv8i16v8float(<8 x i16> %a) { - ; SSE2: sitofpv8i16v8float + ; SSE2-LABEL: sitofpv8i16v8float ; SSE2: cost of 15 {{.*}} sitofp ; - ; AVX1: sitofpv8i16v8float + ; AVX1-LABEL: sitofpv8i16v8float ; AVX1: cost of 5 {{.*}} sitofp ; - ; AVX2: sitofpv8i16v8float + ; AVX2-LABEL: sitofpv8i16v8float ; AVX2: cost of 5 {{.*}} sitofp ; - ; AVX512F: sitofpv8i16v8float + ; AVX512F-LABEL: sitofpv8i16v8float ; AVX512F: cost of 5 {{.*}} sitofp %1 = sitofp <8 x i16> %a to <8 x float> ret <8 x float> %1 } define <16 x float> @sitofpv16i16v16float(<16 x i16> %a) { - ; SSE2: sitofpv16i16v16float + ; SSE2-LABEL: sitofpv16i16v16float ; SSE2: cost of 30 {{.*}} sitofp ; - ; AVX1: sitofpv16i16v16float + ; AVX1-LABEL: sitofpv16i16v16float ; AVX1: cost of 44 {{.*}} sitofp ; - ; AVX2: sitofpv16i16v16float + ; AVX2-LABEL: sitofpv16i16v16float ; AVX2: cost of 44 {{.*}} sitofp ; - ; AVX512F: sitofpv16i16v16float + ; AVX512F-LABEL: sitofpv16i16v16float ; AVX512F: cost of 2 {{.*}} sitofp %1 = sitofp <16 x i16> %a to <16 x float> ret <16 x float> %1 } define <32 x float> @sitofpv32i16v32float(<32 x i16> %a) { - ; SSE2: sitofpv32i16v32float + ; SSE2-LABEL: sitofpv32i16v32float ; SSE2: cost of 60 {{.*}} sitofp ; - ; AVX1: sitofpv32i16v32float + ; AVX1-LABEL: sitofpv32i16v32float ; AVX1: cost of 88 {{.*}} sitofp ; - ; AVX2: sitofpv32i16v32float + ; AVX2-LABEL: sitofpv32i16v32float ; AVX2: cost of 88 {{.*}} sitofp ; - ; AVX512F: sitofpv32i16v32float - ; AVX512F: cost of 2 {{.*}} sitofp + ; AVX512F-LABEL: sitofpv32i16v32float + ; AVX512F: cost of 92 {{.*}} sitofp %1 = sitofp <32 x i16> %a to <32 x float> ret <32 x float> %1 } define <2 x float> @sitofpv2i32v2float(<2 x i32> %a) { - ; SSE2: sitofpv2i32v2float + ; SSE2-LABEL: sitofpv2i32v2float ; SSE2: cost of 15 {{.*}} sitofp ; - ; AVX1: sitofpv2i32v2float + ; AVX1-LABEL: sitofpv2i32v2float ; 
AVX1: cost of 4 {{.*}} sitofp ; - ; AVX2: sitofpv2i32v2float + ; AVX2-LABEL: sitofpv2i32v2float ; AVX2: cost of 4 {{.*}} sitofp ; - ; AVX512F: sitofpv2i32v2float + ; AVX512F-LABEL: sitofpv2i32v2float ; AVX512F: cost of 4 {{.*}} sitofp %1 = sitofp <2 x i32> %a to <2 x float> ret <2 x float> %1 } define <4 x float> @sitofpv4i32v4float(<4 x i32> %a) { - ; SSE2: sitofpv4i32v4float + ; SSE2-LABEL: sitofpv4i32v4float ; SSE2: cost of 15 {{.*}} sitofp ; - ; AVX1: sitofpv4i32v4float + ; AVX1-LABEL: sitofpv4i32v4float ; AVX1: cost of 1 {{.*}} sitofp ; - ; AVX2: sitofpv4i32v4float + ; AVX2-LABEL: sitofpv4i32v4float ; AVX2: cost of 1 {{.*}} sitofp ; - ; AVX512F: sitofpv4i32v4float + ; AVX512F-LABEL: sitofpv4i32v4float ; AVX512F: cost of 1 {{.*}} sitofp %1 = sitofp <4 x i32> %a to <4 x float> ret <4 x float> %1 } define <8 x float> @sitofpv8i32v8float(<8 x i32> %a) { - ; SSE2: sitofpv8i32v8float + ; SSE2-LABEL: sitofpv8i32v8float ; SSE2: cost of 30 {{.*}} sitofp ; - ; AVX1: sitofpv8i32v8float + ; AVX1-LABEL: sitofpv8i32v8float ; AVX1: cost of 1 {{.*}} sitofp ; - ; AVX2: sitofpv8i32v8float + ; AVX2-LABEL: sitofpv8i32v8float ; AVX2: cost of 1 {{.*}} sitofp ; - ; AVX512F: sitofpv8i32v8float + ; AVX512F-LABEL: sitofpv8i32v8float ; AVX512F: cost of 1 {{.*}} sitofp %1 = sitofp <8 x i32> %a to <8 x float> ret <8 x float> %1 } define <16 x float> @sitofpv16i32v16float(<16 x i32> %a) { - ; SSE2: sitofpv16i32v16float + ; SSE2-LABEL: sitofpv16i32v16float ; SSE2: cost of 60 {{.*}} sitofp ; - ; AVX1: sitofpv16i32v16float + ; AVX1-LABEL: sitofpv16i32v16float ; AVX1: cost of 44 {{.*}} sitofp ; - ; AVX2: sitofpv16i32v16float + ; AVX2-LABEL: sitofpv16i32v16float ; AVX2: cost of 44 {{.*}} sitofp ; - ; AVX512F: sitofpv16i32v16float + ; AVX512F-LABEL: sitofpv16i32v16float ; AVX512F: cost of 1 {{.*}} sitofp %1 = sitofp <16 x i32> %a to <16 x float> ret <16 x float> %1 } define <32 x float> @sitofpv32i32v32float(<32 x i32> %a) { - ; SSE2: sitofpv32i32v32float + ; SSE2-LABEL: sitofpv32i32v32float ; SSE2: cost of 120 {{.*}} sitofp ; - ; AVX1: sitofpv32i32v32float + ; AVX1-LABEL: sitofpv32i32v32float ; AVX1: cost of 88 {{.*}} sitofp ; - ; AVX2: sitofpv32i32v32float + ; AVX2-LABEL: sitofpv32i32v32float ; AVX2: cost of 88 {{.*}} sitofp ; - ; AVX512F: sitofpv32i32v32float - ; AVX512F: cost of 1 {{.*}} sitofp + ; AVX512F-LABEL: sitofpv32i32v32float + ; AVX512F: cost of 92 {{.*}} sitofp %1 = sitofp <32 x i32> %a to <32 x float> ret <32 x float> %1 } define <2 x float> @sitofpv2i64v2float(<2 x i64> %a) { - ; SSE2: sitofpv2i64v2float + ; SSE2-LABEL: sitofpv2i64v2float ; SSE2: cost of 15 {{.*}} sitofp ; - ; AVX1: sitofpv2i64v2float + ; AVX1-LABEL: sitofpv2i64v2float ; AVX1: cost of 4 {{.*}} sitofp ; - ; AVX2: sitofpv2i64v2float + ; AVX2-LABEL: sitofpv2i64v2float ; AVX2: cost of 4 {{.*}} sitofp ; - ; AVX512F: sitofpv2i64v2float + ; AVX512F-LABEL: sitofpv2i64v2float ; AVX512F: cost of 4 {{.*}} sitofp %1 = sitofp <2 x i64> %a to <2 x float> ret <2 x float> %1 } define <4 x float> @sitofpv4i64v4float(<4 x i64> %a) { - ; SSE2: sitofpv4i64v4float + ; SSE2-LABEL: sitofpv4i64v4float ; SSE2: cost of 30 {{.*}} sitofp ; - ; AVX1: sitofpv4i64v4float + ; AVX1-LABEL: sitofpv4i64v4float ; AVX1: cost of 10 {{.*}} sitofp ; - ; AVX2: sitofpv4i64v4float + ; AVX2-LABEL: sitofpv4i64v4float ; AVX2: cost of 10 {{.*}} sitofp ; - ; AVX512F: sitofpv4i64v4float + ; AVX512F-LABEL: sitofpv4i64v4float ; AVX512F: cost of 10 {{.*}} sitofp %1 = sitofp <4 x i64> %a to <4 x float> ret <4 x float> %1 } define <8 x float> @sitofpv8i64v8float(<8 x i64> %a) { - ; SSE2: 
sitofpv8i64v8float + ; SSE2-LABEL: sitofpv8i64v8float ; SSE2: cost of 60 {{.*}} sitofp ; - ; AVX1: sitofpv8i64v8float + ; AVX1-LABEL: sitofpv8i64v8float ; AVX1: cost of 22 {{.*}} sitofp ; - ; AVX2: sitofpv8i64v8float + ; AVX2-LABEL: sitofpv8i64v8float ; AVX2: cost of 22 {{.*}} sitofp ; - ; AVX512F: sitofpv8i64v8float + ; AVX512F-LABEL: sitofpv8i64v8float ; AVX512F: cost of 22 {{.*}} sitofp %1 = sitofp <8 x i64> %a to <8 x float> ret <8 x float> %1 } define <16 x float> @sitofpv16i64v16float(<16 x i64> %a) { - ; SSE2: sitofpv16i64v16float + ; SSE2-LABEL: sitofpv16i64v16float ; SSE2: cost of 120 {{.*}} sitofp ; - ; AVX1: sitofpv16i64v16float + ; AVX1-LABEL: sitofpv16i64v16float ; AVX1: cost of 44 {{.*}} sitofp ; - ; AVX2: sitofpv16i64v16float + ; AVX2-LABEL: sitofpv16i64v16float ; AVX2: cost of 44 {{.*}} sitofp ; - ; AVX512F: sitofpv16i64v16float + ; AVX512F-LABEL: sitofpv16i64v16float ; AVX512F: cost of 46 {{.*}} sitofp %1 = sitofp <16 x i64> %a to <16 x float> ret <16 x float> %1 } define <32 x float> @sitofpv32i64v32float(<32 x i64> %a) { - ; SSE2: sitofpv32i64v32float + ; SSE2-LABEL: sitofpv32i64v32float ; SSE2: cost of 240 {{.*}} sitofp ; - ; AVX1: sitofpv32i64v32float + ; AVX1-LABEL: sitofpv32i64v32float ; AVX1: cost of 88 {{.*}} sitofp ; - ; AVX2: sitofpv32i64v32float + ; AVX2-LABEL: sitofpv32i64v32float ; AVX2: cost of 88 {{.*}} sitofp ; - ; AVX512F: sitofpv32i64v32float + ; AVX512F-LABEL: sitofpv32i64v32float ; AVX512F: cost of 92 {{.*}} sitofp %1 = sitofp <32 x i64> %a to <32 x float> ret <32 x float> %1 } define <8 x double> @sitofpv8i1v8double(<8 x double> %a) { - ; SSE2: sitofpv8i1v8double + ; SSE2-LABEL: sitofpv8i1v8double ; SSE2: cost of 80 {{.*}} sitofp ; - ; AVX1: sitofpv8i1v8double + ; AVX1-LABEL: sitofpv8i1v8double ; AVX1: cost of 20 {{.*}} sitofp ; - ; AVX2: sitofpv8i1v8double + ; AVX2-LABEL: sitofpv8i1v8double ; AVX2: cost of 20 {{.*}} sitofp ; - ; AVX512F: sitofpv8i1v8double + ; AVX512F-LABEL: sitofpv8i1v8double ; AVX512F: cost of 4 {{.*}} sitofp %cmpres = fcmp ogt <8 x double> %a, zeroinitializer %1 = sitofp <8 x i1> %cmpres to <8 x double> @@ -661,16 +661,16 @@ define <8 x double> @sitofpv8i1v8double(<8 x double> %a) { } define <16 x float> @sitofpv16i1v16float(<16 x float> %a) { - ; SSE2: sitofpv16i1v16float + ; SSE2-LABEL: sitofpv16i1v16float ; SSE2: cost of 8 {{.*}} sitofp ; - ; AVX1: sitofpv16i1v16float + ; AVX1-LABEL: sitofpv16i1v16float ; AVX1: cost of 44 {{.*}} sitofp ; - ; AVX2: sitofpv16i1v16float + ; AVX2-LABEL: sitofpv16i1v16float ; AVX2: cost of 44 {{.*}} sitofp ; - ; AVX512F: sitofpv16i1v16float + ; AVX512F-LABEL: sitofpv16i1v16float ; AVX512F: cost of 3 {{.*}} sitofp %cmpres = fcmp ogt <16 x float> %a, zeroinitializer %1 = sitofp <16 x i1> %cmpres to <16 x float> diff --git a/test/Analysis/CostModel/X86/uitofp.ll b/test/Analysis/CostModel/X86/uitofp.ll index 9ffc483e3f5a..08e36650bec4 100644 --- a/test/Analysis/CostModel/X86/uitofp.ll +++ b/test/Analysis/CostModel/X86/uitofp.ll @@ -2,644 +2,708 @@ ; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+avx -cost-model -analyze < %s | FileCheck --check-prefix=AVX --check-prefix=AVX1 %s ; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+avx2 -cost-model -analyze < %s | FileCheck --check-prefix=AVX --check-prefix=AVX2 %s ; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+avx512f -cost-model -analyze < %s | FileCheck --check-prefix=AVX512F %s +; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+avx512dq -cost-model -analyze < %s | FileCheck --check-prefix=AVX512DQ %s define <2 x double> @uitofpv2i8v2double(<2 x i8> %a) { - ; 
SSE2: uitofpv2i8v2double + ; SSE2-LABEL: uitofpv2i8v2double ; SSE2: cost of 20 {{.*}} uitofp ; - ; AVX1: uitofpv2i8v2double + ; AVX1-LABEL: uitofpv2i8v2double ; AVX1: cost of 4 {{.*}} uitofp ; - ; AVX2: uitofpv2i8v2double + ; AVX2-LABEL: uitofpv2i8v2double ; AVX2: cost of 4 {{.*}} uitofp ; - ; AVX512F: uitofpv2i8v2double - ; AVX512F: cost of 4 {{.*}} uitofp + ; AVX512F-LABEL: uitofpv2i8v2double + ; AVX512F: cost of 2 {{.*}} uitofp %1 = uitofp <2 x i8> %a to <2 x double> ret <2 x double> %1 } define <4 x double> @uitofpv4i8v4double(<4 x i8> %a) { - ; SSE2: uitofpv4i8v4double + ; SSE2-LABEL: uitofpv4i8v4double ; SSE2: cost of 40 {{.*}} uitofp ; - ; AVX1: uitofpv4i8v4double + ; AVX1-LABEL: uitofpv4i8v4double ; AVX1: cost of 2 {{.*}} uitofp ; - ; AVX2: uitofpv4i8v4double + ; AVX2-LABEL: uitofpv4i8v4double ; AVX2: cost of 2 {{.*}} uitofp ; - ; AVX512F: uitofpv4i8v4double + ; AVX512F-LABEL: uitofpv4i8v4double ; AVX512F: cost of 2 {{.*}} uitofp %1 = uitofp <4 x i8> %a to <4 x double> ret <4 x double> %1 } define <8 x double> @uitofpv8i8v8double(<8 x i8> %a) { - ; SSE2: uitofpv8i8v8double + ; SSE2-LABEL: uitofpv8i8v8double ; SSE2: cost of 80 {{.*}} uitofp ; - ; AVX1: uitofpv8i8v8double + ; AVX1-LABEL: uitofpv8i8v8double ; AVX1: cost of 20 {{.*}} uitofp ; - ; AVX2: uitofpv8i8v8double + ; AVX2-LABEL: uitofpv8i8v8double ; AVX2: cost of 20 {{.*}} uitofp ; - ; AVX512F: uitofpv8i8v8double - ; AVX512F: cost of 22 {{.*}} uitofp + ; AVX512F-LABEL: uitofpv8i8v8double + ; AVX512F: cost of 2 {{.*}} uitofp %1 = uitofp <8 x i8> %a to <8 x double> ret <8 x double> %1 } define <16 x double> @uitofpv16i8v16double(<16 x i8> %a) { - ; SSE2: uitofpv16i8v16double + ; SSE2-LABEL: uitofpv16i8v16double ; SSE2: cost of 160 {{.*}} uitofp ; - ; AVX1: uitofpv16i8v16double + ; AVX1-LABEL: uitofpv16i8v16double ; AVX1: cost of 40 {{.*}} uitofp ; - ; AVX2: uitofpv16i8v16double + ; AVX2-LABEL: uitofpv16i8v16double ; AVX2: cost of 40 {{.*}} uitofp ; - ; AVX512F: uitofpv16i8v16double + ; AVX512F-LABEL: uitofpv16i8v16double ; AVX512F: cost of 44 {{.*}} uitofp %1 = uitofp <16 x i8> %a to <16 x double> ret <16 x double> %1 } define <32 x double> @uitofpv32i8v32double(<32 x i8> %a) { - ; SSE2: uitofpv32i8v32double + ; SSE2-LABEL: uitofpv32i8v32double ; SSE2: cost of 320 {{.*}} uitofp ; - ; AVX1: uitofpv32i8v32double + ; AVX1-LABEL: uitofpv32i8v32double ; AVX1: cost of 80 {{.*}} uitofp ; - ; AVX2: uitofpv32i8v32double + ; AVX2-LABEL: uitofpv32i8v32double ; AVX2: cost of 80 {{.*}} uitofp ; - ; AVX512F: uitofpv32i8v32double + ; AVX512F-LABEL: uitofpv32i8v32double ; AVX512F: cost of 88 {{.*}} uitofp %1 = uitofp <32 x i8> %a to <32 x double> ret <32 x double> %1 } define <2 x double> @uitofpv2i16v2double(<2 x i16> %a) { - ; SSE2: uitofpv2i16v2double + ; SSE2-LABEL: uitofpv2i16v2double ; SSE2: cost of 20 {{.*}} uitofp ; - ; AVX1: uitofpv2i16v2double + ; AVX1-LABEL: uitofpv2i16v2double ; AVX1: cost of 4 {{.*}} uitofp ; - ; AVX2: uitofpv2i16v2double + ; AVX2-LABEL: uitofpv2i16v2double ; AVX2: cost of 4 {{.*}} uitofp ; - ; AVX512F: uitofpv2i16v2double - ; AVX512F: cost of 4 {{.*}} uitofp + ; AVX512F-LABEL: uitofpv2i16v2double + ; AVX512F: cost of 5 {{.*}} uitofp %1 = uitofp <2 x i16> %a to <2 x double> ret <2 x double> %1 } define <4 x double> @uitofpv4i16v4double(<4 x i16> %a) { - ; SSE2: uitofpv4i16v4double + ; SSE2-LABEL: uitofpv4i16v4double ; SSE2: cost of 40 {{.*}} uitofp ; - ; AVX1: uitofpv4i16v4double + ; AVX1-LABEL: uitofpv4i16v4double ; AVX1: cost of 2 {{.*}} uitofp ; - ; AVX2: uitofpv4i16v4double + ; AVX2-LABEL: uitofpv4i16v4double ; 
AVX2: cost of 2 {{.*}} uitofp ; - ; AVX512F: uitofpv4i16v4double + ; AVX512F-LABEL: uitofpv4i16v4double ; AVX512F: cost of 2 {{.*}} uitofp %1 = uitofp <4 x i16> %a to <4 x double> ret <4 x double> %1 } define <8 x double> @uitofpv8i16v8double(<8 x i16> %a) { - ; SSE2: uitofpv8i16v8double + ; SSE2-LABEL: uitofpv8i16v8double ; SSE2: cost of 80 {{.*}} uitofp ; - ; AVX1: uitofpv8i16v8double + ; AVX1-LABEL: uitofpv8i16v8double ; AVX1: cost of 20 {{.*}} uitofp ; - ; AVX2: uitofpv8i16v8double + ; AVX2-LABEL: uitofpv8i16v8double ; AVX2: cost of 20 {{.*}} uitofp ; - ; AVX512F: uitofpv8i16v8double - ; AVX512F: cost of 22 {{.*}} uitofp + ; AVX512F-LABEL: uitofpv8i16v8double + ; AVX512F: cost of 2 {{.*}} uitofp %1 = uitofp <8 x i16> %a to <8 x double> ret <8 x double> %1 } define <16 x double> @uitofpv16i16v16double(<16 x i16> %a) { - ; SSE2: uitofpv16i16v16double + ; SSE2-LABEL: uitofpv16i16v16double ; SSE2: cost of 160 {{.*}} uitofp ; - ; AVX1: uitofpv16i16v16double + ; AVX1-LABEL: uitofpv16i16v16double ; AVX1: cost of 40 {{.*}} uitofp ; - ; AVX2: uitofpv16i16v16double + ; AVX2-LABEL: uitofpv16i16v16double ; AVX2: cost of 40 {{.*}} uitofp ; - ; AVX512F: uitofpv16i16v16double + ; AVX512F-LABEL: uitofpv16i16v16double ; AVX512F: cost of 44 {{.*}} uitofp %1 = uitofp <16 x i16> %a to <16 x double> ret <16 x double> %1 } define <32 x double> @uitofpv32i16v32double(<32 x i16> %a) { - ; SSE2: uitofpv32i16v32double + ; SSE2-LABEL: uitofpv32i16v32double ; SSE2: cost of 320 {{.*}} uitofp ; - ; AVX1: uitofpv32i16v32double + ; AVX1-LABEL: uitofpv32i16v32double ; AVX1: cost of 80 {{.*}} uitofp ; - ; AVX2: uitofpv32i16v32double + ; AVX2-LABEL: uitofpv32i16v32double ; AVX2: cost of 80 {{.*}} uitofp ; - ; AVX512F: uitofpv32i16v32double + ; AVX512F-LABEL: uitofpv32i16v32double ; AVX512F: cost of 88 {{.*}} uitofp %1 = uitofp <32 x i16> %a to <32 x double> ret <32 x double> %1 } define <2 x double> @uitofpv2i32v2double(<2 x i32> %a) { - ; SSE2: uitofpv2i32v2double + ; SSE2-LABEL: uitofpv2i32v2double ; SSE2: cost of 20 {{.*}} uitofp ; - ; AVX1: uitofpv2i32v2double + ; AVX1-LABEL: uitofpv2i32v2double ; AVX1: cost of 4 {{.*}} uitofp ; - ; AVX2: uitofpv2i32v2double + ; AVX2-LABEL: uitofpv2i32v2double ; AVX2: cost of 4 {{.*}} uitofp ; - ; AVX512F: uitofpv2i32v2double + ; AVX512F-LABEL: uitofpv2i32v2double ; AVX512F: cost of 4 {{.*}} uitofp %1 = uitofp <2 x i32> %a to <2 x double> ret <2 x double> %1 } define <4 x double> @uitofpv4i32v4double(<4 x i32> %a) { - ; SSE2: uitofpv4i32v4double + ; SSE2-LABEL: uitofpv4i32v4double ; SSE2: cost of 40 {{.*}} uitofp ; - ; AVX1: uitofpv4i32v4double + ; AVX1-LABEL: uitofpv4i32v4double ; AVX1: cost of 6 {{.*}} uitofp ; - ; AVX2: uitofpv4i32v4double + ; AVX2-LABEL: uitofpv4i32v4double ; AVX2: cost of 6 {{.*}} uitofp ; - ; AVX512F: uitofpv4i32v4double - ; AVX512F: cost of 6 {{.*}} uitofp + ; AVX512F-LABEL: uitofpv4i32v4double + ; AVX512F: cost of 1 {{.*}} uitofp %1 = uitofp <4 x i32> %a to <4 x double> ret <4 x double> %1 } define <8 x double> @uitofpv8i32v8double(<8 x i32> %a) { - ; SSE2: uitofpv8i32v8double + ; SSE2-LABEL: uitofpv8i32v8double ; SSE2: cost of 80 {{.*}} uitofp ; - ; AVX1: uitofpv8i32v8double + ; AVX1-LABEL: uitofpv8i32v8double ; AVX1: cost of 20 {{.*}} uitofp ; - ; AVX2: uitofpv8i32v8double + ; AVX2-LABEL: uitofpv8i32v8double ; AVX2: cost of 20 {{.*}} uitofp ; - ; AVX512F: uitofpv8i32v8double - ; AVX512F: cost of 22 {{.*}} uitofp + ; AVX512F-LABEL: uitofpv8i32v8double + ; AVX512F: cost of 1 {{.*}} uitofp %1 = uitofp <8 x i32> %a to <8 x double> ret <8 x double> %1 } define 
<16 x double> @uitofpv16i32v16double(<16 x i32> %a) { - ; SSE2: uitofpv16i32v16double + ; SSE2-LABEL: uitofpv16i32v16double ; SSE2: cost of 160 {{.*}} uitofp ; - ; AVX1: uitofpv16i32v16double + ; AVX1-LABEL: uitofpv16i32v16double ; AVX1: cost of 40 {{.*}} uitofp ; - ; AVX2: uitofpv16i32v16double + ; AVX2-LABEL: uitofpv16i32v16double ; AVX2: cost of 40 {{.*}} uitofp ; - ; AVX512F: uitofpv16i32v16double + ; AVX512F-LABEL: uitofpv16i32v16double ; AVX512F: cost of 44 {{.*}} uitofp %1 = uitofp <16 x i32> %a to <16 x double> ret <16 x double> %1 } define <32 x double> @uitofpv32i32v32double(<32 x i32> %a) { - ; SSE2: uitofpv32i32v32double + ; SSE2-LABEL: uitofpv32i32v32double ; SSE2: cost of 320 {{.*}} uitofp ; - ; AVX1: uitofpv32i32v32double + ; AVX1-LABEL: uitofpv32i32v32double ; AVX1: cost of 80 {{.*}} uitofp ; - ; AVX2: uitofpv32i32v32double + ; AVX2-LABEL: uitofpv32i32v32double ; AVX2: cost of 80 {{.*}} uitofp ; - ; AVX512F: uitofpv32i32v32double + ; AVX512F-LABEL: uitofpv32i32v32double ; AVX512F: cost of 88 {{.*}} uitofp %1 = uitofp <32 x i32> %a to <32 x double> ret <32 x double> %1 } define <2 x double> @uitofpv2i64v2double(<2 x i64> %a) { - ; SSE2: uitofpv2i64v2double + ; SSE2-LABEL: uitofpv2i64v2double ; SSE2: cost of 20 {{.*}} uitofp ; - ; AVX1: uitofpv2i64v2double + ; AVX1-LABEL: uitofpv2i64v2double ; AVX1: cost of 20 {{.*}} uitofp ; - ; AVX2: uitofpv2i64v2double + ; AVX2-LABEL: uitofpv2i64v2double ; AVX2: cost of 20 {{.*}} uitofp ; - ; AVX512F: uitofpv2i64v2double - ; AVX512F: cost of 20 {{.*}} uitofp + ; AVX512F-LABEL: uitofpv2i64v2double + ; AVX512F: cost of 5 {{.*}} uitofp + ; + ; AVX512DQ: uitofpv2i64v2double + ; AVX512DQ: cost of 1 {{.*}} uitofp %1 = uitofp <2 x i64> %a to <2 x double> ret <2 x double> %1 } define <4 x double> @uitofpv4i64v4double(<4 x i64> %a) { - ; SSE2: uitofpv4i64v4double + ; SSE2-LABEL: uitofpv4i64v4double ; SSE2: cost of 40 {{.*}} uitofp ; - ; AVX1: uitofpv4i64v4double + ; AVX1-LABEL: uitofpv4i64v4double ; AVX1: cost of 40 {{.*}} uitofp ; - ; AVX2: uitofpv4i64v4double + ; AVX2-LABEL: uitofpv4i64v4double ; AVX2: cost of 40 {{.*}} uitofp ; - ; AVX512F: uitofpv4i64v4double - ; AVX512F: cost of 40 {{.*}} uitofp + ; AVX512F-LABEL: uitofpv4i64v4double + ; AVX512F: cost of 12 {{.*}} uitofp + ; + ; AVX512DQ: uitofpv4i64v4double + ; AVX512DQ: cost of 1 {{.*}} uitofp %1 = uitofp <4 x i64> %a to <4 x double> ret <4 x double> %1 } define <8 x double> @uitofpv8i64v8double(<8 x i64> %a) { - ; SSE2: uitofpv8i64v8double + ; SSE2-LABEL: uitofpv8i64v8double ; SSE2: cost of 80 {{.*}} uitofp ; - ; AVX1: uitofpv8i64v8double + ; AVX1-LABEL: uitofpv8i64v8double ; AVX1: cost of 20 {{.*}} uitofp ; - ; AVX2: uitofpv8i64v8double + ; AVX2-LABEL: uitofpv8i64v8double ; AVX2: cost of 20 {{.*}} uitofp ; - ; AVX512F: uitofpv8i64v8double - ; AVX512F: cost of 22 {{.*}} uitofp + ; AVX512F-LABEL: uitofpv8i64v8double + ; AVX512F: cost of 26 {{.*}} uitofp + ; + ; AVX512DQ: uitofpv8i64v8double + ; AVX512DQ: cost of 1 {{.*}} uitofp %1 = uitofp <8 x i64> %a to <8 x double> ret <8 x double> %1 } define <16 x double> @uitofpv16i64v16double(<16 x i64> %a) { - ; SSE2: uitofpv16i64v16double + ; SSE2-LABEL: uitofpv16i64v16double ; SSE2: cost of 160 {{.*}} uitofp ; - ; AVX1: uitofpv16i64v16double + ; AVX1-LABEL: uitofpv16i64v16double ; AVX1: cost of 40 {{.*}} uitofp ; - ; AVX2: uitofpv16i64v16double + ; AVX2-LABEL: uitofpv16i64v16double ; AVX2: cost of 40 {{.*}} uitofp ; - ; AVX512F: uitofpv16i64v16double + ; AVX512F-LABEL: uitofpv16i64v16double ; AVX512F: cost of 44 {{.*}} uitofp + ; + ; AVX512DQ: 
uitofpv16i64v16double + ; AVX512DQ: cost of 44 {{.*}} uitofp %1 = uitofp <16 x i64> %a to <16 x double> ret <16 x double> %1 } define <32 x double> @uitofpv32i64v32double(<32 x i64> %a) { - ; SSE2: uitofpv32i64v32double + ; SSE2-LABEL: uitofpv32i64v32double ; SSE2: cost of 320 {{.*}} uitofp ; - ; AVX1: uitofpv32i64v32double + ; AVX1-LABEL: uitofpv32i64v32double ; AVX1: cost of 80 {{.*}} uitofp ; - ; AVX2: uitofpv32i64v32double + ; AVX2-LABEL: uitofpv32i64v32double ; AVX2: cost of 80 {{.*}} uitofp ; - ; AVX512F: uitofpv32i64v32double + ; AVX512F-LABEL: uitofpv32i64v32double ; AVX512F: cost of 88 {{.*}} uitofp + ; + ; AVX512DQ: uitofpv32i64v32double + ; AVX512DQ: cost of 88 {{.*}} uitofp %1 = uitofp <32 x i64> %a to <32 x double> ret <32 x double> %1 } define <2 x float> @uitofpv2i8v2float(<2 x i8> %a) { - ; SSE2: uitofpv2i8v2float + ; SSE2-LABEL: uitofpv2i8v2float ; SSE2: cost of 15 {{.*}} uitofp ; - ; AVX1: uitofpv2i8v2float + ; AVX1-LABEL: uitofpv2i8v2float ; AVX1: cost of 4 {{.*}} uitofp ; - ; AVX2: uitofpv2i8v2float + ; AVX2-LABEL: uitofpv2i8v2float ; AVX2: cost of 4 {{.*}} uitofp ; - ; AVX512F: uitofpv2i8v2float + ; AVX512F-LABEL: uitofpv2i8v2float ; AVX512F: cost of 4 {{.*}} uitofp %1 = uitofp <2 x i8> %a to <2 x float> ret <2 x float> %1 } define <4 x float> @uitofpv4i8v4float(<4 x i8> %a) { - ; SSE2: uitofpv4i8v4float + ; SSE2-LABEL: uitofpv4i8v4float ; SSE2: cost of 8 {{.*}} uitofp ; - ; AVX1: uitofpv4i8v4float + ; AVX1-LABEL: uitofpv4i8v4float ; AVX1: cost of 2 {{.*}} uitofp ; - ; AVX2: uitofpv4i8v4float + ; AVX2-LABEL: uitofpv4i8v4float ; AVX2: cost of 2 {{.*}} uitofp ; - ; AVX512F: uitofpv4i8v4float + ; AVX512F-LABEL: uitofpv4i8v4float ; AVX512F: cost of 2 {{.*}} uitofp %1 = uitofp <4 x i8> %a to <4 x float> ret <4 x float> %1 } define <8 x float> @uitofpv8i8v8float(<8 x i8> %a) { - ; SSE2: uitofpv8i8v8float + ; SSE2-LABEL: uitofpv8i8v8float ; SSE2: cost of 15 {{.*}} uitofp ; - ; AVX1: uitofpv8i8v8float + ; AVX1-LABEL: uitofpv8i8v8float ; AVX1: cost of 5 {{.*}} uitofp ; - ; AVX2: uitofpv8i8v8float + ; AVX2-LABEL: uitofpv8i8v8float ; AVX2: cost of 5 {{.*}} uitofp ; - ; AVX512F: uitofpv8i8v8float - ; AVX512F: cost of 5 {{.*}} uitofp + ; AVX512F-LABEL: uitofpv8i8v8float + ; AVX512F: cost of 2 {{.*}} uitofp %1 = uitofp <8 x i8> %a to <8 x float> ret <8 x float> %1 } define <16 x float> @uitofpv16i8v16float(<16 x i8> %a) { - ; SSE2: uitofpv16i8v16float + ; SSE2-LABEL: uitofpv16i8v16float ; SSE2: cost of 8 {{.*}} uitofp ; - ; AVX1: uitofpv16i8v16float + ; AVX1-LABEL: uitofpv16i8v16float ; AVX1: cost of 44 {{.*}} uitofp ; - ; AVX2: uitofpv16i8v16float + ; AVX2-LABEL: uitofpv16i8v16float ; AVX2: cost of 44 {{.*}} uitofp ; - ; AVX512F: uitofpv16i8v16float - ; AVX512F: cost of 46 {{.*}} uitofp + ; AVX512F-LABEL: uitofpv16i8v16float + ; AVX512F: cost of 2 {{.*}} uitofp %1 = uitofp <16 x i8> %a to <16 x float> ret <16 x float> %1 } define <32 x float> @uitofpv32i8v32float(<32 x i8> %a) { - ; SSE2: uitofpv32i8v32float + ; SSE2-LABEL: uitofpv32i8v32float ; SSE2: cost of 16 {{.*}} uitofp ; - ; AVX1: uitofpv32i8v32float + ; AVX1-LABEL: uitofpv32i8v32float ; AVX1: cost of 88 {{.*}} uitofp ; - ; AVX2: uitofpv32i8v32float + ; AVX2-LABEL: uitofpv32i8v32float ; AVX2: cost of 88 {{.*}} uitofp ; - ; AVX512F: uitofpv32i8v32float + ; AVX512F-LABEL: uitofpv32i8v32float ; AVX512F: cost of 92 {{.*}} uitofp %1 = uitofp <32 x i8> %a to <32 x float> ret <32 x float> %1 } define <2 x float> @uitofpv2i16v2float(<2 x i16> %a) { - ; SSE2: uitofpv2i16v2float + ; SSE2-LABEL: uitofpv2i16v2float ; SSE2: cost of 15 
{{.*}} uitofp ; - ; AVX1: uitofpv2i16v2float + ; AVX1-LABEL: uitofpv2i16v2float ; AVX1: cost of 4 {{.*}} uitofp ; - ; AVX2: uitofpv2i16v2float + ; AVX2-LABEL: uitofpv2i16v2float ; AVX2: cost of 4 {{.*}} uitofp ; - ; AVX512F: uitofpv2i16v2float + ; AVX512F-LABEL: uitofpv2i16v2float ; AVX512F: cost of 4 {{.*}} uitofp %1 = uitofp <2 x i16> %a to <2 x float> ret <2 x float> %1 } define <4 x float> @uitofpv4i16v4float(<4 x i16> %a) { - ; SSE2: uitofpv4i16v4float + ; SSE2-LABEL: uitofpv4i16v4float ; SSE2: cost of 8 {{.*}} uitofp ; - ; AVX1: uitofpv4i16v4float + ; AVX1-LABEL: uitofpv4i16v4float ; AVX1: cost of 2 {{.*}} uitofp ; - ; AVX2: uitofpv4i16v4float + ; AVX2-LABEL: uitofpv4i16v4float ; AVX2: cost of 2 {{.*}} uitofp ; - ; AVX512F: uitofpv4i16v4float + ; AVX512F-LABEL: uitofpv4i16v4float ; AVX512F: cost of 2 {{.*}} uitofp %1 = uitofp <4 x i16> %a to <4 x float> ret <4 x float> %1 } define <8 x float> @uitofpv8i16v8float(<8 x i16> %a) { - ; SSE2: uitofpv8i16v8float + ; SSE2-LABEL: uitofpv8i16v8float ; SSE2: cost of 15 {{.*}} uitofp ; - ; AVX1: uitofpv8i16v8float + ; AVX1-LABEL: uitofpv8i16v8float ; AVX1: cost of 5 {{.*}} uitofp ; - ; AVX2: uitofpv8i16v8float + ; AVX2-LABEL: uitofpv8i16v8float ; AVX2: cost of 5 {{.*}} uitofp ; - ; AVX512F: uitofpv8i16v8float - ; AVX512F: cost of 5 {{.*}} uitofp + ; AVX512F-LABEL: uitofpv8i16v8float + ; AVX512F: cost of 2 {{.*}} uitofp %1 = uitofp <8 x i16> %a to <8 x float> ret <8 x float> %1 } define <16 x float> @uitofpv16i16v16float(<16 x i16> %a) { - ; SSE2: uitofpv16i16v16float + ; SSE2-LABEL: uitofpv16i16v16float ; SSE2: cost of 30 {{.*}} uitofp ; - ; AVX1: uitofpv16i16v16float + ; AVX1-LABEL: uitofpv16i16v16float ; AVX1: cost of 44 {{.*}} uitofp ; - ; AVX2: uitofpv16i16v16float + ; AVX2-LABEL: uitofpv16i16v16float ; AVX2: cost of 44 {{.*}} uitofp ; - ; AVX512F: uitofpv16i16v16float - ; AVX512F: cost of 46 {{.*}} uitofp + ; AVX512F-LABEL: uitofpv16i16v16float + ; AVX512F: cost of 2 {{.*}} uitofp %1 = uitofp <16 x i16> %a to <16 x float> ret <16 x float> %1 } define <32 x float> @uitofpv32i16v32float(<32 x i16> %a) { - ; SSE2: uitofpv32i16v32float + ; SSE2-LABEL: uitofpv32i16v32float ; SSE2: cost of 60 {{.*}} uitofp ; - ; AVX1: uitofpv32i16v32float + ; AVX1-LABEL: uitofpv32i16v32float ; AVX1: cost of 88 {{.*}} uitofp ; - ; AVX2: uitofpv32i16v32float + ; AVX2-LABEL: uitofpv32i16v32float ; AVX2: cost of 88 {{.*}} uitofp ; - ; AVX512F: uitofpv32i16v32float + ; AVX512F-LABEL: uitofpv32i16v32float ; AVX512F: cost of 92 {{.*}} uitofp %1 = uitofp <32 x i16> %a to <32 x float> ret <32 x float> %1 } define <2 x float> @uitofpv2i32v2float(<2 x i32> %a) { - ; SSE2: uitofpv2i32v2float + ; SSE2-LABEL: uitofpv2i32v2float ; SSE2: cost of 15 {{.*}} uitofp ; - ; AVX1: uitofpv2i32v2float + ; AVX1-LABEL: uitofpv2i32v2float ; AVX1: cost of 4 {{.*}} uitofp ; - ; AVX2: uitofpv2i32v2float + ; AVX2-LABEL: uitofpv2i32v2float ; AVX2: cost of 4 {{.*}} uitofp ; - ; AVX512F: uitofpv2i32v2float - ; AVX512F: cost of 4 {{.*}} uitofp + ; AVX512F-LABEL: uitofpv2i32v2float + ; AVX512F: cost of 2 {{.*}} uitofp %1 = uitofp <2 x i32> %a to <2 x float> ret <2 x float> %1 } define <4 x float> @uitofpv4i32v4float(<4 x i32> %a) { - ; SSE2: uitofpv4i32v4float + ; SSE2-LABEL: uitofpv4i32v4float ; SSE2: cost of 8 {{.*}} uitofp ; - ; AVX1: uitofpv4i32v4float + ; AVX1-LABEL: uitofpv4i32v4float ; AVX1: cost of 6 {{.*}} uitofp ; - ; AVX2: uitofpv4i32v4float + ; AVX2-LABEL: uitofpv4i32v4float ; AVX2: cost of 6 {{.*}} uitofp ; - ; AVX512F: uitofpv4i32v4float - ; AVX512F: cost of 6 {{.*}} uitofp + ; 
AVX512F-LABEL: uitofpv4i32v4float + ; AVX512F: cost of 1 {{.*}} uitofp %1 = uitofp <4 x i32> %a to <4 x float> ret <4 x float> %1 } define <8 x float> @uitofpv8i32v8float(<8 x i32> %a) { - ; SSE2: uitofpv8i32v8float + ; SSE2-LABEL: uitofpv8i32v8float ; SSE2: cost of 16 {{.*}} uitofp ; - ; AVX1: uitofpv8i32v8float + ; AVX1-LABEL: uitofpv8i32v8float ; AVX1: cost of 9 {{.*}} uitofp ; - ; AVX2: uitofpv8i32v8float + ; AVX2-LABEL: uitofpv8i32v8float ; AVX2: cost of 8 {{.*}} uitofp ; - ; AVX512F: uitofpv8i32v8float - ; AVX512F: cost of 8 {{.*}} uitofp + ; AVX512F-LABEL: uitofpv8i32v8float + ; AVX512F: cost of 1 {{.*}} uitofp %1 = uitofp <8 x i32> %a to <8 x float> ret <8 x float> %1 } define <16 x float> @uitofpv16i32v16float(<16 x i32> %a) { - ; SSE2: uitofpv16i32v16float + ; SSE2-LABEL: uitofpv16i32v16float ; SSE2: cost of 32 {{.*}} uitofp ; - ; AVX1: uitofpv16i32v16float + ; AVX1-LABEL: uitofpv16i32v16float ; AVX1: cost of 44 {{.*}} uitofp ; - ; AVX2: uitofpv16i32v16float + ; AVX2-LABEL: uitofpv16i32v16float ; AVX2: cost of 44 {{.*}} uitofp ; - ; AVX512F: uitofpv16i32v16float - ; AVX512F: cost of 46 {{.*}} uitofp + ; AVX512F-LABEL: uitofpv16i32v16float + ; AVX512F: cost of 1 {{.*}} uitofp %1 = uitofp <16 x i32> %a to <16 x float> ret <16 x float> %1 } define <32 x float> @uitofpv32i32v32float(<32 x i32> %a) { - ; SSE2: uitofpv32i32v32float + ; SSE2-LABEL: uitofpv32i32v32float ; SSE2: cost of 64 {{.*}} uitofp ; - ; AVX1: uitofpv32i32v32float + ; AVX1-LABEL: uitofpv32i32v32float ; AVX1: cost of 88 {{.*}} uitofp ; - ; AVX2: uitofpv32i32v32float + ; AVX2-LABEL: uitofpv32i32v32float ; AVX2: cost of 88 {{.*}} uitofp ; - ; AVX512F: uitofpv32i32v32float + ; AVX512F-LABEL: uitofpv32i32v32float ; AVX512F: cost of 92 {{.*}} uitofp %1 = uitofp <32 x i32> %a to <32 x float> ret <32 x float> %1 } define <2 x float> @uitofpv2i64v2float(<2 x i64> %a) { - ; SSE2: uitofpv2i64v2float + ; SSE2-LABEL: uitofpv2i64v2float ; SSE2: cost of 15 {{.*}} uitofp ; - ; AVX1: uitofpv2i64v2float + ; AVX1-LABEL: uitofpv2i64v2float ; AVX1: cost of 4 {{.*}} uitofp ; - ; AVX2: uitofpv2i64v2float + ; AVX2-LABEL: uitofpv2i64v2float ; AVX2: cost of 4 {{.*}} uitofp ; - ; AVX512F: uitofpv2i64v2float + ; AVX512F-LABEL: uitofpv2i64v2float ; AVX512F: cost of 4 {{.*}} uitofp %1 = uitofp <2 x i64> %a to <2 x float> ret <2 x float> %1 } define <4 x float> @uitofpv4i64v4float(<4 x i64> %a) { - ; SSE2: uitofpv4i64v4float + ; SSE2-LABEL: uitofpv4i64v4float ; SSE2: cost of 30 {{.*}} uitofp ; - ; AVX1: uitofpv4i64v4float + ; AVX1-LABEL: uitofpv4i64v4float ; AVX1: cost of 10 {{.*}} uitofp ; - ; AVX2: uitofpv4i64v4float + ; AVX2-LABEL: uitofpv4i64v4float ; AVX2: cost of 10 {{.*}} uitofp ; - ; AVX512F: uitofpv4i64v4float + ; AVX512F-LABEL: uitofpv4i64v4float ; AVX512F: cost of 10 {{.*}} uitofp %1 = uitofp <4 x i64> %a to <4 x float> ret <4 x float> %1 } define <8 x float> @uitofpv8i64v8float(<8 x i64> %a) { - ; SSE2: uitofpv8i64v8float + ; SSE2-LABEL: uitofpv8i64v8float ; SSE2: cost of 60 {{.*}} uitofp ; - ; AVX1: uitofpv8i64v8float + ; AVX1-LABEL: uitofpv8i64v8float ; AVX1: cost of 22 {{.*}} uitofp ; - ; AVX2: uitofpv8i64v8float + ; AVX2-LABEL: uitofpv8i64v8float ; AVX2: cost of 22 {{.*}} uitofp ; - ; AVX512F: uitofpv8i64v8float + ; AVX512F-LABEL: uitofpv8i64v8float ; AVX512F: cost of 22 {{.*}} uitofp %1 = uitofp <8 x i64> %a to <8 x float> ret <8 x float> %1 } define <16 x float> @uitofpv16i64v16float(<16 x i64> %a) { - ; SSE2: uitofpv16i64v16float + ; SSE2-LABEL: uitofpv16i64v16float ; SSE2: cost of 120 {{.*}} uitofp ; - ; AVX1: 
uitofpv16i64v16float + ; AVX1-LABEL: uitofpv16i64v16float ; AVX1: cost of 44 {{.*}} uitofp ; - ; AVX2: uitofpv16i64v16float + ; AVX2-LABEL: uitofpv16i64v16float ; AVX2: cost of 44 {{.*}} uitofp ; - ; AVX512F: uitofpv16i64v16float + ; AVX512F-LABEL: uitofpv16i64v16float ; AVX512F: cost of 46 {{.*}} uitofp %1 = uitofp <16 x i64> %a to <16 x float> ret <16 x float> %1 } define <32 x float> @uitofpv32i64v32float(<32 x i64> %a) { - ; SSE2: uitofpv32i64v32float + ; SSE2-LABEL: uitofpv32i64v32float ; SSE2: cost of 240 {{.*}} uitofp ; - ; AVX1: uitofpv32i64v32float + ; AVX1-LABEL: uitofpv32i64v32float ; AVX1: cost of 88 {{.*}} uitofp ; - ; AVX2: uitofpv32i64v32float + ; AVX2-LABEL: uitofpv32i64v32float ; AVX2: cost of 88 {{.*}} uitofp ; - ; AVX512F: uitofpv32i64v32float + ; AVX512F-LABEL: uitofpv32i64v32float ; AVX512F: cost of 92 {{.*}} uitofp %1 = uitofp <32 x i64> %a to <32 x float> ret <32 x float> %1 } +define <8 x i32> @fptouiv8f32v8i32(<8 x float> %a) { + ; AVX512F-LABEL: fptouiv8f32v8i32 + ; AVX512F: cost of 1 {{.*}} fptoui + %1 = fptoui <8 x float> %a to <8 x i32> + ret <8 x i32> %1 +} + +define <4 x i32> @fptouiv4f32v4i32(<4 x float> %a) { + ; AVX512F-LABEL: fptouiv4f32v4i32 + ; AVX512F: cost of 1 {{.*}} fptoui + %1 = fptoui <4 x float> %a to <4 x i32> + ret <4 x i32> %1 +} + +define <2 x i32> @fptouiv2f32v2i32(<2 x float> %a) { + ; AVX512F-LABEL: fptouiv2f32v2i32 + ; AVX512F: cost of 1 {{.*}} fptoui + %1 = fptoui <2 x float> %a to <2 x i32> + ret <2 x i32> %1 +} + +define <16 x i32> @fptouiv16f32v16i32(<16 x float> %a) { + ; AVX512F-LABEL: fptouiv16f32v16i32 + ; AVX512F: cost of 1 {{.*}} fptoui + %1 = fptoui <16 x float> %a to <16 x i32> + ret <16 x i32> %1 +} + +define <8 x i64> @fptouiv8f32v8i64(<8 x float> %a) { + ; AVX512DQ-LABEL: fptouiv8f32v8i64 + ; AVX512DQ: cost of 1 {{.*}} fptoui + %1 = fptoui <8 x float> %a to <8 x i64> + ret <8 x i64> %1 +} + +define <4 x i64> @fptouiv4f32v4i64(<4 x float> %a) { + ; AVX512DQ-LABEL: fptouiv4f32v4i64 + ; AVX512DQ: cost of 1 {{.*}} fptoui + %1 = fptoui <4 x float> %a to <4 x i64> + ret <4 x i64> %1 +} + +define <2 x i64> @fptouiv2f32v2i64(<2 x float> %a) { + ; AVX512DQ-LABEL: fptouiv2f32v2i64 + ; AVX512DQ: cost of 1 {{.*}} fptoui + %1 = fptoui <2 x float> %a to <2 x i64> + ret <2 x i64> %1 +} diff --git a/test/Analysis/CostModel/X86/vector_gep.ll b/test/Analysis/CostModel/X86/vector_gep.ll new file mode 100644 index 000000000000..e49f25871d66 --- /dev/null +++ b/test/Analysis/CostModel/X86/vector_gep.ll @@ -0,0 +1,17 @@ +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-linux-unknown-unknown -mattr=+avx512f | FileCheck %s + +%struct.S = type { [1000 x i32] } + + +declare <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>) + +define <4 x i32> @foov(<4 x %struct.S*> %s, i64 %base){ + %temp = insertelement <4 x i64> undef, i64 %base, i32 0 + %vector = shufflevector <4 x i64> %temp, <4 x i64> undef, <4 x i32> zeroinitializer +;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds %struct.S + %B = getelementptr inbounds %struct.S, <4 x %struct.S*> %s, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer +;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds [1000 x i32] + %arrayidx = getelementptr inbounds [1000 x i32], <4 x [1000 x i32]*> %B, <4 x i64> zeroinitializer, <4 x i64> %vector + %res = call <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %arrayidx, i32 4, <4 x i1> , <4 x i32> undef) + ret <4 x i32> %res +} diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 9bee504efece..138450ba8e02 
100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -39,6 +39,7 @@ set(LLVM_TEST_DEPENDS llvm-dis llvm-dsymutil llvm-dwarfdump + llvm-dwp llvm-extract llvm-lib llvm-link diff --git a/test/CodeGen/AArch64/arm64-atomic-128.ll b/test/CodeGen/AArch64/arm64-atomic-128.ll index a76cf74a6d0c..44c24c51f0df 100644 --- a/test/CodeGen/AArch64/arm64-atomic-128.ll +++ b/test/CodeGen/AArch64/arm64-atomic-128.ll @@ -173,10 +173,13 @@ define i128 @atomic_load_seq_cst(i128* %p) { ret i128 %r } -define i128 @atomic_load_relaxed(i128* %p) { +define i128 @atomic_load_relaxed(i64, i64, i128* %p) { ; CHECK-LABEL: atomic_load_relaxed: ; CHECK-NOT: dmb -; CHECK: ldxp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0] +; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: ldxp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x2] +; CHECK-NEXT: stxp [[SUCCESS:w[0-9]+]], [[LO]], [[HI]], [x2] +; CHECK: cbnz [[SUCCESS]], [[LABEL]] ; CHECK-NOT: dmb %r = load atomic i128, i128* %p monotonic, align 16 ret i128 %r diff --git a/test/CodeGen/AArch64/arm64-long-shift.ll b/test/CodeGen/AArch64/arm64-long-shift.ll index d5baf16bdd5c..ad89d3ff711b 100644 --- a/test/CodeGen/AArch64/arm64-long-shift.ll +++ b/test/CodeGen/AArch64/arm64-long-shift.ll @@ -2,18 +2,20 @@ define i128 @shl(i128 %r, i128 %s) nounwind readnone { ; CHECK-LABEL: shl: -; CHECK: lsl [[XREG_0:x[0-9]+]], x1, x2 -; CHECK-NEXT: orr w[[XREG_1:[0-9]+]], wzr, #0x40 -; CHECK-NEXT: sub [[XREG_2:x[0-9]+]], x[[XREG_1]], x2 -; CHECK-NEXT: lsr [[XREG_3:x[0-9]+]], x0, [[XREG_2]] -; CHECK-NEXT: orr [[XREG_6:x[0-9]+]], [[XREG_3]], [[XREG_0]] -; CHECK-NEXT: sub [[XREG_4:x[0-9]+]], x2, #64 -; CHECK-NEXT: lsl [[XREG_5:x[0-9]+]], x0, [[XREG_4]] -; CHECK-NEXT: cmp [[XREG_4]], #0 -; CHECK-NEXT: csel x1, [[XREG_5]], [[XREG_6]], ge -; CHECK-NEXT: lsl [[SMALLSHIFT_LO:x[0-9]+]], x0, x2 -; CHECK-NEXT: csel x0, xzr, [[SMALLSHIFT_LO]], ge -; CHECK-NEXT: ret +; CHECK: orr w[[SIXTY_FOUR:[0-9]+]], wzr, #0x40 +; CHECK: sub [[REV_SHIFT:x[0-9]+]], x[[SIXTY_FOUR]], x2 +; CHECK: lsr [[LO_FOR_HI_NORMAL:x[0-9]+]], x0, [[REV_SHIFT]] +; CHECK: cmp x2, #0 +; CHECK: csel [[LO_FOR_HI:x[0-9]+]], xzr, [[LO_FOR_HI_NORMAL]], eq +; CHECK: lsl [[HI_FOR_HI:x[0-9]+]], x1, x2 +; CHECK: orr [[HI_NORMAL:x[0-9]+]], [[LO_FOR_HI]], [[HI_FOR_HI]] +; CHECK: sub [[EXTRA_SHIFT:x[0-9]+]], x2, #64 +; CHECK: lsl [[HI_BIG_SHIFT:x[0-9]+]], x0, [[EXTRA_SHIFT]] +; CHECK: cmp [[EXTRA_SHIFT]], #0 +; CHECK: csel x1, [[HI_BIG_SHIFT]], [[HI_NORMAL]], ge +; CHECK: lsl [[SMALLSHIFT_LO:x[0-9]+]], x0, x2 +; CHECK: csel x0, xzr, [[SMALLSHIFT_LO]], ge +; CHECK: ret %shl = shl i128 %r, %s ret i128 %shl @@ -21,19 +23,21 @@ define i128 @shl(i128 %r, i128 %s) nounwind readnone { define i128 @ashr(i128 %r, i128 %s) nounwind readnone { ; CHECK-LABEL: ashr: -; CHECK: lsr [[XREG_0:x[0-9]+]], x0, x2 -; CHECK-NEXT: orr w[[XREG_1:[0-9]+]], wzr, #0x40 -; CHECK-NEXT: sub [[XREG_2:x[0-9]+]], x[[XREG_1]], x2 -; CHECK-NEXT: lsl [[XREG_3:x[0-9]+]], x1, [[XREG_2]] -; CHECK-NEXT: orr [[XREG_4:x[0-9]+]], [[XREG_0]], [[XREG_3]] -; CHECK-NEXT: sub [[XREG_5:x[0-9]+]], x2, #64 -; CHECK-NEXT: asr [[XREG_6:x[0-9]+]], x1, [[XREG_5]] -; CHECK-NEXT: cmp [[XREG_5]], #0 -; CHECK-NEXT: csel x0, [[XREG_6]], [[XREG_4]], ge -; CHECK-NEXT: asr [[SMALLSHIFT_HI:x[0-9]+]], x1, x2 -; CHECK-NEXT: asr [[BIGSHIFT_HI:x[0-9]+]], x1, #63 -; CHECK-NEXT: csel x1, [[BIGSHIFT_HI]], [[SMALLSHIFT_HI]], ge -; CHECK-NEXT: ret +; CHECK: orr w[[SIXTY_FOUR:[0-9]+]], wzr, #0x40 +; CHECK: sub [[REV_SHIFT:x[0-9]+]], x[[SIXTY_FOUR]], x2 +; CHECK: lsl [[HI_FOR_LO_NORMAL:x[0-9]+]], x1, [[REV_SHIFT]] +; CHECK: cmp x2, #0 +; 
CHECK: csel [[HI_FOR_LO:x[0-9]+]], xzr, [[HI_FOR_LO_NORMAL]], eq +; CHECK: lsr [[LO_FOR_LO:x[0-9]+]], x0, x2 +; CHECK: orr [[LO_NORMAL:x[0-9]+]], [[LO_FOR_LO]], [[HI_FOR_LO]] +; CHECK: sub [[EXTRA_SHIFT:x[0-9]+]], x2, #64 +; CHECK: asr [[LO_BIG_SHIFT:x[0-9]+]], x1, [[EXTRA_SHIFT]] +; CHECK: cmp [[EXTRA_SHIFT]], #0 +; CHECK: csel x0, [[LO_BIG_SHIFT]], [[LO_NORMAL]], ge +; CHECK: asr [[SMALLSHIFT_HI:x[0-9]+]], x1, x2 +; CHECK: asr [[BIGSHIFT_HI:x[0-9]+]], x1, #63 +; CHECK: csel x1, [[BIGSHIFT_HI]], [[SMALLSHIFT_HI]], ge +; CHECK: ret %shr = ashr i128 %r, %s ret i128 %shr @@ -41,18 +45,20 @@ define i128 @ashr(i128 %r, i128 %s) nounwind readnone { define i128 @lshr(i128 %r, i128 %s) nounwind readnone { ; CHECK-LABEL: lshr: -; CHECK: lsr [[XREG_0:x[0-9]+]], x0, x2 -; CHECK-NEXT: orr w[[XREG_1:[0-9]+]], wzr, #0x40 -; CHECK-NEXT: sub [[XREG_2:x[0-9]+]], x[[XREG_1]], x2 -; CHECK-NEXT: lsl [[XREG_3:x[0-9]+]], x1, [[XREG_2]] -; CHECK-NEXT: orr [[XREG_4:x[0-9]+]], [[XREG_0]], [[XREG_3]] -; CHECK-NEXT: sub [[XREG_5:x[0-9]+]], x2, #64 -; CHECK-NEXT: lsr [[XREG_6:x[0-9]+]], x1, [[XREG_5]] -; CHECK-NEXT: cmp [[XREG_5]], #0 -; CHECK-NEXT: csel x0, [[XREG_6]], [[XREG_4]], ge -; CHECK-NEXT: lsr [[SMALLSHIFT_HI:x[0-9]+]], x1, x2 -; CHECK-NEXT: csel x1, xzr, [[SMALLSHIFT_HI]], ge -; CHECK-NEXT: ret +; CHECK: orr w[[SIXTY_FOUR:[0-9]+]], wzr, #0x40 +; CHECK: sub [[REV_SHIFT:x[0-9]+]], x[[SIXTY_FOUR]], x2 +; CHECK: lsl [[HI_FOR_LO_NORMAL:x[0-9]+]], x1, [[REV_SHIFT]] +; CHECK: cmp x2, #0 +; CHECK: csel [[HI_FOR_LO:x[0-9]+]], xzr, [[HI_FOR_LO_NORMAL]], eq +; CHECK: lsr [[LO_FOR_LO:x[0-9]+]], x0, x2 +; CHECK: orr [[LO_NORMAL:x[0-9]+]], [[LO_FOR_LO]], [[HI_FOR_LO]] +; CHECK: sub [[EXTRA_SHIFT:x[0-9]+]], x2, #64 +; CHECK: lsr [[LO_BIG_SHIFT:x[0-9]+]], x1, [[EXTRA_SHIFT]] +; CHECK: cmp [[EXTRA_SHIFT]], #0 +; CHECK: csel x0, [[LO_BIG_SHIFT]], [[LO_NORMAL]], ge +; CHECK: lsr [[SMALLSHIFT_HI:x[0-9]+]], x1, x2 +; CHECK: csel x1, xzr, [[SMALLSHIFT_HI]], ge +; CHECK: ret %shr = lshr i128 %r, %s ret i128 %shr diff --git a/test/CodeGen/AArch64/bitfield-insert.ll b/test/CodeGen/AArch64/bitfield-insert.ll index 9df51dcc4478..509b547a5c82 100644 --- a/test/CodeGen/AArch64/bitfield-insert.ll +++ b/test/CodeGen/AArch64/bitfield-insert.ll @@ -215,3 +215,25 @@ define void @test_32bit_opnd1_better(i32* %existing, i32* %new) { ret void } + +; Tests when all the bits from one operand are not useful +define i32 @test_nouseful_bits(i8 %a, i32 %b) { +; CHECK-LABEL: test_nouseful_bits: +; CHECK: bfi +; CHECK: bfi +; CHECK: bfi +; CHECK-NOT: bfi +; CHECK-NOT: or +; CHECK: lsl + %conv = zext i8 %a to i32 ; 0 0 0 A + %shl = shl i32 %b, 8 ; B2 B1 B0 0 + %or = or i32 %conv, %shl ; B2 B1 B0 A + %shl.1 = shl i32 %or, 8 ; B1 B0 A 0 + %or.1 = or i32 %conv, %shl.1 ; B1 B0 A A + %shl.2 = shl i32 %or.1, 8 ; B0 A A 0 + %or.2 = or i32 %conv, %shl.2 ; B0 A A A + %shl.3 = shl i32 %or.2, 8 ; A A A 0 + %or.3 = or i32 %conv, %shl.3 ; A A A A + %shl.4 = shl i32 %or.3, 8 ; A A A 0 + ret i32 %shl.4 +} diff --git a/test/CodeGen/AArch64/cpus.ll b/test/CodeGen/AArch64/cpus.ll index 1266842fcc6d..a8399f92ebe4 100644 --- a/test/CodeGen/AArch64/cpus.ll +++ b/test/CodeGen/AArch64/cpus.ll @@ -2,6 +2,7 @@ ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=generic 2>&1 | FileCheck %s +; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=cortex-a35 2>&1 | FileCheck %s ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=cortex-a53 2>&1 | FileCheck %s ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=cortex-a57 2>&1 | FileCheck %s ; RUN: llc < %s 
-mtriple=arm64-unknown-unknown -mcpu=cortex-a72 2>&1 | FileCheck %s diff --git a/test/CodeGen/AArch64/remat.ll b/test/CodeGen/AArch64/remat.ll index 8b3e6dd5ad92..a397c339a2d7 100644 --- a/test/CodeGen/AArch64/remat.ll +++ b/test/CodeGen/AArch64/remat.ll @@ -1,3 +1,4 @@ +; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=cortex-a35 -o - %s | FileCheck %s ; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=cortex-a57 -o - %s | FileCheck %s ; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=cortex-a53 -o - %s | FileCheck %s ; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=cortex-a72 -o - %s | FileCheck %s diff --git a/test/CodeGen/AMDGPU/addrspacecast.ll b/test/CodeGen/AMDGPU/addrspacecast.ll new file mode 100644 index 000000000000..9be212feef00 --- /dev/null +++ b/test/CodeGen/AMDGPU/addrspacecast.ll @@ -0,0 +1,66 @@ +; RUN: not llc -O0 -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s 2>&1 | FileCheck -check-prefix=ERROR %s + +; ERROR: unsupported addrspacecast not implemented + +; XUN: llc -O0 -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-NO-PROMOTE %s +; XUN: llc -O0 -march=amdgcn -mcpu=bonaire -mattr=+promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-PROMOTE %s +; XUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=-promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-NO-PROMOTE %s +; XUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=+promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-PROMOTE %s + +; Disable optimizations in case there are optimizations added that +; specialize away generic pointer accesses. + +; CHECK-LABEL: {{^}}branch_use_flat_i32: +; CHECK: flat_store_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} +; CHECK: s_endpgm +define void @branch_use_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %gptr, i32 addrspace(3)* %lptr, i32 %x, i32 %c) #0 { +entry: + %cmp = icmp ne i32 %c, 0 + br i1 %cmp, label %local, label %global + +local: + %flat_local = addrspacecast i32 addrspace(3)* %lptr to i32 addrspace(4)* + br label %end + +global: + %flat_global = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)* + br label %end + +end: + %fptr = phi i32 addrspace(4)* [ %flat_local, %local ], [ %flat_global, %global ] + store i32 %x, i32 addrspace(4)* %fptr, align 4 +; %val = load i32, i32 addrspace(4)* %fptr, align 4 +; store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +; TODO: This should not be zero when registers are used for small +; scratch allocations again. + +; Check for prologue initializing special SGPRs pointing to scratch. 
+; CHECK-LABEL: {{^}}store_flat_scratch: +; CHECK: s_movk_i32 flat_scratch_lo, 0 +; CHECK-NO-PROMOTE: s_movk_i32 flat_scratch_hi, 0x28{{$}} +; CHECK-PROMOTE: s_movk_i32 flat_scratch_hi, 0x0{{$}} +; CHECK: flat_store_dword +; CHECK: s_barrier +; CHECK: flat_load_dword +define void @store_flat_scratch(i32 addrspace(1)* noalias %out, i32) #0 { + %alloca = alloca i32, i32 9, align 4 + %x = call i32 @llvm.r600.read.tidig.x() #3 + %pptr = getelementptr i32, i32* %alloca, i32 %x + %fptr = addrspacecast i32* %pptr to i32 addrspace(4)* + store i32 %x, i32 addrspace(4)* %fptr + ; Dummy call + call void @llvm.AMDGPU.barrier.local() #1 + %reload = load i32, i32 addrspace(4)* %fptr, align 4 + store i32 %reload, i32 addrspace(1)* %out, align 4 + ret void +} + +declare void @llvm.AMDGPU.barrier.local() #1 +declare i32 @llvm.r600.read.tidig.x() #3 + +attributes #0 = { nounwind } +attributes #1 = { nounwind noduplicate } +attributes #3 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll b/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll index 4d70ba837816..1c5bed3b905f 100644 --- a/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll +++ b/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll @@ -30,3 +30,69 @@ endif: done: ret void } + +; OPT-LABEL: @test_sink_noop_addrspacecast_flat_to_global_i32( +; OPT: getelementptr i32, i32 addrspace(4)* %out, +; OPT-CI-NOT: getelementptr +; OPT: br i1 + +; OPT-CI: ptrtoint +; OPT-CI: add +; OPT-CI: inttoptr +; OPT: br label + +; GCN-LABEL: {{^}}test_sink_noop_addrspacecast_flat_to_global_i32: +; CI: buffer_load_dword {{v[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:28 +define void @test_sink_noop_addrspacecast_flat_to_global_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) { +entry: + %out.gep = getelementptr i32, i32 addrspace(4)* %out, i64 999999 + %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 7 + %cast = addrspacecast i32 addrspace(4)* %in.gep to i32 addrspace(1)* + %tmp0 = icmp eq i32 %cond, 0 + br i1 %tmp0, label %endif, label %if + +if: + %tmp1 = load i32, i32 addrspace(1)* %cast + br label %endif + +endif: + %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] + store i32 %x, i32 addrspace(4)* %out.gep + br label %done + +done: + ret void +} + +; OPT-LABEL: @test_sink_noop_addrspacecast_flat_to_constant_i32( +; OPT: getelementptr i32, i32 addrspace(4)* %out, +; OPT-CI-NOT: getelementptr +; OPT: br i1 + +; OPT-CI: ptrtoint +; OPT-CI: add +; OPT-CI: inttoptr +; OPT: br label + +; GCN-LABEL: {{^}}test_sink_noop_addrspacecast_flat_to_constant_i32: +; CI: s_load_dword {{s[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd +define void @test_sink_noop_addrspacecast_flat_to_constant_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) { +entry: + %out.gep = getelementptr i32, i32 addrspace(4)* %out, i64 999999 + %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 7 + %cast = addrspacecast i32 addrspace(4)* %in.gep to i32 addrspace(2)* + %tmp0 = icmp eq i32 %cond, 0 + br i1 %tmp0, label %endif, label %if + +if: + %tmp1 = load i32, i32 addrspace(2)* %cast + br label %endif + +endif: + %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] + store i32 %x, i32 addrspace(4)* %out.gep + br label %done + +done: + ret void +} diff --git a/test/CodeGen/AMDGPU/flat-address-space.ll b/test/CodeGen/AMDGPU/flat-address-space.ll index 571685ca6aeb..4b56d6f19832 100644 --- a/test/CodeGen/AMDGPU/flat-address-space.ll +++ b/test/CodeGen/AMDGPU/flat-address-space.ll @@ -7,32 +7,6 @@ ; specialize away generic pointer accesses. 
-; CHECK-LABEL: {{^}}branch_use_flat_i32: -; CHECK: flat_store_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} -; CHECK: s_endpgm -define void @branch_use_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %gptr, i32 addrspace(3)* %lptr, i32 %x, i32 %c) #0 { -entry: - %cmp = icmp ne i32 %c, 0 - br i1 %cmp, label %local, label %global - -local: - %flat_local = addrspacecast i32 addrspace(3)* %lptr to i32 addrspace(4)* - br label %end - -global: - %flat_global = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)* - br label %end - -end: - %fptr = phi i32 addrspace(4)* [ %flat_local, %local ], [ %flat_global, %global ] - store i32 %x, i32 addrspace(4)* %fptr, align 4 -; %val = load i32, i32 addrspace(4)* %fptr, align 4 -; store i32 %val, i32 addrspace(1)* %out, align 4 - ret void -} - - - ; These testcases might become useless when there are optimizations to ; remove generic pointers. @@ -150,32 +124,6 @@ define void @zextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* ret void } - - -; TODO: This should not be zero when registers are used for small -; scratch allocations again. - -; Check for prologue initializing special SGPRs pointing to scratch. -; CHECK-LABEL: {{^}}store_flat_scratch: -; CHECK: s_movk_i32 flat_scratch_lo, 0 -; CHECK-NO-PROMOTE: s_movk_i32 flat_scratch_hi, 0x28{{$}} -; CHECK-PROMOTE: s_movk_i32 flat_scratch_hi, 0x0{{$}} -; CHECK: flat_store_dword -; CHECK: s_barrier -; CHECK: flat_load_dword -define void @store_flat_scratch(i32 addrspace(1)* noalias %out, i32) #0 { - %alloca = alloca i32, i32 9, align 4 - %x = call i32 @llvm.r600.read.tidig.x() #3 - %pptr = getelementptr i32, i32* %alloca, i32 %x - %fptr = addrspacecast i32* %pptr to i32 addrspace(4)* - store i32 %x, i32 addrspace(4)* %fptr - ; Dummy call - call void @llvm.AMDGPU.barrier.local() #1 - %reload = load i32, i32 addrspace(4)* %fptr, align 4 - store i32 %reload, i32 addrspace(1)* %out, align 4 - ret void -} - declare void @llvm.AMDGPU.barrier.local() #1 declare i32 @llvm.r600.read.tidig.x() #3 diff --git a/test/CodeGen/AMDGPU/hsa-globals.ll b/test/CodeGen/AMDGPU/hsa-globals.ll new file mode 100644 index 000000000000..b6483f0970b0 --- /dev/null +++ b/test/CodeGen/AMDGPU/hsa-globals.ll @@ -0,0 +1,141 @@ +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck --check-prefix=ASM %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=kaveri | llvm-readobj -symbols -s | FileCheck %s --check-prefix=ELF + +@internal_global_program = internal addrspace(1) global i32 0 +@common_global_program = common addrspace(1) global i32 0 +@external_global_program = addrspace(1) global i32 0 + +@internal_global_agent = internal addrspace(1) global i32 0, section ".hsadata_global_agent" +@common_global_agent = common addrspace(1) global i32 0, section ".hsadata_global_agent" +@external_global_agent = addrspace(1) global i32 0, section ".hsadata_global_agent" + +@internal_readonly = internal unnamed_addr addrspace(2) constant i32 0 +@external_readonly = unnamed_addr addrspace(2) constant i32 0 + +define void @test() { + ret void +} + +; ASM: .amdgpu_hsa_module_global internal_global +; ASM: .hsadata_global_program +; ASM: internal_global_program: +; ASM: .long 0 + +; ASM: .amdgpu_hsa_module_global common_global +; ASM: .hsadata_global_program +; ASM: common_global_program: +; ASM: .long 0 + +; ASM: .amdgpu_hsa_program_global external_global +; ASM: .hsadata_global_program +; ASM: external_global_program: +; ASM: .long 0 + +; ASM: 
.amdgpu_hsa_module_global internal_global +; ASM: .hsadata_global_agent +; ASM: internal_global_agent: +; ASM: .long 0 + +; ASM: .amdgpu_hsa_module_global common_global +; ASM: .hsadata_global_agent +; ASM: common_global_agent: +; ASM: .long 0 + +; ASM: .amdgpu_hsa_program_global external_global +; ASM: .hsadata_global_agent +; ASM: external_global_agent: +; ASM: .long 0 + +; ASM: .amdgpu_hsa_module_global internal_readonly +; ASM: .hsarodata_readonly_agent +; ASM: internal_readonly: +; ASM: .long 0 + +; ASM: .amdgpu_hsa_program_global external_readonly +; ASM: .hsarodata_readonly_agent +; ASM: external_readonly: +; ASM: .long 0 + +; ELF: Section { +; ELF: Name: .hsadata_global_program +; ELF: Type: SHT_PROGBITS (0x1) +; ELF: Flags [ (0x100003) +; ELF: SHF_ALLOC (0x2) +; ELF: SHF_AMDGPU_HSA_GLOBAL (0x100000) +; ELF: SHF_WRITE (0x1) +; ELF: ] +; ELF: } + +; ELF: Section { +; ELF: Name: .hsadata_global_agent +; ELF: Type: SHT_PROGBITS (0x1) +; ELF: Flags [ (0x900003) +; ELF: SHF_ALLOC (0x2) +; ELF: SHF_AMDGPU_HSA_AGENT (0x800000) +; ELF: SHF_AMDGPU_HSA_GLOBAL (0x100000) +; ELF: SHF_WRITE (0x1) +; ELF: ] +; ELF: } + +; ELF: Section { +; ELF: Name: .hsarodata_readonly_agent +; ELF: Type: SHT_PROGBITS (0x1) +; ELF: Flags [ (0xA00002) +; ELF: SHF_ALLOC (0x2) +; ELF: SHF_AMDGPU_HSA_AGENT (0x800000) +; ELF: SHF_AMDGPU_HSA_READONLY (0x200000) +; ELF: ] + +; ELF: Symbol { +; ELF: Name: common_global_agent +; ELF: Binding: Local +; ELF: Section: .hsadata_global_agent +; ELF: } + +; ELF: Symbol { +; ELF: Name: common_global_program +; ELF: Binding: Local +; ELF: Section: .hsadata_global_program +; ELF: } + +; ELF: Symbol { +; ELF: Name: internal_global_agent +; ELF: Binding: Local +; ELF: Type: Object +; ELF: Section: .hsadata_global_agent +; ELF: } + +; ELF: Symbol { +; ELF: Name: internal_global_program +; ELF: Binding: Local +; ELF: Type: Object +; ELF: Section: .hsadata_global_program +; ELF: } + +; ELF: Symbol { +; ELF: Name: internal_readonly +; ELF: Binding: Local +; ELF: Type: Object +; ELF: Section: .hsarodata_readonly_agent +; ELF: } + +; ELF: Symbol { +; ELF: Name: external_global_agent +; ELF: Binding: Global +; ELF: Type: Object +; ELF: Section: .hsadata_global_agent +; ELF: } + +; ELF: Symbol { +; ELF: Name: external_global_program +; ELF: Binding: Global +; ELF: Type: Object +; ELF: Section: .hsadata_global_program +; ELF: } + +; ELF: Symbol { +; ELF: Name: external_readonly +; ELF: Binding: Global +; ELF: Type: Object +; ELF: Section: .hsarodata_readonly_agent +; ELF: } diff --git a/test/CodeGen/AMDGPU/hsa-group-segment.ll b/test/CodeGen/AMDGPU/hsa-group-segment.ll new file mode 100644 index 000000000000..1999dc38a6b0 --- /dev/null +++ b/test/CodeGen/AMDGPU/hsa-group-segment.ll @@ -0,0 +1,14 @@ +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck --check-prefix=HSA %s + +@internal_group = internal addrspace(3) global i32 undef +@external_group = addrspace(3) global i32 undef + +define void @test() { +entry: + store i32 0, i32 addrspace(3)* @internal_group + store i32 0, i32 addrspace(3)* @external_group + ret void +} + +; HSA-NOT: internal_group: +; HSA-NOT: external_group: diff --git a/test/CodeGen/AMDGPU/hsa.ll b/test/CodeGen/AMDGPU/hsa.ll index ab87fdbc00da..d9bb586163dc 100644 --- a/test/CodeGen/AMDGPU/hsa.ll +++ b/test/CodeGen/AMDGPU/hsa.ll @@ -38,8 +38,10 @@ ; HSA: .amdgpu_hsa_kernel simple ; HSA: {{^}}simple: ; HSA: .amd_kernel_code_t +; HSA: enable_sgpr_private_segment_buffer = 1 +; HSA: enable_sgpr_kernarg_segment_ptr = 1 ; HSA: .end_amd_kernel_code_t -; HSA: 
s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[0:1], 0x0 +; HSA: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0 ; Make sure we are setting the ATC bit: ; HSA-CI: s_mov_b32 s[[HI:[0-9]]], 0x100f000 diff --git a/test/CodeGen/AMDGPU/kernel-args.ll b/test/CodeGen/AMDGPU/kernel-args.ll index 803b2ecced01..e9d98ac89e72 100644 --- a/test/CodeGen/AMDGPU/kernel-args.ll +++ b/test/CodeGen/AMDGPU/kernel-args.ll @@ -294,8 +294,8 @@ entry: ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X -; SI: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x11 -; VI: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x44 +; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11 +; VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44 define void @v8i32_arg(<8 x i32> addrspace(1)* nocapture %out, <8 x i32> %in) nounwind { entry: store <8 x i32> %in, <8 x i32> addrspace(1)* %out, align 4 @@ -311,7 +311,7 @@ entry: ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X -; SI: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x11 +; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11 define void @v8f32_arg(<8 x float> addrspace(1)* nocapture %out, <8 x float> %in) nounwind { entry: store <8 x float> %in, <8 x float> addrspace(1)* %out, align 4 @@ -413,8 +413,8 @@ entry: ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X -; SI: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19 -; VI: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x64 +; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19 +; VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64 define void @v16i32_arg(<16 x i32> addrspace(1)* nocapture %out, <16 x i32> %in) nounwind { entry: store <16 x i32> %in, <16 x i32> addrspace(1)* %out, align 4 @@ -438,8 +438,8 @@ entry: ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X -; SI: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19 -; VI: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x64 +; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19 +; VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64 define void @v16f32_arg(<16 x float> addrspace(1)* nocapture %out, <16 x float> %in) nounwind { entry: store <16 x float> %in, <16 x float> addrspace(1)* %out, align 4 diff --git a/test/CodeGen/AMDGPU/large-alloca-compute.ll b/test/CodeGen/AMDGPU/large-alloca-compute.ll new file mode 100644 index 000000000000..c348a2e7980f --- /dev/null +++ b/test/CodeGen/AMDGPU/large-alloca-compute.ll @@ -0,0 +1,57 @@ +; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=ALL %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=ALL %s +; RUN: llc -march=amdgcn -mcpu=bonaire -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=GCNHSA -check-prefix=CIHSA -check-prefix=ALL %s +; RUN: llc -march=amdgcn -mcpu=tonga -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=GCNHSA -check-prefix=VIHSA -check-prefix=ALL %s + +; FIXME: align on alloca seems to be ignored for private_segment_alignment + +; ALL-LABEL: {{^}}large_alloca_compute_shader: + +; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GCN: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GCN: s_mov_b32 s10, -1 +; CI: s_mov_b32 s11, 0x80f000 +; VI: s_mov_b32 s11, 0x800000 + + +; GCNHSA: .amd_kernel_code_t + +; 
GCNHSA: compute_pgm_rsrc2_scratch_en = 1 +; GCNHSA: compute_pgm_rsrc2_user_sgpr = 6 +; GCNHSA: compute_pgm_rsrc2_tgid_x_en = 1 +; GCNHSA: compute_pgm_rsrc2_tgid_y_en = 0 +; GCNHSA: compute_pgm_rsrc2_tgid_z_en = 0 +; GCNHSA: compute_pgm_rsrc2_tg_size_en = 0 +; GCNHSA: compute_pgm_rsrc2_tidig_comp_cnt = 0 + +; GCNHSA: enable_sgpr_private_segment_buffer = 1 +; GCNHSA: enable_sgpr_dispatch_ptr = 0 +; GCNHSA: enable_sgpr_queue_ptr = 0 +; GCNHSA: enable_sgpr_kernarg_segment_ptr = 1 +; GCNHSA: enable_sgpr_dispatch_id = 0 +; GCNHSA: enable_sgpr_flat_scratch_init = 0 +; GCNHSA: enable_sgpr_private_segment_size = 0 +; GCNHSA: enable_sgpr_grid_workgroup_count_x = 0 +; GCNHSA: enable_sgpr_grid_workgroup_count_y = 0 +; GCNHSA: enable_sgpr_grid_workgroup_count_z = 0 +; GCNHSA: workitem_private_segment_byte_size = 0 +; GCNHSA: private_segment_alignment = 4 +; GCNHSA: .end_amd_kernel_code_t + + +; GCNHSA: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[0:3], s7 offen +; GCNHSA: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[0:3], s7 offen + +; Scratch size = alloca size + emergency stack slot +; ALL: ; ScratchSize: 32772 +define void @large_alloca_compute_shader(i32 %x, i32 %y) #0 { + %large = alloca [8192 x i32], align 4 + %gep = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 8191 + store volatile i32 %x, i32* %gep + %gep1 = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 %y + %val = load volatile i32, i32* %gep1 + store volatile i32 %val, i32 addrspace(1)* undef + ret void +} + +attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/large-alloca-graphics.ll b/test/CodeGen/AMDGPU/large-alloca-graphics.ll new file mode 100644 index 000000000000..141ee2560152 --- /dev/null +++ b/test/CodeGen/AMDGPU/large-alloca-graphics.ll @@ -0,0 +1,47 @@ +; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=ALL %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=ALL %s + +; ALL-LABEL: {{^}}large_alloca_pixel_shader: +; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GCN: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GCN: s_mov_b32 s10, -1 +; CI: s_mov_b32 s11, 0x80f000 +; VI: s_mov_b32 s11, 0x800000 + +; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen + +; ALL: ; ScratchSize: 32772 +define void @large_alloca_pixel_shader(i32 %x, i32 %y) #1 { + %large = alloca [8192 x i32], align 4 + %gep = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 8191 + store volatile i32 %x, i32* %gep + %gep1 = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 %y + %val = load volatile i32, i32* %gep1 + store volatile i32 %val, i32 addrspace(1)* undef + ret void +} + +; ALL-LABEL: {{^}}large_alloca_pixel_shader_inreg: +; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GCN: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GCN: s_mov_b32 s10, -1 +; CI: s_mov_b32 s11, 0x80f000 +; VI: s_mov_b32 s11, 0x800000 + +; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen + +; ALL: ; ScratchSize: 32772 +define void @large_alloca_pixel_shader_inreg(i32 inreg %x, i32 inreg %y) #1 { + %large = alloca [8192 x i32], align 4 + %gep = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 8191 + store volatile i32 %x, i32* %gep + %gep1 = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 %y + %val = load volatile i32, i32* %gep1 + store volatile i32 
%val, i32 addrspace(1)* undef + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind "ShaderType"="0" } diff --git a/test/CodeGen/AMDGPU/large-alloca.ll b/test/CodeGen/AMDGPU/large-alloca.ll deleted file mode 100644 index e1122da78ef5..000000000000 --- a/test/CodeGen/AMDGPU/large-alloca.ll +++ /dev/null @@ -1,18 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s - -; GCN-LABEL: {{^}}large_alloca: -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen -; GCN: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen -; GCN: ScratchSize: 32776 -define void @large_alloca(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 { - %large = alloca [8192 x i32], align 4 - %gep = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 8191 - store i32 %x, i32* %gep - %gep1 = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 %y - %load = load i32, i32* %gep1 - store i32 %load, i32 addrspace(1)* %out - ret void -} - -attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.read.workdim.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.read.workdim.ll new file mode 100644 index 000000000000..6dc9d050eee6 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.read.workdim.ll @@ -0,0 +1,37 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=SI-NOHSA -check-prefix=GCN-NOHSA -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-NOHSA -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}read_workdim: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], KC0[2].Z + +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xb +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2c +; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN-NOHSA: buffer_store_dword [[VVAL]] +define void @read_workdim(i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.AMDGPU.read.workdim() #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}read_workdim_known_bits: +; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xb +; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2c +; GCN-NOT: 0xff +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN: buffer_store_dword [[VVAL]] +define void @read_workdim_known_bits(i32 addrspace(1)* %out) { +entry: + %dim = call i32 @llvm.AMDGPU.read.workdim() #0 + %shl = shl i32 %dim, 24 + %shr = lshr i32 %shl, 24 + store i32 %shr, i32 addrspace(1)* %out + ret void +} + +declare i32 @llvm.AMDGPU.read.workdim() #0 + +attributes #0 = { readnone } diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll index 719f7ffe0f1c..dc95cd1ee012 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll @@ -2,7 +2,7 @@ ; GCN-LABEL: {{^}}test: ; GCN: enable_sgpr_dispatch_ptr = 1 -; GCN: s_load_dword s{{[0-9]+}}, s[0:1], 0x0 +; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 define void @test(i32 addrspace(1)* %out) { %dispatch_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0 %header_ptr = bitcast i8 
addrspace(2)* %dispatch_ptr to i32 addrspace(2)* diff --git a/test/CodeGen/AMDGPU/llvm.dbg.value.ll b/test/CodeGen/AMDGPU/llvm.dbg.value.ll index c5aba2b76b89..cc109327d929 100644 --- a/test/CodeGen/AMDGPU/llvm.dbg.value.ll +++ b/test/CodeGen/AMDGPU/llvm.dbg.value.ll @@ -1,8 +1,8 @@ ; RUN: llc -O0 -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs < %s | FileCheck %s ; CHECK-LABEL: {{^}}test_debug_value: -; CHECK: s_load_dwordx2 -; CHECK: DEBUG_VALUE: test_debug_value:globalptr_arg <- %SGPR0_SGPR1 +; CHECK: s_load_dwordx2 s[4:5] +; CHECK: DEBUG_VALUE: test_debug_value:globalptr_arg <- %SGPR4_SGPR5 ; CHECK: buffer_store_dword ; CHECK: s_endpgm define void @test_debug_value(i32 addrspace(1)* nocapture %globalptr_arg) #0 !dbg !4 { diff --git a/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll b/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll new file mode 100644 index 000000000000..f2a7256e812d --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll @@ -0,0 +1,184 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=SI-NOHSA -check-prefix=GCN-NOHSA -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-NOHSA -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + + +; FUNC-LABEL: {{^}}local_size_x: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], KC0[1].Z + +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x6 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x18 +; CI-HSA: s_load_dword [[XY:s[0-9]+]], s[4:5], 0x1 +; VI-HSA: s_load_dword [[XY:s[0-9]+]], s[4:5], 0x4 + +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN: buffer_store_dword [[VVAL]] +define void @local_size_x(i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.r600.read.local.size.x() #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}local_size_y: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], KC0[1].W + +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x7 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1c +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN: buffer_store_dword [[VVAL]] +define void @local_size_y(i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.r600.read.local.size.y() #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}local_size_z: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], KC0[2].X + +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x20 +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN: buffer_store_dword [[VVAL]] +define void @local_size_z(i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.r600.read.local.size.z() #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}local_size_xy: +; SI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x6 +; SI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x7 +; VI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x18 +; VI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x1c +; GCN-DAG: v_mov_b32_e32 [[VY:v[0-9]+]], [[Y]] +; GCN: v_mul_u32_u24_e32 [[VAL:v[0-9]+]], [[X]], [[VY]] +; GCN: buffer_store_dword [[VAL]] +define void @local_size_xy(i32 addrspace(1)* %out) { +entry: + %x = call i32 @llvm.r600.read.local.size.x() #0 + %y = call i32 
@llvm.r600.read.local.size.y() #0 + %val = mul i32 %x, %y + store i32 %val, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}local_size_xz: + +; SI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x6 +; SI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x8 +; VI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x18 +; VI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x20 +; HSA-DAG: s_and_b32 [[X:s[0-9]+]], [[XY]], 0xffff +; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]] +; GCN: v_mul_u32_u24_e32 [[VAL:v[0-9]+]], [[X]], [[VZ]] +; GCN: buffer_store_dword [[VAL]] +define void @local_size_xz(i32 addrspace(1)* %out) { +entry: + %x = call i32 @llvm.r600.read.local.size.x() #0 + %z = call i32 @llvm.r600.read.local.size.z() #0 + %val = mul i32 %x, %z + store i32 %val, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}local_size_yz: +; HSA: enable_sgpr_private_segment_buffer = 1 +; HSA: enable_sgpr_dispatch_ptr = 1 + +; SI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x7 +; SI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x8 +; VI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x1c +; VI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x20 +; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]] +; GCN: v_mul_u32_u24_e32 [[VAL:v[0-9]+]], [[Y]], [[VZ]] +; GCN: buffer_store_dword [[VAL]] +define void @local_size_yz(i32 addrspace(1)* %out) { +entry: + %y = call i32 @llvm.r600.read.local.size.y() #0 + %z = call i32 @llvm.r600.read.local.size.z() #0 + %val = mul i32 %y, %z + store i32 %val, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}local_size_xyz: +; HSA: enable_sgpr_private_segment_buffer = 1 +; HSA: enable_sgpr_dispatch_ptr = 1 + +; SI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x6 +; SI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x7 +; SI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x8 +; VI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x18 +; VI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x1c +; VI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x20 +; GCN-DAG: v_mov_b32_e32 [[VY:v[0-9]+]], [[Y]] +; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]] +; GCN: v_mad_u32_u24 [[VAL:v[0-9]+]], [[X]], [[VY]], [[VZ]] +; GCN: buffer_store_dword [[VAL]] +define void @local_size_xyz(i32 addrspace(1)* %out) { +entry: + %x = call i32 @llvm.r600.read.local.size.x() #0 + %y = call i32 @llvm.r600.read.local.size.y() #0 + %z = call i32 @llvm.r600.read.local.size.z() #0 + %xy = mul i32 %x, %y + %xyz = add i32 %xy, %z + store i32 %xyz, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}local_size_x_known_bits: +; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x6 +; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x18 +; GCN-NOT: 0xffff +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN-NEXT: buffer_store_dword [[VVAL]] +define void @local_size_x_known_bits(i32 addrspace(1)* %out) { +entry: + %size = call i32 @llvm.r600.read.local.size.x() #0 + %shl = shl i32 %size, 16 + %shr = lshr i32 %shl, 16 + store i32 %shr, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}local_size_y_known_bits: +; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x7 +; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1c +; GCN-NOT: 0xffff +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN-NEXT: buffer_store_dword [[VVAL]] +define void @local_size_y_known_bits(i32 addrspace(1)* %out) { +entry: + %size = call i32 @llvm.r600.read.local.size.y() #0 + %shl = shl i32 %size, 16 + %shr = lshr i32 %shl, 16 + store i32 %shr, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}local_size_z_known_bits: +; 
SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8 +; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x20 +; GCN-NOT: 0xffff +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN-NEXT: buffer_store_dword [[VVAL]] +define void @local_size_z_known_bits(i32 addrspace(1)* %out) { +entry: + %size = call i32 @llvm.r600.read.local.size.z() #0 + %shl = shl i32 %size, 16 + %shr = lshr i32 %shl, 16 + store i32 %shr, i32 addrspace(1)* %out + ret void +} + +declare i32 @llvm.r600.read.local.size.x() #0 +declare i32 @llvm.r600.read.local.size.y() #0 +declare i32 @llvm.r600.read.local.size.z() #0 + +attributes #0 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/local-memory-two-objects.ll b/test/CodeGen/AMDGPU/local-memory-two-objects.ll index 7f31ef45b628..6b52b80ba082 100644 --- a/test/CodeGen/AMDGPU/local-memory-two-objects.ll +++ b/test/CodeGen/AMDGPU/local-memory-two-objects.ll @@ -10,7 +10,7 @@ ; EG: .long 166120 ; EG-NEXT: .long 8 ; GCN: .long 47180 -; GCN-NEXT: .long 38792 +; GCN-NEXT: .long 32900 ; EG: {{^}}local_memory_two_objects: diff --git a/test/CodeGen/AMDGPU/local-memory.ll b/test/CodeGen/AMDGPU/local-memory.ll index 9494ed75bd0c..9ffb59e70920 100644 --- a/test/CodeGen/AMDGPU/local-memory.ll +++ b/test/CodeGen/AMDGPU/local-memory.ll @@ -9,9 +9,9 @@ ; EG: .long 166120 ; EG-NEXT: .long 128 ; SI: .long 47180 -; SI-NEXT: .long 71560 +; SI-NEXT: .long 65668 ; CI: .long 47180 -; CI-NEXT: .long 38792 +; CI-NEXT: .long 32900 ; FUNC-LABEL: {{^}}local_memory: diff --git a/test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll b/test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll index 0e15bc878650..27a8e70aae13 100644 --- a/test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll +++ b/test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll @@ -3,7 +3,7 @@ ; register operands in the correct order when modifying the opcode of an ; instruction to V_ADD_I32_e32. -; CHECK: %19 = V_ADD_I32_e32 %13, %12, implicit-def %vcc, implicit %exec +; CHECK: %{{[0-9]+}} = V_ADD_I32_e32 %{{[0-9]+}}, %{{[0-9]+}}, implicit-def %vcc, implicit %exec define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { entry: diff --git a/test/CodeGen/AMDGPU/si-sgpr-spill.ll b/test/CodeGen/AMDGPU/si-sgpr-spill.ll index 84652701f773..d7b35fc631eb 100644 --- a/test/CodeGen/AMDGPU/si-sgpr-spill.ll +++ b/test/CodeGen/AMDGPU/si-sgpr-spill.ll @@ -6,6 +6,16 @@ ; CHECK-LABEL: {{^}}main: ; CHECK: s_wqm + +; Make sure we do not emit the unused scratch resource descriptor setup +; CHECK-NOT: s_mov_b32 +; CHECK-NOT: s_mov_b32 +; CHECK-NOT: s_mov_b32 +; CHECK-NOT: s_mov_b32 + +; CHECK: s_mov_b32 m0 + + ; Writing to M0 from an SMRD instruction will hang the GPU.
; CHECK-NOT: s_buffer_load_dword m0 ; CHECK: s_endpgm diff --git a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll new file mode 100644 index 000000000000..cd7c78f408dd --- /dev/null +++ b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll @@ -0,0 +1,585 @@ +; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; XUN: llc -march=amdgcn -mcpu=hawaii -mtriple=amdgcn-unknown-amdhsa -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CIHSA %s +; XUN: llc -march=amdgcn -mcpu=fiji -mtriple=amdgcn-unknown-amdhsa -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VIHSA %s + +; This ends up using all 256 registers and requires register +; scavenging which will fail to find an unused register. + +; Check the ScratchSize to avoid regressions from spilling +; intermediate register class copies. + +; FIXME: The same register is initialized to 0 for every spill. + +declare i32 @llvm.r600.read.tgid.x() #1 +declare i32 @llvm.r600.read.tgid.y() #1 +declare i32 @llvm.r600.read.tgid.z() #1 + +; GCN-LABEL: {{^}}spill_vgpr_compute: + +; GCN: s_mov_b32 s16, s3 +; GCN: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN-NEXT: s_mov_b32 s14, -1 +; SI-NEXT: s_mov_b32 s15, 0x80f000 +; VI-NEXT: s_mov_b32 s15, 0x800000 + + +; GCN: buffer_store_dword {{v[0-9]+}}, s[12:15], s16 offset:{{[0-9]+}} ; 4-byte Folded Spill + +; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}} +; GCN: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}} + +; GCN: NumVgprs: 256 +; GCN: ScratchSize: 1024 + +; s[0:3] input user SGPRs. s4,s5,s6 = workgroup IDs. s8 scratch offset.
+define void @spill_vgpr_compute(<4 x float> %arg6, float addrspace(1)* %arg, i32 %arg1, i32 %arg2, float %arg3, float %arg4, float %arg5) #0 { +bb: + %tmp = add i32 %arg1, %arg2 + %tmp7 = extractelement <4 x float> %arg6, i32 0 + %tmp8 = extractelement <4 x float> %arg6, i32 1 + %tmp9 = extractelement <4 x float> %arg6, i32 2 + %tmp10 = extractelement <4 x float> %arg6, i32 3 + %tmp11 = bitcast float %arg5 to i32 + br label %bb12 + +bb12: ; preds = %bb145, %bb + %tmp13 = phi float [ 0.000000e+00, %bb ], [ %tmp338, %bb145 ] + %tmp14 = phi float [ 0.000000e+00, %bb ], [ %tmp337, %bb145 ] + %tmp15 = phi float [ 0.000000e+00, %bb ], [ %tmp336, %bb145 ] + %tmp16 = phi float [ 0.000000e+00, %bb ], [ %tmp339, %bb145 ] + %tmp17 = phi float [ 0.000000e+00, %bb ], [ %tmp335, %bb145 ] + %tmp18 = phi float [ 0.000000e+00, %bb ], [ %tmp334, %bb145 ] + %tmp19 = phi float [ 0.000000e+00, %bb ], [ %tmp333, %bb145 ] + %tmp20 = phi float [ 0.000000e+00, %bb ], [ %tmp340, %bb145 ] + %tmp21 = phi float [ 0.000000e+00, %bb ], [ %tmp332, %bb145 ] + %tmp22 = phi float [ 0.000000e+00, %bb ], [ %tmp331, %bb145 ] + %tmp23 = phi float [ 0.000000e+00, %bb ], [ %tmp330, %bb145 ] + %tmp24 = phi float [ 0.000000e+00, %bb ], [ %tmp341, %bb145 ] + %tmp25 = phi float [ 0.000000e+00, %bb ], [ %tmp329, %bb145 ] + %tmp26 = phi float [ 0.000000e+00, %bb ], [ %tmp328, %bb145 ] + %tmp27 = phi float [ 0.000000e+00, %bb ], [ %tmp327, %bb145 ] + %tmp28 = phi float [ 0.000000e+00, %bb ], [ %tmp342, %bb145 ] + %tmp29 = phi float [ 0.000000e+00, %bb ], [ %tmp326, %bb145 ] + %tmp30 = phi float [ 0.000000e+00, %bb ], [ %tmp325, %bb145 ] + %tmp31 = phi float [ 0.000000e+00, %bb ], [ %tmp324, %bb145 ] + %tmp32 = phi float [ 0.000000e+00, %bb ], [ %tmp343, %bb145 ] + %tmp33 = phi float [ 0.000000e+00, %bb ], [ %tmp323, %bb145 ] + %tmp34 = phi float [ 0.000000e+00, %bb ], [ %tmp322, %bb145 ] + %tmp35 = phi float [ 0.000000e+00, %bb ], [ %tmp321, %bb145 ] + %tmp36 = phi float [ 0.000000e+00, %bb ], [ %tmp344, %bb145 ] + %tmp37 = phi float [ 0.000000e+00, %bb ], [ %tmp320, %bb145 ] + %tmp38 = phi float [ 0.000000e+00, %bb ], [ %tmp319, %bb145 ] + %tmp39 = phi float [ 0.000000e+00, %bb ], [ %tmp318, %bb145 ] + %tmp40 = phi float [ 0.000000e+00, %bb ], [ %tmp345, %bb145 ] + %tmp41 = phi float [ 0.000000e+00, %bb ], [ %tmp317, %bb145 ] + %tmp42 = phi float [ 0.000000e+00, %bb ], [ %tmp316, %bb145 ] + %tmp43 = phi float [ 0.000000e+00, %bb ], [ %tmp315, %bb145 ] + %tmp44 = phi float [ 0.000000e+00, %bb ], [ %tmp346, %bb145 ] + %tmp45 = phi float [ 0.000000e+00, %bb ], [ %tmp314, %bb145 ] + %tmp46 = phi float [ 0.000000e+00, %bb ], [ %tmp313, %bb145 ] + %tmp47 = phi float [ 0.000000e+00, %bb ], [ %tmp312, %bb145 ] + %tmp48 = phi float [ 0.000000e+00, %bb ], [ %tmp347, %bb145 ] + %tmp49 = phi float [ 0.000000e+00, %bb ], [ %tmp311, %bb145 ] + %tmp50 = phi float [ 0.000000e+00, %bb ], [ %tmp310, %bb145 ] + %tmp51 = phi float [ 0.000000e+00, %bb ], [ %tmp309, %bb145 ] + %tmp52 = phi float [ 0.000000e+00, %bb ], [ %tmp348, %bb145 ] + %tmp53 = phi float [ 0.000000e+00, %bb ], [ %tmp308, %bb145 ] + %tmp54 = phi float [ 0.000000e+00, %bb ], [ %tmp307, %bb145 ] + %tmp55 = phi float [ 0.000000e+00, %bb ], [ %tmp306, %bb145 ] + %tmp56 = phi float [ 0.000000e+00, %bb ], [ %tmp349, %bb145 ] + %tmp57 = phi float [ 0.000000e+00, %bb ], [ %tmp305, %bb145 ] + %tmp58 = phi float [ 0.000000e+00, %bb ], [ %tmp304, %bb145 ] + %tmp59 = phi float [ 0.000000e+00, %bb ], [ %tmp303, %bb145 ] + %tmp60 = phi float [ 0.000000e+00, %bb ], [ %tmp350, %bb145 ] + %tmp61 = phi 
float [ 0.000000e+00, %bb ], [ %tmp302, %bb145 ] + %tmp62 = phi float [ 0.000000e+00, %bb ], [ %tmp301, %bb145 ] + %tmp63 = phi float [ 0.000000e+00, %bb ], [ %tmp300, %bb145 ] + %tmp64 = phi float [ 0.000000e+00, %bb ], [ %tmp351, %bb145 ] + %tmp65 = phi float [ 0.000000e+00, %bb ], [ %tmp299, %bb145 ] + %tmp66 = phi float [ 0.000000e+00, %bb ], [ %tmp298, %bb145 ] + %tmp67 = phi float [ 0.000000e+00, %bb ], [ %tmp297, %bb145 ] + %tmp68 = phi float [ 0.000000e+00, %bb ], [ %tmp352, %bb145 ] + %tmp69 = phi float [ 0.000000e+00, %bb ], [ %tmp296, %bb145 ] + %tmp70 = phi float [ 0.000000e+00, %bb ], [ %tmp295, %bb145 ] + %tmp71 = phi float [ 0.000000e+00, %bb ], [ %tmp294, %bb145 ] + %tmp72 = phi float [ 0.000000e+00, %bb ], [ %tmp353, %bb145 ] + %tmp73 = phi float [ 0.000000e+00, %bb ], [ %tmp293, %bb145 ] + %tmp74 = phi float [ 0.000000e+00, %bb ], [ %tmp292, %bb145 ] + %tmp75 = phi float [ 0.000000e+00, %bb ], [ %tmp291, %bb145 ] + %tmp76 = phi float [ 0.000000e+00, %bb ], [ %tmp354, %bb145 ] + %tmp77 = phi float [ 0.000000e+00, %bb ], [ %tmp290, %bb145 ] + %tmp78 = phi float [ 0.000000e+00, %bb ], [ %tmp289, %bb145 ] + %tmp79 = phi float [ 0.000000e+00, %bb ], [ %tmp288, %bb145 ] + %tmp80 = phi float [ 0.000000e+00, %bb ], [ %tmp355, %bb145 ] + %tmp81 = phi float [ 0.000000e+00, %bb ], [ %tmp287, %bb145 ] + %tmp82 = phi float [ 0.000000e+00, %bb ], [ %tmp286, %bb145 ] + %tmp83 = phi float [ 0.000000e+00, %bb ], [ %tmp285, %bb145 ] + %tmp84 = phi float [ 0.000000e+00, %bb ], [ %tmp356, %bb145 ] + %tmp85 = phi float [ 0.000000e+00, %bb ], [ %tmp284, %bb145 ] + %tmp86 = phi float [ 0.000000e+00, %bb ], [ %tmp283, %bb145 ] + %tmp87 = phi float [ 0.000000e+00, %bb ], [ %tmp282, %bb145 ] + %tmp88 = phi float [ 0.000000e+00, %bb ], [ %tmp357, %bb145 ] + %tmp89 = phi float [ 0.000000e+00, %bb ], [ %tmp281, %bb145 ] + %tmp90 = phi float [ 0.000000e+00, %bb ], [ %tmp280, %bb145 ] + %tmp91 = phi float [ 0.000000e+00, %bb ], [ %tmp279, %bb145 ] + %tmp92 = phi float [ 0.000000e+00, %bb ], [ %tmp358, %bb145 ] + %tmp93 = phi float [ 0.000000e+00, %bb ], [ %tmp359, %bb145 ] + %tmp94 = phi float [ 0.000000e+00, %bb ], [ %tmp360, %bb145 ] + %tmp95 = phi float [ 0.000000e+00, %bb ], [ %tmp409, %bb145 ] + %tmp96 = phi float [ 0.000000e+00, %bb ], [ %tmp361, %bb145 ] + %tmp97 = phi float [ 0.000000e+00, %bb ], [ %tmp362, %bb145 ] + %tmp98 = phi float [ 0.000000e+00, %bb ], [ %tmp363, %bb145 ] + %tmp99 = phi float [ 0.000000e+00, %bb ], [ %tmp364, %bb145 ] + %tmp100 = phi float [ 0.000000e+00, %bb ], [ %tmp365, %bb145 ] + %tmp101 = phi float [ 0.000000e+00, %bb ], [ %tmp366, %bb145 ] + %tmp102 = phi float [ 0.000000e+00, %bb ], [ %tmp367, %bb145 ] + %tmp103 = phi float [ 0.000000e+00, %bb ], [ %tmp368, %bb145 ] + %tmp104 = phi float [ 0.000000e+00, %bb ], [ %tmp369, %bb145 ] + %tmp105 = phi float [ 0.000000e+00, %bb ], [ %tmp370, %bb145 ] + %tmp106 = phi float [ 0.000000e+00, %bb ], [ %tmp371, %bb145 ] + %tmp107 = phi float [ 0.000000e+00, %bb ], [ %tmp372, %bb145 ] + %tmp108 = phi float [ 0.000000e+00, %bb ], [ %tmp373, %bb145 ] + %tmp109 = phi float [ 0.000000e+00, %bb ], [ %tmp374, %bb145 ] + %tmp110 = phi float [ 0.000000e+00, %bb ], [ %tmp375, %bb145 ] + %tmp111 = phi float [ 0.000000e+00, %bb ], [ %tmp376, %bb145 ] + %tmp112 = phi float [ 0.000000e+00, %bb ], [ %tmp377, %bb145 ] + %tmp113 = phi float [ 0.000000e+00, %bb ], [ %tmp378, %bb145 ] + %tmp114 = phi float [ 0.000000e+00, %bb ], [ %tmp379, %bb145 ] + %tmp115 = phi float [ 0.000000e+00, %bb ], [ %tmp380, %bb145 ] + %tmp116 = phi float [ 
0.000000e+00, %bb ], [ %tmp381, %bb145 ] + %tmp117 = phi float [ 0.000000e+00, %bb ], [ %tmp382, %bb145 ] + %tmp118 = phi float [ 0.000000e+00, %bb ], [ %tmp383, %bb145 ] + %tmp119 = phi float [ 0.000000e+00, %bb ], [ %tmp384, %bb145 ] + %tmp120 = phi float [ 0.000000e+00, %bb ], [ %tmp385, %bb145 ] + %tmp121 = phi float [ 0.000000e+00, %bb ], [ %tmp386, %bb145 ] + %tmp122 = phi float [ 0.000000e+00, %bb ], [ %tmp387, %bb145 ] + %tmp123 = phi float [ 0.000000e+00, %bb ], [ %tmp388, %bb145 ] + %tmp124 = phi float [ 0.000000e+00, %bb ], [ %tmp389, %bb145 ] + %tmp125 = phi float [ 0.000000e+00, %bb ], [ %tmp390, %bb145 ] + %tmp126 = phi float [ 0.000000e+00, %bb ], [ %tmp391, %bb145 ] + %tmp127 = phi float [ 0.000000e+00, %bb ], [ %tmp392, %bb145 ] + %tmp128 = phi float [ 0.000000e+00, %bb ], [ %tmp393, %bb145 ] + %tmp129 = phi float [ 0.000000e+00, %bb ], [ %tmp394, %bb145 ] + %tmp130 = phi float [ 0.000000e+00, %bb ], [ %tmp395, %bb145 ] + %tmp131 = phi float [ 0.000000e+00, %bb ], [ %tmp396, %bb145 ] + %tmp132 = phi float [ 0.000000e+00, %bb ], [ %tmp397, %bb145 ] + %tmp133 = phi float [ 0.000000e+00, %bb ], [ %tmp398, %bb145 ] + %tmp134 = phi float [ 0.000000e+00, %bb ], [ %tmp399, %bb145 ] + %tmp135 = phi float [ 0.000000e+00, %bb ], [ %tmp400, %bb145 ] + %tmp136 = phi float [ 0.000000e+00, %bb ], [ %tmp401, %bb145 ] + %tmp137 = phi float [ 0.000000e+00, %bb ], [ %tmp402, %bb145 ] + %tmp138 = phi float [ 0.000000e+00, %bb ], [ %tmp403, %bb145 ] + %tmp139 = phi float [ 0.000000e+00, %bb ], [ %tmp404, %bb145 ] + %tmp140 = phi float [ 0.000000e+00, %bb ], [ %tmp405, %bb145 ] + %tmp141 = phi float [ 0.000000e+00, %bb ], [ %tmp406, %bb145 ] + %tmp142 = bitcast float %tmp95 to i32 + %tmp143 = icmp sgt i32 %tmp142, 125 + br i1 %tmp143, label %bb144, label %bb145 + +bb144: ; preds = %bb12 + store volatile float %arg3, float addrspace(1)* %arg + store volatile float %tmp91, float addrspace(1)* %arg + store volatile float %tmp90, float addrspace(1)* %arg + store volatile float %tmp89, float addrspace(1)* %arg + store volatile float %tmp87, float addrspace(1)* %arg + store volatile float %tmp86, float addrspace(1)* %arg + store volatile float %tmp85, float addrspace(1)* %arg + store volatile float %tmp83, float addrspace(1)* %arg + store volatile float %tmp82, float addrspace(1)* %arg + store volatile float %tmp81, float addrspace(1)* %arg + store volatile float %tmp79, float addrspace(1)* %arg + store volatile float %tmp78, float addrspace(1)* %arg + store volatile float %tmp77, float addrspace(1)* %arg + store volatile float %tmp75, float addrspace(1)* %arg + store volatile float %tmp74, float addrspace(1)* %arg + store volatile float %tmp73, float addrspace(1)* %arg + store volatile float %tmp71, float addrspace(1)* %arg + store volatile float %tmp70, float addrspace(1)* %arg + store volatile float %tmp69, float addrspace(1)* %arg + store volatile float %tmp67, float addrspace(1)* %arg + store volatile float %tmp66, float addrspace(1)* %arg + store volatile float %tmp65, float addrspace(1)* %arg + store volatile float %tmp63, float addrspace(1)* %arg + store volatile float %tmp62, float addrspace(1)* %arg + store volatile float %tmp61, float addrspace(1)* %arg + store volatile float %tmp59, float addrspace(1)* %arg + store volatile float %tmp58, float addrspace(1)* %arg + store volatile float %tmp57, float addrspace(1)* %arg + store volatile float %tmp55, float addrspace(1)* %arg + store volatile float %tmp54, float addrspace(1)* %arg + store volatile float %tmp53, float addrspace(1)* %arg + 
store volatile float %tmp51, float addrspace(1)* %arg + store volatile float %tmp50, float addrspace(1)* %arg + store volatile float %tmp49, float addrspace(1)* %arg + store volatile float %tmp47, float addrspace(1)* %arg + store volatile float %tmp46, float addrspace(1)* %arg + store volatile float %tmp45, float addrspace(1)* %arg + store volatile float %tmp43, float addrspace(1)* %arg + store volatile float %tmp42, float addrspace(1)* %arg + store volatile float %tmp41, float addrspace(1)* %arg + store volatile float %tmp39, float addrspace(1)* %arg + store volatile float %tmp38, float addrspace(1)* %arg + store volatile float %tmp37, float addrspace(1)* %arg + store volatile float %tmp35, float addrspace(1)* %arg + store volatile float %tmp34, float addrspace(1)* %arg + store volatile float %tmp33, float addrspace(1)* %arg + store volatile float %tmp31, float addrspace(1)* %arg + store volatile float %tmp30, float addrspace(1)* %arg + store volatile float %tmp29, float addrspace(1)* %arg + store volatile float %tmp27, float addrspace(1)* %arg + store volatile float %tmp26, float addrspace(1)* %arg + store volatile float %tmp25, float addrspace(1)* %arg + store volatile float %tmp23, float addrspace(1)* %arg + store volatile float %tmp22, float addrspace(1)* %arg + store volatile float %tmp21, float addrspace(1)* %arg + store volatile float %tmp19, float addrspace(1)* %arg + store volatile float %tmp18, float addrspace(1)* %arg + store volatile float %tmp17, float addrspace(1)* %arg + store volatile float %tmp15, float addrspace(1)* %arg + store volatile float %tmp14, float addrspace(1)* %arg + store volatile float %tmp13, float addrspace(1)* %arg + store volatile float %tmp16, float addrspace(1)* %arg + store volatile float %tmp20, float addrspace(1)* %arg + store volatile float %tmp24, float addrspace(1)* %arg + store volatile float %tmp28, float addrspace(1)* %arg + store volatile float %tmp32, float addrspace(1)* %arg + store volatile float %tmp36, float addrspace(1)* %arg + store volatile float %tmp40, float addrspace(1)* %arg + store volatile float %tmp44, float addrspace(1)* %arg + store volatile float %tmp48, float addrspace(1)* %arg + store volatile float %tmp52, float addrspace(1)* %arg + store volatile float %tmp56, float addrspace(1)* %arg + store volatile float %tmp60, float addrspace(1)* %arg + store volatile float %tmp64, float addrspace(1)* %arg + store volatile float %tmp68, float addrspace(1)* %arg + store volatile float %tmp72, float addrspace(1)* %arg + store volatile float %tmp76, float addrspace(1)* %arg + store volatile float %tmp80, float addrspace(1)* %arg + store volatile float %tmp84, float addrspace(1)* %arg + store volatile float %tmp88, float addrspace(1)* %arg + store volatile float %tmp92, float addrspace(1)* %arg + store volatile float %tmp93, float addrspace(1)* %arg + store volatile float %tmp94, float addrspace(1)* %arg + store volatile float %tmp96, float addrspace(1)* %arg + store volatile float %tmp97, float addrspace(1)* %arg + store volatile float %tmp98, float addrspace(1)* %arg + store volatile float %tmp99, float addrspace(1)* %arg + store volatile float %tmp100, float addrspace(1)* %arg + store volatile float %tmp101, float addrspace(1)* %arg + store volatile float %tmp102, float addrspace(1)* %arg + store volatile float %tmp103, float addrspace(1)* %arg + store volatile float %tmp104, float addrspace(1)* %arg + store volatile float %tmp105, float addrspace(1)* %arg + store volatile float %tmp106, float addrspace(1)* %arg + store volatile 
float %tmp107, float addrspace(1)* %arg + store volatile float %tmp108, float addrspace(1)* %arg + store volatile float %tmp109, float addrspace(1)* %arg + store volatile float %tmp110, float addrspace(1)* %arg + store volatile float %tmp111, float addrspace(1)* %arg + store volatile float %tmp112, float addrspace(1)* %arg + store volatile float %tmp113, float addrspace(1)* %arg + store volatile float %tmp114, float addrspace(1)* %arg + store volatile float %tmp115, float addrspace(1)* %arg + store volatile float %tmp116, float addrspace(1)* %arg + store volatile float %tmp117, float addrspace(1)* %arg + store volatile float %tmp118, float addrspace(1)* %arg + store volatile float %tmp119, float addrspace(1)* %arg + store volatile float %tmp120, float addrspace(1)* %arg + store volatile float %tmp121, float addrspace(1)* %arg + store volatile float %tmp122, float addrspace(1)* %arg + store volatile float %tmp123, float addrspace(1)* %arg + store volatile float %tmp124, float addrspace(1)* %arg + store volatile float %tmp125, float addrspace(1)* %arg + store volatile float %tmp126, float addrspace(1)* %arg + store volatile float %tmp127, float addrspace(1)* %arg + store volatile float %tmp128, float addrspace(1)* %arg + store volatile float %tmp129, float addrspace(1)* %arg + store volatile float %tmp130, float addrspace(1)* %arg + store volatile float %tmp131, float addrspace(1)* %arg + store volatile float %tmp132, float addrspace(1)* %arg + store volatile float %tmp133, float addrspace(1)* %arg + store volatile float %tmp134, float addrspace(1)* %arg + store volatile float %tmp135, float addrspace(1)* %arg + store volatile float %tmp136, float addrspace(1)* %arg + store volatile float %tmp137, float addrspace(1)* %arg + store volatile float %tmp138, float addrspace(1)* %arg + store volatile float %tmp139, float addrspace(1)* %arg + store volatile float %arg4, float addrspace(1)* %arg + store volatile float %tmp7, float addrspace(1)* %arg + store volatile float %tmp8, float addrspace(1)* %arg + store volatile float %tmp9, float addrspace(1)* %arg + store volatile float %tmp10, float addrspace(1)* %arg + ret void + +bb145: ; preds = %bb12 + %tmp146 = bitcast float %tmp95 to i32 + %tmp147 = bitcast float %tmp95 to i32 + %tmp148 = add i32 %tmp11, %tmp147 + %tmp149 = bitcast i32 %tmp148 to float + %tmp150 = insertelement <128 x float> undef, float %tmp91, i32 0 + %tmp151 = insertelement <128 x float> %tmp150, float %tmp90, i32 1 + %tmp152 = insertelement <128 x float> %tmp151, float %tmp89, i32 2 + %tmp153 = insertelement <128 x float> %tmp152, float %tmp87, i32 3 + %tmp154 = insertelement <128 x float> %tmp153, float %tmp86, i32 4 + %tmp155 = insertelement <128 x float> %tmp154, float %tmp85, i32 5 + %tmp156 = insertelement <128 x float> %tmp155, float %tmp83, i32 6 + %tmp157 = insertelement <128 x float> %tmp156, float %tmp82, i32 7 + %tmp158 = insertelement <128 x float> %tmp157, float %tmp81, i32 8 + %tmp159 = insertelement <128 x float> %tmp158, float %tmp79, i32 9 + %tmp160 = insertelement <128 x float> %tmp159, float %tmp78, i32 10 + %tmp161 = insertelement <128 x float> %tmp160, float %tmp77, i32 11 + %tmp162 = insertelement <128 x float> %tmp161, float %tmp75, i32 12 + %tmp163 = insertelement <128 x float> %tmp162, float %tmp74, i32 13 + %tmp164 = insertelement <128 x float> %tmp163, float %tmp73, i32 14 + %tmp165 = insertelement <128 x float> %tmp164, float %tmp71, i32 15 + %tmp166 = insertelement <128 x float> %tmp165, float %tmp70, i32 16 + %tmp167 = insertelement <128 x float> 
%tmp166, float %tmp69, i32 17 + %tmp168 = insertelement <128 x float> %tmp167, float %tmp67, i32 18 + %tmp169 = insertelement <128 x float> %tmp168, float %tmp66, i32 19 + %tmp170 = insertelement <128 x float> %tmp169, float %tmp65, i32 20 + %tmp171 = insertelement <128 x float> %tmp170, float %tmp63, i32 21 + %tmp172 = insertelement <128 x float> %tmp171, float %tmp62, i32 22 + %tmp173 = insertelement <128 x float> %tmp172, float %tmp61, i32 23 + %tmp174 = insertelement <128 x float> %tmp173, float %tmp59, i32 24 + %tmp175 = insertelement <128 x float> %tmp174, float %tmp58, i32 25 + %tmp176 = insertelement <128 x float> %tmp175, float %tmp57, i32 26 + %tmp177 = insertelement <128 x float> %tmp176, float %tmp55, i32 27 + %tmp178 = insertelement <128 x float> %tmp177, float %tmp54, i32 28 + %tmp179 = insertelement <128 x float> %tmp178, float %tmp53, i32 29 + %tmp180 = insertelement <128 x float> %tmp179, float %tmp51, i32 30 + %tmp181 = insertelement <128 x float> %tmp180, float %tmp50, i32 31 + %tmp182 = insertelement <128 x float> %tmp181, float %tmp49, i32 32 + %tmp183 = insertelement <128 x float> %tmp182, float %tmp47, i32 33 + %tmp184 = insertelement <128 x float> %tmp183, float %tmp46, i32 34 + %tmp185 = insertelement <128 x float> %tmp184, float %tmp45, i32 35 + %tmp186 = insertelement <128 x float> %tmp185, float %tmp43, i32 36 + %tmp187 = insertelement <128 x float> %tmp186, float %tmp42, i32 37 + %tmp188 = insertelement <128 x float> %tmp187, float %tmp41, i32 38 + %tmp189 = insertelement <128 x float> %tmp188, float %tmp39, i32 39 + %tmp190 = insertelement <128 x float> %tmp189, float %tmp38, i32 40 + %tmp191 = insertelement <128 x float> %tmp190, float %tmp37, i32 41 + %tmp192 = insertelement <128 x float> %tmp191, float %tmp35, i32 42 + %tmp193 = insertelement <128 x float> %tmp192, float %tmp34, i32 43 + %tmp194 = insertelement <128 x float> %tmp193, float %tmp33, i32 44 + %tmp195 = insertelement <128 x float> %tmp194, float %tmp31, i32 45 + %tmp196 = insertelement <128 x float> %tmp195, float %tmp30, i32 46 + %tmp197 = insertelement <128 x float> %tmp196, float %tmp29, i32 47 + %tmp198 = insertelement <128 x float> %tmp197, float %tmp27, i32 48 + %tmp199 = insertelement <128 x float> %tmp198, float %tmp26, i32 49 + %tmp200 = insertelement <128 x float> %tmp199, float %tmp25, i32 50 + %tmp201 = insertelement <128 x float> %tmp200, float %tmp23, i32 51 + %tmp202 = insertelement <128 x float> %tmp201, float %tmp22, i32 52 + %tmp203 = insertelement <128 x float> %tmp202, float %tmp21, i32 53 + %tmp204 = insertelement <128 x float> %tmp203, float %tmp19, i32 54 + %tmp205 = insertelement <128 x float> %tmp204, float %tmp18, i32 55 + %tmp206 = insertelement <128 x float> %tmp205, float %tmp17, i32 56 + %tmp207 = insertelement <128 x float> %tmp206, float %tmp15, i32 57 + %tmp208 = insertelement <128 x float> %tmp207, float %tmp14, i32 58 + %tmp209 = insertelement <128 x float> %tmp208, float %tmp13, i32 59 + %tmp210 = insertelement <128 x float> %tmp209, float %tmp16, i32 60 + %tmp211 = insertelement <128 x float> %tmp210, float %tmp20, i32 61 + %tmp212 = insertelement <128 x float> %tmp211, float %tmp24, i32 62 + %tmp213 = insertelement <128 x float> %tmp212, float %tmp28, i32 63 + %tmp214 = insertelement <128 x float> %tmp213, float %tmp32, i32 64 + %tmp215 = insertelement <128 x float> %tmp214, float %tmp36, i32 65 + %tmp216 = insertelement <128 x float> %tmp215, float %tmp40, i32 66 + %tmp217 = insertelement <128 x float> %tmp216, float %tmp44, i32 67 + %tmp218 = 
insertelement <128 x float> %tmp217, float %tmp48, i32 68 + %tmp219 = insertelement <128 x float> %tmp218, float %tmp52, i32 69 + %tmp220 = insertelement <128 x float> %tmp219, float %tmp56, i32 70 + %tmp221 = insertelement <128 x float> %tmp220, float %tmp60, i32 71 + %tmp222 = insertelement <128 x float> %tmp221, float %tmp64, i32 72 + %tmp223 = insertelement <128 x float> %tmp222, float %tmp68, i32 73 + %tmp224 = insertelement <128 x float> %tmp223, float %tmp72, i32 74 + %tmp225 = insertelement <128 x float> %tmp224, float %tmp76, i32 75 + %tmp226 = insertelement <128 x float> %tmp225, float %tmp80, i32 76 + %tmp227 = insertelement <128 x float> %tmp226, float %tmp84, i32 77 + %tmp228 = insertelement <128 x float> %tmp227, float %tmp88, i32 78 + %tmp229 = insertelement <128 x float> %tmp228, float %tmp92, i32 79 + %tmp230 = insertelement <128 x float> %tmp229, float %tmp93, i32 80 + %tmp231 = insertelement <128 x float> %tmp230, float %tmp94, i32 81 + %tmp232 = insertelement <128 x float> %tmp231, float %tmp96, i32 82 + %tmp233 = insertelement <128 x float> %tmp232, float %tmp97, i32 83 + %tmp234 = insertelement <128 x float> %tmp233, float %tmp98, i32 84 + %tmp235 = insertelement <128 x float> %tmp234, float %tmp99, i32 85 + %tmp236 = insertelement <128 x float> %tmp235, float %tmp100, i32 86 + %tmp237 = insertelement <128 x float> %tmp236, float %tmp101, i32 87 + %tmp238 = insertelement <128 x float> %tmp237, float %tmp102, i32 88 + %tmp239 = insertelement <128 x float> %tmp238, float %tmp103, i32 89 + %tmp240 = insertelement <128 x float> %tmp239, float %tmp104, i32 90 + %tmp241 = insertelement <128 x float> %tmp240, float %tmp105, i32 91 + %tmp242 = insertelement <128 x float> %tmp241, float %tmp106, i32 92 + %tmp243 = insertelement <128 x float> %tmp242, float %tmp107, i32 93 + %tmp244 = insertelement <128 x float> %tmp243, float %tmp108, i32 94 + %tmp245 = insertelement <128 x float> %tmp244, float %tmp109, i32 95 + %tmp246 = insertelement <128 x float> %tmp245, float %tmp110, i32 96 + %tmp247 = insertelement <128 x float> %tmp246, float %tmp111, i32 97 + %tmp248 = insertelement <128 x float> %tmp247, float %tmp112, i32 98 + %tmp249 = insertelement <128 x float> %tmp248, float %tmp113, i32 99 + %tmp250 = insertelement <128 x float> %tmp249, float %tmp114, i32 100 + %tmp251 = insertelement <128 x float> %tmp250, float %tmp115, i32 101 + %tmp252 = insertelement <128 x float> %tmp251, float %tmp116, i32 102 + %tmp253 = insertelement <128 x float> %tmp252, float %tmp117, i32 103 + %tmp254 = insertelement <128 x float> %tmp253, float %tmp118, i32 104 + %tmp255 = insertelement <128 x float> %tmp254, float %tmp119, i32 105 + %tmp256 = insertelement <128 x float> %tmp255, float %tmp120, i32 106 + %tmp257 = insertelement <128 x float> %tmp256, float %tmp121, i32 107 + %tmp258 = insertelement <128 x float> %tmp257, float %tmp122, i32 108 + %tmp259 = insertelement <128 x float> %tmp258, float %tmp123, i32 109 + %tmp260 = insertelement <128 x float> %tmp259, float %tmp124, i32 110 + %tmp261 = insertelement <128 x float> %tmp260, float %tmp125, i32 111 + %tmp262 = insertelement <128 x float> %tmp261, float %tmp126, i32 112 + %tmp263 = insertelement <128 x float> %tmp262, float %tmp127, i32 113 + %tmp264 = insertelement <128 x float> %tmp263, float %tmp128, i32 114 + %tmp265 = insertelement <128 x float> %tmp264, float %tmp129, i32 115 + %tmp266 = insertelement <128 x float> %tmp265, float %tmp130, i32 116 + %tmp267 = insertelement <128 x float> %tmp266, float %tmp131, i32 117 + %tmp268 = 
insertelement <128 x float> %tmp267, float %tmp132, i32 118 + %tmp269 = insertelement <128 x float> %tmp268, float %tmp133, i32 119 + %tmp270 = insertelement <128 x float> %tmp269, float %tmp134, i32 120 + %tmp271 = insertelement <128 x float> %tmp270, float %tmp135, i32 121 + %tmp272 = insertelement <128 x float> %tmp271, float %tmp136, i32 122 + %tmp273 = insertelement <128 x float> %tmp272, float %tmp137, i32 123 + %tmp274 = insertelement <128 x float> %tmp273, float %tmp138, i32 124 + %tmp275 = insertelement <128 x float> %tmp274, float %tmp139, i32 125 + %tmp276 = insertelement <128 x float> %tmp275, float %tmp140, i32 126 + %tmp277 = insertelement <128 x float> %tmp276, float %tmp141, i32 127 + %tmp278 = insertelement <128 x float> %tmp277, float %tmp149, i32 %tmp146 + %tmp279 = extractelement <128 x float> %tmp278, i32 0 + %tmp280 = extractelement <128 x float> %tmp278, i32 1 + %tmp281 = extractelement <128 x float> %tmp278, i32 2 + %tmp282 = extractelement <128 x float> %tmp278, i32 3 + %tmp283 = extractelement <128 x float> %tmp278, i32 4 + %tmp284 = extractelement <128 x float> %tmp278, i32 5 + %tmp285 = extractelement <128 x float> %tmp278, i32 6 + %tmp286 = extractelement <128 x float> %tmp278, i32 7 + %tmp287 = extractelement <128 x float> %tmp278, i32 8 + %tmp288 = extractelement <128 x float> %tmp278, i32 9 + %tmp289 = extractelement <128 x float> %tmp278, i32 10 + %tmp290 = extractelement <128 x float> %tmp278, i32 11 + %tmp291 = extractelement <128 x float> %tmp278, i32 12 + %tmp292 = extractelement <128 x float> %tmp278, i32 13 + %tmp293 = extractelement <128 x float> %tmp278, i32 14 + %tmp294 = extractelement <128 x float> %tmp278, i32 15 + %tmp295 = extractelement <128 x float> %tmp278, i32 16 + %tmp296 = extractelement <128 x float> %tmp278, i32 17 + %tmp297 = extractelement <128 x float> %tmp278, i32 18 + %tmp298 = extractelement <128 x float> %tmp278, i32 19 + %tmp299 = extractelement <128 x float> %tmp278, i32 20 + %tmp300 = extractelement <128 x float> %tmp278, i32 21 + %tmp301 = extractelement <128 x float> %tmp278, i32 22 + %tmp302 = extractelement <128 x float> %tmp278, i32 23 + %tmp303 = extractelement <128 x float> %tmp278, i32 24 + %tmp304 = extractelement <128 x float> %tmp278, i32 25 + %tmp305 = extractelement <128 x float> %tmp278, i32 26 + %tmp306 = extractelement <128 x float> %tmp278, i32 27 + %tmp307 = extractelement <128 x float> %tmp278, i32 28 + %tmp308 = extractelement <128 x float> %tmp278, i32 29 + %tmp309 = extractelement <128 x float> %tmp278, i32 30 + %tmp310 = extractelement <128 x float> %tmp278, i32 31 + %tmp311 = extractelement <128 x float> %tmp278, i32 32 + %tmp312 = extractelement <128 x float> %tmp278, i32 33 + %tmp313 = extractelement <128 x float> %tmp278, i32 34 + %tmp314 = extractelement <128 x float> %tmp278, i32 35 + %tmp315 = extractelement <128 x float> %tmp278, i32 36 + %tmp316 = extractelement <128 x float> %tmp278, i32 37 + %tmp317 = extractelement <128 x float> %tmp278, i32 38 + %tmp318 = extractelement <128 x float> %tmp278, i32 39 + %tmp319 = extractelement <128 x float> %tmp278, i32 40 + %tmp320 = extractelement <128 x float> %tmp278, i32 41 + %tmp321 = extractelement <128 x float> %tmp278, i32 42 + %tmp322 = extractelement <128 x float> %tmp278, i32 43 + %tmp323 = extractelement <128 x float> %tmp278, i32 44 + %tmp324 = extractelement <128 x float> %tmp278, i32 45 + %tmp325 = extractelement <128 x float> %tmp278, i32 46 + %tmp326 = extractelement <128 x float> %tmp278, i32 47 + %tmp327 = extractelement <128 x float> 
%tmp278, i32 48 + %tmp328 = extractelement <128 x float> %tmp278, i32 49 + %tmp329 = extractelement <128 x float> %tmp278, i32 50 + %tmp330 = extractelement <128 x float> %tmp278, i32 51 + %tmp331 = extractelement <128 x float> %tmp278, i32 52 + %tmp332 = extractelement <128 x float> %tmp278, i32 53 + %tmp333 = extractelement <128 x float> %tmp278, i32 54 + %tmp334 = extractelement <128 x float> %tmp278, i32 55 + %tmp335 = extractelement <128 x float> %tmp278, i32 56 + %tmp336 = extractelement <128 x float> %tmp278, i32 57 + %tmp337 = extractelement <128 x float> %tmp278, i32 58 + %tmp338 = extractelement <128 x float> %tmp278, i32 59 + %tmp339 = extractelement <128 x float> %tmp278, i32 60 + %tmp340 = extractelement <128 x float> %tmp278, i32 61 + %tmp341 = extractelement <128 x float> %tmp278, i32 62 + %tmp342 = extractelement <128 x float> %tmp278, i32 63 + %tmp343 = extractelement <128 x float> %tmp278, i32 64 + %tmp344 = extractelement <128 x float> %tmp278, i32 65 + %tmp345 = extractelement <128 x float> %tmp278, i32 66 + %tmp346 = extractelement <128 x float> %tmp278, i32 67 + %tmp347 = extractelement <128 x float> %tmp278, i32 68 + %tmp348 = extractelement <128 x float> %tmp278, i32 69 + %tmp349 = extractelement <128 x float> %tmp278, i32 70 + %tmp350 = extractelement <128 x float> %tmp278, i32 71 + %tmp351 = extractelement <128 x float> %tmp278, i32 72 + %tmp352 = extractelement <128 x float> %tmp278, i32 73 + %tmp353 = extractelement <128 x float> %tmp278, i32 74 + %tmp354 = extractelement <128 x float> %tmp278, i32 75 + %tmp355 = extractelement <128 x float> %tmp278, i32 76 + %tmp356 = extractelement <128 x float> %tmp278, i32 77 + %tmp357 = extractelement <128 x float> %tmp278, i32 78 + %tmp358 = extractelement <128 x float> %tmp278, i32 79 + %tmp359 = extractelement <128 x float> %tmp278, i32 80 + %tmp360 = extractelement <128 x float> %tmp278, i32 81 + %tmp361 = extractelement <128 x float> %tmp278, i32 82 + %tmp362 = extractelement <128 x float> %tmp278, i32 83 + %tmp363 = extractelement <128 x float> %tmp278, i32 84 + %tmp364 = extractelement <128 x float> %tmp278, i32 85 + %tmp365 = extractelement <128 x float> %tmp278, i32 86 + %tmp366 = extractelement <128 x float> %tmp278, i32 87 + %tmp367 = extractelement <128 x float> %tmp278, i32 88 + %tmp368 = extractelement <128 x float> %tmp278, i32 89 + %tmp369 = extractelement <128 x float> %tmp278, i32 90 + %tmp370 = extractelement <128 x float> %tmp278, i32 91 + %tmp371 = extractelement <128 x float> %tmp278, i32 92 + %tmp372 = extractelement <128 x float> %tmp278, i32 93 + %tmp373 = extractelement <128 x float> %tmp278, i32 94 + %tmp374 = extractelement <128 x float> %tmp278, i32 95 + %tmp375 = extractelement <128 x float> %tmp278, i32 96 + %tmp376 = extractelement <128 x float> %tmp278, i32 97 + %tmp377 = extractelement <128 x float> %tmp278, i32 98 + %tmp378 = extractelement <128 x float> %tmp278, i32 99 + %tmp379 = extractelement <128 x float> %tmp278, i32 100 + %tmp380 = extractelement <128 x float> %tmp278, i32 101 + %tmp381 = extractelement <128 x float> %tmp278, i32 102 + %tmp382 = extractelement <128 x float> %tmp278, i32 103 + %tmp383 = extractelement <128 x float> %tmp278, i32 104 + %tmp384 = extractelement <128 x float> %tmp278, i32 105 + %tmp385 = extractelement <128 x float> %tmp278, i32 106 + %tmp386 = extractelement <128 x float> %tmp278, i32 107 + %tmp387 = extractelement <128 x float> %tmp278, i32 108 + %tmp388 = extractelement <128 x float> %tmp278, i32 109 + %tmp389 = extractelement <128 x float> %tmp278, 
i32 110 + %tmp390 = extractelement <128 x float> %tmp278, i32 111 + %tmp391 = extractelement <128 x float> %tmp278, i32 112 + %tmp392 = extractelement <128 x float> %tmp278, i32 113 + %tmp393 = extractelement <128 x float> %tmp278, i32 114 + %tmp394 = extractelement <128 x float> %tmp278, i32 115 + %tmp395 = extractelement <128 x float> %tmp278, i32 116 + %tmp396 = extractelement <128 x float> %tmp278, i32 117 + %tmp397 = extractelement <128 x float> %tmp278, i32 118 + %tmp398 = extractelement <128 x float> %tmp278, i32 119 + %tmp399 = extractelement <128 x float> %tmp278, i32 120 + %tmp400 = extractelement <128 x float> %tmp278, i32 121 + %tmp401 = extractelement <128 x float> %tmp278, i32 122 + %tmp402 = extractelement <128 x float> %tmp278, i32 123 + %tmp403 = extractelement <128 x float> %tmp278, i32 124 + %tmp404 = extractelement <128 x float> %tmp278, i32 125 + %tmp405 = extractelement <128 x float> %tmp278, i32 126 + %tmp406 = extractelement <128 x float> %tmp278, i32 127 + %tmp407 = bitcast float %tmp95 to i32 + %tmp408 = add i32 %tmp407, 1 + %tmp409 = bitcast i32 %tmp408 to float + br label %bb12 +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll index 5ce65371b01c..16abb89bb0b8 100644 --- a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll +++ b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll @@ -1,7 +1,5 @@ -; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=+vgpr-spilling < %s | FileCheck %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+vgpr-spilling < %s | FileCheck %s - -; FIXME: Enable -verify-instructions +; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; This ends up using all 255 registers and requires register ; scavenging which will fail to find an unsued register. @@ -11,9 +9,19 @@ ; FIXME: The same register is initialized to 0 for every spill. 
-; CHECK-LABEL: {{^}}main: -; CHECK: NumVgprs: 256 -; CHECK: ScratchSize: 1024 +; GCN-LABEL: {{^}}main: + +; GCN: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN-NEXT: s_mov_b32 s14, -1 +; SI-NEXT: s_mov_b32 s15, 0x80f000 +; VI-NEXT: s_mov_b32 s15, 0x800000 + +; s12 is offset user SGPR +; GCN: buffer_store_dword {{v[0-9]+}}, s[12:15], s11 offset:{{[0-9]+}} ; 4-byte Folded Spill + +; GCN: NumVgprs: 256 +; GCN: ScratchSize: 1024 define void @main([9 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <4 x i32>] addrspace(2)* byval %arg2, [34 x <8 x i32>] addrspace(2)* byval %arg3, [16 x <16 x i8>] addrspace(2)* byval %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10) #0 { bb: diff --git a/test/CodeGen/AMDGPU/work-item-intrinsics.ll b/test/CodeGen/AMDGPU/work-item-intrinsics.ll index 4328e964c1bf..a704a23b0f92 100644 --- a/test/CodeGen/AMDGPU/work-item-intrinsics.ll +++ b/test/CodeGen/AMDGPU/work-item-intrinsics.ll @@ -1,5 +1,7 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=SI-NOHSA -check-prefix=GCN-NOHSA -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-NOHSA -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=HSA -check-prefix=CI-HSA -check-prefix=FUNC %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=carrizo -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=HSA -check-prefix=VI-HSA -check-prefix=FUNC %s ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s @@ -7,9 +9,26 @@ ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV [[VAL]], KC0[0].X -; GCN: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0 -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] +; HSA: .amd_kernel_code_t + +; HSA: enable_sgpr_private_segment_buffer = 1 +; HSA: enable_sgpr_dispatch_ptr = 0 +; HSA: enable_sgpr_queue_ptr = 0 +; HSA: enable_sgpr_kernarg_segment_ptr = 1 +; HSA: enable_sgpr_dispatch_id = 0 +; HSA: enable_sgpr_flat_scratch_init = 0 +; HSA: enable_sgpr_private_segment_size = 0 +; HSA: enable_sgpr_grid_workgroup_count_x = 0 +; HSA: enable_sgpr_grid_workgroup_count_y = 0 +; HSA: enable_sgpr_grid_workgroup_count_z = 0 + +; HSA: .end_amd_kernel_code_t + + +; GCN-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0 +; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN-NOHSA: buffer_store_dword [[VVAL]] + define void @ngroups_x (i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.ngroups.x() #0 @@ -21,10 +40,10 @@ entry: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV [[VAL]], KC0[0].Y -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4 -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4 +; GCN-NOHSA: v_mov_b32_e32 
[[VVAL:v[0-9]+]], [[VAL]] +; GCN-NOHSA: buffer_store_dword [[VVAL]] define void @ngroups_y (i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.ngroups.y() #0 @@ -36,10 +55,10 @@ entry: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV [[VAL]], KC0[0].Z -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8 -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8 +; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN-NOHSA: buffer_store_dword [[VVAL]] define void @ngroups_z (i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.ngroups.z() #0 @@ -51,10 +70,10 @@ entry: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV [[VAL]], KC0[0].W -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x3 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xc -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x3 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xc +; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN-NOHSA: buffer_store_dword [[VVAL]] define void @global_size_x (i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.global.size.x() #0 @@ -66,10 +85,10 @@ entry: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV [[VAL]], KC0[1].X -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x10 -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x10 +; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN-NOHSA: buffer_store_dword [[VVAL]] define void @global_size_y (i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.global.size.y() #0 @@ -81,10 +100,10 @@ entry: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV [[VAL]], KC0[1].Y -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x5 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x14 -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x5 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x14 +; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN-NOHSA: buffer_store_dword [[VVAL]] define void @global_size_z (i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.global.size.z() #0 @@ -92,74 +111,33 @@ entry: ret void } -; FUNC-LABEL: {{^}}local_size_x: -; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[1].Z - -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x6 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x18 -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] -define void @local_size_x (i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.r600.read.local.size.x() #0 - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}local_size_y: -; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[1].W - -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x7 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1c -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] -define void @local_size_y (i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.r600.read.local.size.y() #0 - store i32 %0, i32 addrspace(1)* %out - 
ret void -} - -; FUNC-LABEL: {{^}}local_size_z: -; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[2].X - -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x20 -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] -define void @local_size_z (i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.r600.read.local.size.z() #0 - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}get_work_dim: -; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[2].Z - -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xb -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2c -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] -define void @get_work_dim (i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.AMDGPU.read.workdim() #0 - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; The tgid values are stored in sgprs offset by the number of user sgprs. -; Currently we always use exactly 2 user sgprs for the pointer to the -; kernel arguments, but this may change in the future. +; The tgid values are stored in sgprs offset by the number of user +; sgprs. ; FUNC-LABEL: {{^}}tgid_x: -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], s4 +; HSA: .amd_kernel_code_t +; HSA: compute_pgm_rsrc2_user_sgpr = 6 +; HSA: compute_pgm_rsrc2_tgid_x_en = 1 +; HSA: compute_pgm_rsrc2_tgid_y_en = 0 +; HSA: compute_pgm_rsrc2_tgid_z_en = 0 +; HSA: compute_pgm_rsrc2_tg_size_en = 0 +; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 0 +; HSA: enable_sgpr_grid_workgroup_count_x = 0 +; HSA: enable_sgpr_grid_workgroup_count_y = 0 +; HSA: enable_sgpr_grid_workgroup_count_z = 0 +; HSA: .end_amd_kernel_code_t + +; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s2{{$}} +; HSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s6{{$}} ; GCN: buffer_store_dword [[VVAL]] -define void @tgid_x (i32 addrspace(1)* %out) { + +; HSA: COMPUTE_PGM_RSRC2:USER_SGPR: 6 +; GCN-NOHSA: COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; GCN: COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; GCN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; GCN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; GCN: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 +define void @tgid_x(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tgid.x() #0 store i32 %0, i32 addrspace(1)* %out @@ -167,9 +145,25 @@ entry: } ; FUNC-LABEL: {{^}}tgid_y: -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], s5 +; HSA: compute_pgm_rsrc2_user_sgpr = 6 +; HSA: compute_pgm_rsrc2_tgid_x_en = 1 +; HSA: compute_pgm_rsrc2_tgid_y_en = 1 +; HSA: compute_pgm_rsrc2_tgid_z_en = 0 +; HSA: compute_pgm_rsrc2_tg_size_en = 0 +; HSA: enable_sgpr_grid_workgroup_count_x = 0 +; HSA: enable_sgpr_grid_workgroup_count_y = 0 +; HSA: enable_sgpr_grid_workgroup_count_z = 0 +; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s3 +; GCN-HSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s7 ; GCN: buffer_store_dword [[VVAL]] -define void @tgid_y (i32 addrspace(1)* %out) { + +; HSA: COMPUTE_PGM_RSRC2:USER_SGPR: 6 +; GCN-NOHSA: COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; GCN: COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; GCN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 1 +; GCN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; GCN: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 +define void @tgid_y(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tgid.y() #0 store i32 %0, i32 addrspace(1)* %out @@ -177,36 +171,77 @@ entry: } ; FUNC-LABEL: {{^}}tgid_z: -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], s6 +; HSA: compute_pgm_rsrc2_user_sgpr = 6 +; HSA: compute_pgm_rsrc2_tgid_x_en = 1 +; HSA: compute_pgm_rsrc2_tgid_y_en = 0 +; HSA: 
compute_pgm_rsrc2_tgid_z_en = 1 +; HSA: compute_pgm_rsrc2_tg_size_en = 0 +; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 0 +; HSA: enable_sgpr_private_segment_buffer = 1 +; HSA: enable_sgpr_dispatch_ptr = 0 +; HSA: enable_sgpr_queue_ptr = 0 +; HSA: enable_sgpr_kernarg_segment_ptr = 1 +; HSA: enable_sgpr_dispatch_id = 0 +; HSA: enable_sgpr_flat_scratch_init = 0 +; HSA: enable_sgpr_private_segment_size = 0 +; HSA: enable_sgpr_grid_workgroup_count_x = 0 +; HSA: enable_sgpr_grid_workgroup_count_y = 0 +; HSA: enable_sgpr_grid_workgroup_count_z = 0 + +; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s3{{$}} +; HSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s7{{$}} ; GCN: buffer_store_dword [[VVAL]] -define void @tgid_z (i32 addrspace(1)* %out) { + +; HSA: COMPUTE_PGM_RSRC2:USER_SGPR: 6 +; GCN-NOHSA: COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; GCN: COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; GCN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; GCN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 1 +; GCN: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 +define void @tgid_z(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tgid.z() #0 store i32 %0, i32 addrspace(1)* %out ret void } +; GCN-NOHSA: .section .AMDGPU.config +; GCN-NOHSA: .long 47180 +; GCN-NOHSA-NEXT: .long 132{{$}} + ; FUNC-LABEL: {{^}}tidig_x: +; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 0 ; GCN: buffer_store_dword v0 -define void @tidig_x (i32 addrspace(1)* %out) { +define void @tidig_x(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tidig.x() #0 store i32 %0, i32 addrspace(1)* %out ret void } +; GCN-NOHSA: .section .AMDGPU.config +; GCN-NOHSA: .long 47180 +; GCN-NOHSA-NEXT: .long 2180{{$}} + ; FUNC-LABEL: {{^}}tidig_y: + +; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 1 ; GCN: buffer_store_dword v1 -define void @tidig_y (i32 addrspace(1)* %out) { +define void @tidig_y(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tidig.y() #0 store i32 %0, i32 addrspace(1)* %out ret void } +; GCN-NOHSA: .section .AMDGPU.config +; GCN-NOHSA: .long 47180 +; GCN-NOHSA-NEXT: .long 4228{{$}} + ; FUNC-LABEL: {{^}}tidig_z: +; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 2 ; GCN: buffer_store_dword v2 -define void @tidig_z (i32 addrspace(1)* %out) { +define void @tidig_z(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tidig.z() #0 store i32 %0, i32 addrspace(1)* %out @@ -221,10 +256,6 @@ declare i32 @llvm.r600.read.global.size.x() #0 declare i32 @llvm.r600.read.global.size.y() #0 declare i32 @llvm.r600.read.global.size.z() #0 -declare i32 @llvm.r600.read.local.size.x() #0 -declare i32 @llvm.r600.read.local.size.y() #0 -declare i32 @llvm.r600.read.local.size.z() #0 - declare i32 @llvm.r600.read.tgid.x() #0 declare i32 @llvm.r600.read.tgid.y() #0 declare i32 @llvm.r600.read.tgid.z() #0 diff --git a/test/CodeGen/ARM/atomic-64bit.ll b/test/CodeGen/ARM/atomic-64bit.ll index 7510d6ccdc33..573cd45c0825 100644 --- a/test/CodeGen/ARM/atomic-64bit.ll +++ b/test/CodeGen/ARM/atomic-64bit.ll @@ -208,10 +208,16 @@ define i64 @test7(i64* %ptr, i64 %val1, i64 %val2) { define i64 @test8(i64* %ptr) { ; CHECK-LABEL: test8: ; CHECK: ldrexd [[REG1:(r[0-9]?[02468])]], [[REG2:(r[0-9]?[13579])]] +; CHECK-NOT: strexd +; CHECK: clrex +; CHECK-NOT: strexd ; CHECK: dmb {{ish$}} ; CHECK-THUMB-LABEL: test8: ; CHECK-THUMB: ldrexd [[REG1:[a-z0-9]+]], [[REG2:[a-z0-9]+]] +; CHECK-THUMB-NOT: strexd +; CHECK-THUMB: clrex +; CHECK-THUMB-NOT: strexd ; CHECK-THUMB: dmb {{ish$}} %r = load atomic i64, i64* %ptr seq_cst, align 8 diff --git a/test/CodeGen/ARM/build-attributes.ll b/test/CodeGen/ARM/build-attributes.ll index 
11aa1950b879..b80191d76012 100644 --- a/test/CodeGen/ARM/build-attributes.ll +++ b/test/CodeGen/ARM/build-attributes.ll @@ -96,6 +96,9 @@ ; RUN: llc < %s -mtriple=armv7r-linux-gnueabi -mcpu=cortex-r7 | FileCheck %s --check-prefix=CORTEX-R7 ; RUN: llc < %s -mtriple=armv7r-linux-gnueabi -mcpu=cortex-r7 -enable-unsafe-fp-math -disable-fp-elim -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-R7-FAST ; RUN: llc < %s -mtriple=armv7r-linux-gnueabi -mcpu=cortex-r7 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING +; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a35 | FileCheck %s --check-prefix=CORTEX-A35 +; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a35 -enable-unsafe-fp-math -disable-fp-elim -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A35-FAST +; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a35 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING ; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a53 | FileCheck %s --check-prefix=CORTEX-A53 ; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a53 -enable-unsafe-fp-math -disable-fp-elim -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A53-FAST ; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a53 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING @@ -129,6 +132,8 @@ ; RUN: llc < %s -mtriple=armv8.1a-none-linux-gnueabi -mattr=+strict-align | FileCheck %s --check-prefix=STRICT-ALIGN ; RUN: llc < %s -mtriple=armv8.1a-none-linux-gnueabi | FileCheck %s --check-prefix=NO-STRICT-ALIGN ; ARMv8a (AArch32) +; RUN: llc < %s -mtriple=armv8-none-linux-gnueabi -mcpu=cortex-a35 | FileCheck %s --check-prefix=NO-STRICT-ALIGN +; RUN: llc < %s -mtriple=armv8-none-linux-gnueabi -mcpu=cortex-a35 -mattr=+strict-align | FileCheck %s --check-prefix=STRICT-ALIGN ; RUN: llc < %s -mtriple=armv8-none-linux-gnueabi -mcpu=cortex-a57 | FileCheck %s --check-prefix=NO-STRICT-ALIGN ; RUN: llc < %s -mtriple=armv8-none-linux-gnueabi -mcpu=cortex-a57 -mattr=+strict-align | FileCheck %s --check-prefix=STRICT-ALIGN ; RUN: llc < %s -mtriple=armv8-none-linux-gnueabi -mcpu=cortex-a72 | FileCheck %s --check-prefix=NO-STRICT-ALIGN @@ -1113,6 +1118,36 @@ ; CORTEX-R7-FAST-NOT: .eabi_attribute 22 ; CORTEX-R7-FAST: .eabi_attribute 23, 1 +; CORTEX-A35: .cpu cortex-a35 +; CORTEX-A35: .eabi_attribute 6, 14 +; CORTEX-A35: .eabi_attribute 7, 65 +; CORTEX-A35: .eabi_attribute 8, 1 +; CORTEX-A35: .eabi_attribute 9, 2 +; CORTEX-A35: .fpu crypto-neon-fp-armv8 +; CORTEX-A35: .eabi_attribute 12, 3 +; CORTEX-A35-NOT: .eabi_attribute 19 +;; We default to IEEE 754 compliance +; CORTEX-A35: .eabi_attribute 20, 1 +; CORTEX-A35: .eabi_attribute 21, 1 +; CORTEX-A35-NOT: .eabi_attribute 22 +; CORTEX-A35: .eabi_attribute 23, 3 +; CORTEX-A35: .eabi_attribute 24, 1 +; CORTEX-A35: .eabi_attribute 25, 1 +; CORTEX-A35-NOT: .eabi_attribute 27 +; CORTEX-A35-NOT: .eabi_attribute 28 +; CORTEX-A35: .eabi_attribute 36, 1 +; CORTEX-A35: .eabi_attribute 38, 1 +; CORTEX-A35: .eabi_attribute 42, 1 +; CORTEX-A35-NOT: .eabi_attribute 44 +; CORTEX-A35: .eabi_attribute 68, 3 + +; CORTEX-A35-FAST-NOT: .eabi_attribute 19 +;; The A35 has the ARMv8 FP unit, which always flushes preserving sign. 
+; CORTEX-A35-FAST: .eabi_attribute 20, 2
+; CORTEX-A35-FAST-NOT: .eabi_attribute 21
+; CORTEX-A35-FAST-NOT: .eabi_attribute 22
+; CORTEX-A35-FAST: .eabi_attribute 23, 1
+
 ; CORTEX-A53: .cpu cortex-a53
 ; CORTEX-A53: .eabi_attribute 6, 14
 ; CORTEX-A53: .eabi_attribute 7, 65
diff --git a/test/CodeGen/ARM/ifcvt-branch-weight-bug.ll b/test/CodeGen/ARM/ifcvt-branch-weight-bug.ll
index e17da7a97205..a44c9721d6c1 100644
--- a/test/CodeGen/ARM/ifcvt-branch-weight-bug.ll
+++ b/test/CodeGen/ARM/ifcvt-branch-weight-bug.ll
@@ -14,15 +14,15 @@ entry:
 br i1 undef, label %for.end, label %for.body
 
 ; Before if conversion, we have
-; for.body -> lor.lhs.false.i (62)
-;          -> for.cond.backedge (62)
-; lor.lhs.false.i -> for.cond.backedge (1048575)
-;                 -> cond.false.i (1)
+; for.body -> lor.lhs.false.i (50%)
+;          -> for.cond.backedge (50%)
+; lor.lhs.false.i -> for.cond.backedge (100%)
+;                 -> cond.false.i (0%)
 ; Afer if conversion, we have
-; for.body -> for.cond.backedge (130023362)
-;          -> cond.false.i (62)
+; for.body -> for.cond.backedge (100%)
+;          -> cond.false.i (0%)
 
 ; CHECK: BB#1: derived from LLVM BB %for.body
-; CHECK: Successors according to CFG: BB#2(4294967291) BB#4(2048)
+; CHECK: Successors according to CFG: BB#2(0x7ffffc00 / 0x80000000 = 100.00%) BB#4(0x00000400 / 0x80000000 = 0.00%)
 for.body:
 br i1 undef, label %for.cond.backedge, label %lor.lhs.false.i, !prof !1
diff --git a/test/CodeGen/ARM/ifcvt-branch-weight.ll b/test/CodeGen/ARM/ifcvt-branch-weight.ll
index f2a1229d0d8a..0de039cde23c 100644
--- a/test/CodeGen/ARM/ifcvt-branch-weight.ll
+++ b/test/CodeGen/ARM/ifcvt-branch-weight.ll
@@ -19,7 +19,7 @@ bb:
 br i1 %9, label %return, label %bb2
 
 ; CHECK: BB#2: derived from LLVM BB %bb2
-; CHECK: Successors according to CFG: BB#3(4294967289) BB#4(4294967287)
+; CHECK: Successors according to CFG: BB#3({{[0-9a-fx/= ]+}}50.00%) BB#4({{[0-9a-fx/= ]+}}50.00%)
 
 bb2:
 %v10 = icmp eq i32 %3, 16
diff --git a/test/CodeGen/ARM/ifcvt-iter-indbr.ll b/test/CodeGen/ARM/ifcvt-iter-indbr.ll
index 6ce9bcb56ef4..a96b6e8a1e83 100644
--- a/test/CodeGen/ARM/ifcvt-iter-indbr.ll
+++ b/test/CodeGen/ARM/ifcvt-iter-indbr.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -mtriple thumbv7s-apple-darwin -asm-verbose=false | FileCheck %s
-; RUN: llc < %s -mtriple thumbv7s-apple-darwin -asm-verbose=false -print-machineinstrs=if-converter 2>&1 | FileCheck --check-prefix=CHECK-WEIGHT %s
+; RUN: llc < %s -mtriple thumbv7s-apple-darwin -asm-verbose=false -print-machineinstrs=if-converter 2>&1 | FileCheck --check-prefix=CHECK-PROB %s
 
 declare i32 @foo(i32)
 declare i8* @bar(i32, i8*, i8*)
@@ -29,10 +29,10 @@ declare i8* @bar(i32, i8*, i8*)
 ; CHECK-NEXT: [[FOOCALL]]:
 ; CHECK-NEXT: blx _foo
 ;
-; CHECK-WEIGHT: BB#0:
-; CHECK-WEIGHT: Successors according to CFG: BB#1(1073741824) BB#2(536870912) BB#4(536870912)
-; CHECK-WEIGHT: BB#1:
-; CHECK-WEIGHT: Successors according to CFG: BB#2(1610612736) BB#4(536870912)
+; CHECK-PROB: BB#0:
+; CHECK-PROB: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}50.00%) BB#2({{[0-9a-fx/= ]+}}25.00%) BB#4({{[0-9a-fx/= ]+}}25.00%)
+; CHECK-PROB: BB#1:
+; CHECK-PROB: Successors according to CFG: BB#2({{[0-9a-fx/= ]+}}75.00%) BB#4({{[0-9a-fx/= ]+}}25.00%)
 
 define i32 @test(i32 %a, i32 %a2, i32* %p, i32* %p2) {
 entry:
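Throughout these if-conversion tests the CHECK strings change from raw 32-bit edge weights to the new successor-list format: a fixed-point probability over a 0x80000000 (2^31) denominator followed by a rounded percentage. As a minimal, standalone C++ illustration of that arithmetic (this is not LLVM code; the two numerators are simply the values quoted in the updated ifcvt-branch-weight-bug.ll CHECK line above):

  #include <cstdint>
  #include <cstdio>

  int main() {
    const double Denom = 0x80000000u;        // fixed 2^31 denominator
    const uint32_t Taken    = 0x7ffffc00u;   // for.body -> for.cond.backedge
    const uint32_t NotTaken = 0x00000400u;   // for.body -> cond.false.i
    std::printf("0x%08x / 0x80000000 = %.2f%%\n",
                (unsigned)Taken, 100.0 * Taken / Denom);       // 100.00%
    std::printf("0x%08x / 0x80000000 = %.2f%%\n",
                (unsigned)NotTaken, 100.0 * NotTaken / Denom); // 0.00%
    return 0;
  }

The two numerators still sum to 0x80000000, so the printed percentages of a block's out-edges add up to 100%.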
diff --git a/test/CodeGen/ARM/tail-merge-branch-weight.ll b/test/CodeGen/ARM/tail-merge-branch-weight.ll
index 95b0a202e7ff..f83f28815793 100644
--- a/test/CodeGen/ARM/tail-merge-branch-weight.ll
+++ b/test/CodeGen/ARM/tail-merge-branch-weight.ll
@@ -9,7 +9,7 @@
 ; = 0.2 * 0.4 + 0.8 * 0.7 = 0.64
 ; CHECK: # Machine code for function test0:
-; CHECK: Successors according to CFG: BB#{{[0-9]+}}(13) BB#{{[0-9]+}}(24)
+; CHECK: Successors according to CFG: BB#{{[0-9]+}}({{[0-9a-fx/= ]+}}20.00%) BB#{{[0-9]+}}({{[0-9a-fx/= ]+}}80.00%)
 ; CHECK: BB#{{[0-9]+}}:
 ; CHECK: BB#{{[0-9]+}}:
 ; CHECK: # End machine code for function test0.
 
diff --git a/test/CodeGen/ARM/taildup-branch-weight.ll b/test/CodeGen/ARM/taildup-branch-weight.ll
index 576c120b444e..799ef62416e6 100644
--- a/test/CodeGen/ARM/taildup-branch-weight.ll
+++ b/test/CodeGen/ARM/taildup-branch-weight.ll
@@ -3,7 +3,7 @@
 ; RUN: | FileCheck %s
 
 ; CHECK: Machine code for function test0:
-; CHECK: Successors according to CFG: BB#1(67108864) BB#2(2080374784)
+; CHECK: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}3.12%) BB#2({{[0-9a-fx/= ]+}}96.88%)
 
 define void @test0(i32 %a, i32 %b, i32* %c, i32* %d) {
 entry:
@@ -30,7 +30,7 @@ B4:
 
 !0 = !{!"branch_weights", i32 4, i32 124}
 
 ; CHECK: Machine code for function test1:
-; CHECK: Successors according to CFG: BB#1(67108864) BB#2(2080374784)
+; CHECK: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}3.12%) BB#2({{[0-9a-fx/= ]+}}96.88%)
 
 @g0 = common global i32 0, align 4
diff --git a/test/CodeGen/ARM/thumb1_return_sequence.ll b/test/CodeGen/ARM/thumb1_return_sequence.ll
index 5b9c19ab5eb2..67d1cad2cf68 100644
--- a/test/CodeGen/ARM/thumb1_return_sequence.ll
+++ b/test/CodeGen/ARM/thumb1_return_sequence.ll
@@ -23,9 +23,22 @@ entry:
 ; --------
 ; CHECK-V4T: add sp,
 ; CHECK-V4T-NEXT: pop {[[SAVED]]}
-; We do not have any SP update to insert so we can just optimize
-; the pop sequence.
-; CHECK-V4T-NEXT: pop {pc}
+; The ISA for v4 does not support pop pc, so make sure we do not emit
+; one even when we do not need to update SP.
+; CHECK-V4T-NOT: pop {pc}
+; We may only use a lo register to pop, but in that case, all the scratch
+; ones are used.
+; r12 is the only register we are allowed to clobber for AAPCS.
+; Use it to save a lo register.
+; CHECK-V4T-NEXT: mov [[TEMP_REG:r12]], [[POP_REG:r[0-7]]]
+; Pop the value of LR.
+; CHECK-V4T-NEXT: pop {[[POP_REG]]}
+; Copy the value of LR into the right register.
+; CHECK-V4T-NEXT: mov lr, [[POP_REG]]
+; Restore the value that was in the register we used to pop the value of LR.
+; CHECK-V4T-NEXT: mov [[POP_REG]], [[TEMP_REG]]
+; Return.
+; CHECK-V4T-NEXT: bx lr
 ; CHECK-V5T: pop {[[SAVED]], pc}
 }
 
@@ -93,7 +106,13 @@ entry:
 ; Epilogue
 ; --------
 ; CHECK-V4T: pop {[[SAVED]]}
-; CHECK-V4T: pop {pc}
+; The ISA for v4 does not support pop pc, so make sure we do not emit
+; one even when we do not need to update SP.
+; CHECK-V4T-NOT: pop {pc}
+; Pop the value of LR into a scratch lo register other than r0 (it is
+; used for the return value).
+; CHECK-V4T-NEXT: pop {[[POP_REG:r[1-3]]]}
+; CHECK-V4T-NEXT: bx [[POP_REG]]
 ; CHECK-V5T: pop {[[SAVED]], pc}
 }
 
diff --git a/test/CodeGen/Generic/MachineBranchProb.ll b/test/CodeGen/Generic/MachineBranchProb.ll
index 5a4a4672f7eb..ae3c8da21471 100644
--- a/test/CodeGen/Generic/MachineBranchProb.ll
+++ b/test/CodeGen/Generic/MachineBranchProb.ll
@@ -16,11 +16,11 @@ entry:
   i64 5, label %sw.bb1
 ], !prof !0
 ; CHECK: BB#0: derived from LLVM BB %entry
-; CHECK: Successors according to CFG: BB#2(1616928864) BB#4(530554784)
+; CHECK: Successors according to CFG: BB#2({{[0-9a-fx/= ]+}}75.29%) BB#4({{[0-9a-fx/= ]+}}24.71%)
 ; CHECK: BB#4: derived from LLVM BB %entry
-; CHECK: Successors according to CFG: BB#1(252645135) BB#5(277909649)
+; CHECK: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}47.62%) BB#5({{[0-9a-fx/= ]+}}52.38%)
 ; CHECK: BB#5: derived from LLVM BB %entry
-; CHECK: Successors according to CFG: BB#1(101058054) BB#3(176851595)
+; CHECK: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}36.36%) BB#3({{[0-9a-fx/= ]+}}63.64%)
 
 sw.bb:
   br label %return
@@ -62,7 +62,7 @@ return:
   ret void
 ; CHECK-LABEL: Machine code for function left_leaning_weight_balanced_tree:
 ; CHECK: BB#0: derived from LLVM BB %entry
 ; CHECK-NOT: Successors
-; CHECK: Successors according to CFG: BB#8(852677332) BB#9(1294806318)
+; CHECK: Successors according to CFG: BB#8({{[0-9a-fx/= ]+}}39.71%) BB#9({{[0-9a-fx/= ]+}}60.29%)
 }
 !1 = !{!"branch_weights",
diff --git a/test/CodeGen/Hexagon/ifcvt-edge-weight.ll b/test/CodeGen/Hexagon/ifcvt-edge-weight.ll
index f84fd95e4fbd..341567e1d02f 100644
--- a/test/CodeGen/Hexagon/ifcvt-edge-weight.ll
+++ b/test/CodeGen/Hexagon/ifcvt-edge-weight.ll
@@ -2,7 +2,7 @@
 ; Check that the edge weights are updated correctly after if-conversion.
 
 ; CHECK: BB#3:
-; CHECK: Successors according to CFG: BB#2(214748365) BB#1(1932735283)
+; CHECK: Successors according to CFG: BB#2({{[0-9a-fx/= ]+}}10.00%) BB#1({{[0-9a-fx/= ]+}}90.00%)
 
 @a = external global i32
 @d = external global i32
diff --git a/test/CodeGen/MIR/X86/newline-handling.mir b/test/CodeGen/MIR/X86/newline-handling.mir
index b5ed3b7f27e1..bce06d540114 100644
--- a/test/CodeGen/MIR/X86/newline-handling.mir
+++ b/test/CodeGen/MIR/X86/newline-handling.mir
@@ -35,7 +35,7 @@ liveins:
 # CHECK-LABEL: name: foo
 # CHECK: body: |
 # CHECK-NEXT: bb.0.entry:
-# CHECK-NEXT: successors: %bb.1.less(0), %bb.2.exit(0)
+# CHECK-NEXT: successors: %bb.1.less(0x40000000 / 0x80000000 = 50.00%), %bb.2.exit(0x40000000 / 0x80000000 = 50.00%)
 # CHECK-NEXT: liveins: %edi
 # CHECK: CMP32ri8 %edi, 10, implicit-def %eflags
 # CHECK-NEXT: JG_1 %bb.2.exit, implicit killed %eflags
@@ -79,7 +79,7 @@ liveins:
 # CHECK-LABEL: name: bar
 # CHECK: body: |
 # CHECK-NEXT: bb.0.entry:
-# CHECK-NEXT: successors: %bb.1.less(0), %bb.2.exit(0)
+# CHECK-NEXT: successors: %bb.1.less(0x40000000 / 0x80000000 = 50.00%), %bb.2.exit(0x40000000 / 0x80000000 = 50.00%)
 # CHECK-NEXT: liveins: %edi
 # CHECK: CMP32ri8 %edi, 10, implicit-def %eflags
 # CHECK-NEXT: JG_1 %bb.2.exit, implicit killed %eflags
diff --git a/test/CodeGen/MIR/X86/successor-basic-blocks-weights.mir b/test/CodeGen/MIR/X86/successor-basic-blocks-weights.mir
index fc5e5d640f7f..64af6121189a 100644
--- a/test/CodeGen/MIR/X86/successor-basic-blocks-weights.mir
+++ b/test/CodeGen/MIR/X86/successor-basic-blocks-weights.mir
@@ -1,6 +1,6 @@
 # RUN: llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s | FileCheck %s
 
 # This test ensures that the MIR parser parses basic block successors and
-# weights correctly.
+# probabilities correctly.
 
 --- |
@@ -21,10 +21,10 @@
 name: foo
 body: |
   ; CHECK-LABEL: bb.0.entry:
-  ; CHECK: successors: %bb.1.less(16), %bb.2.exit(32)
+  ; CHECK: successors: %bb.1.less({{[0-9a-fx/= ]+}}33.00%), %bb.2.exit({{[0-9a-fx/= ]+}}67.00%)
   ; CHECK-LABEL: bb.1.less:
   bb.0.entry:
-    successors: %bb.1.less (16), %bb.2.exit(32)
+    successors: %bb.1.less (33), %bb.2.exit(67)
     liveins: %edi
 
     CMP32ri8 %edi, 10, implicit-def %eflags
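The input change in successor-basic-blocks-weights.mir above (weights 16/32 rewritten as 33/67) is easiest to read as a normalization choice: the listed weights are turned into probabilities relative to their sum, and 33 + 67 = 100 gives exact two-decimal percentages where 16/48 and 32/48 would not. A small standalone C++ sketch of that normalization (illustrative only; the pairs are the old and new weights from the test):

  #include <cstdio>

  // Print two successor weights as percentages of their sum.
  static void printAsPercentages(unsigned A, unsigned B) {
    const double Sum = static_cast<double>(A) + B;
    std::printf("%.2f%% / %.2f%%\n", 100.0 * A / Sum, 100.0 * B / Sum);
  }

  int main() {
    printAsPercentages(16, 32); // 33.33% / 66.67%, the old weights do not print as round values
    printAsPercentages(33, 67); // 33.00% / 67.00%, matches the updated CHECK line
    return 0;
  }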
diff --git a/test/CodeGen/MIR/X86/successor-basic-blocks.mir b/test/CodeGen/MIR/X86/successor-basic-blocks.mir
index aa80fe9fbeef..a6c14f70bc7c 100644
--- a/test/CodeGen/MIR/X86/successor-basic-blocks.mir
+++ b/test/CodeGen/MIR/X86/successor-basic-blocks.mir
@@ -32,7 +32,7 @@
 name: foo
 body: |
   ; CHECK-LABEL: bb.0.entry:
-  ; CHECK: successors: %bb.1.less(0), %bb.2.exit(0)
+  ; CHECK: successors: %bb.1.less(0x40000000 / 0x80000000 = 50.00%), %bb.2.exit(0x40000000 / 0x80000000 = 50.00%)
   ; CHECK-LABEL: bb.1.less:
   bb.0.entry:
     successors: %bb.1.less, %bb.2.exit
@@ -58,7 +58,7 @@ body: |
   ; Verify that we can have multiple lists of successors that will be merged
   ; into one.
   ; CHECK-LABEL: bb.0.entry:
-  ; CHECK: successors: %bb.1(0), %bb.2(0)
+  ; CHECK: successors: %bb.1(0x80000000 / 0x80000000 = 100.00%), %bb.2(0x00000000 / 0x80000000 = 0.00%)
   bb.0.entry:
     liveins: %edi
     successors: %bb.1
diff --git a/test/CodeGen/Mips/emergency-spill-slot-near-fp.ll b/test/CodeGen/Mips/emergency-spill-slot-near-fp.ll
index 58dd16c9f9c8..54092b4e3ebe 100644
--- a/test/CodeGen/Mips/emergency-spill-slot-near-fp.ll
+++ b/test/CodeGen/Mips/emergency-spill-slot-near-fp.ll
@@ -1,5 +1,5 @@
 ; Check that register scavenging spill slot is close to $fp.
-; RUN: llc -march=mipsel -O0 -fast-isel=false < %s | FileCheck %s
+; RUN: llc -march=mipsel -O0 < %s | FileCheck %s
 
 ; CHECK: sw ${{.*}}, 8($sp)
 ; CHECK: lw ${{.*}}, 8($sp)
@@ -31,4 +31,4 @@ entry:
   ret i32 0
 }
 
-attributes #0 = { noinline optnone "no-frame-pointer-elim"="true" }
+attributes #0 = { noinline "no-frame-pointer-elim"="true" }
diff --git a/test/CodeGen/PowerPC/aantidep-inline-asm-use.ll b/test/CodeGen/PowerPC/aantidep-inline-asm-use.ll
new file mode 100644
index 000000000000..f0c0deacf4dd
--- /dev/null
+++ b/test/CodeGen/PowerPC/aantidep-inline-asm-use.ll
@@ -0,0 +1,305 @@
+; RUN: llc -O2 < %s | FileCheck %s
+; ModuleID = 'bugpoint-reduced-simplified.bc'
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-grtev4-linux-gnu"
+
+; Function Attrs: nounwind
+define void @_ZN10SubProcess19ScrubbedForkAndExecEiPiS0_PNS_7ResultsE() #0 align 2 {
+; CHECK: lis 3, 1234
+; CHECK-NOT: li 3
+; CHECK-NOT: ori 3
+; CHECK-NOT: addi 3
+; CHECK-NOT: addis 3
+; CHECK-NOT: lis 3
+; CHECK: sc
+  br i1 undef, label %1, label %2
+
+;