diff --git a/docs/LangRef.rst b/docs/LangRef.rst
index c7ea1c1bf236..cf1ceab1f1c6 100644
--- a/docs/LangRef.rst
+++ b/docs/LangRef.rst
@@ -1289,9 +1289,9 @@ example:
     that are recognized by LLVM to handle asynchronous exceptions, such as
     SEH, will still provide their implementation defined semantics.
 ``optnone``
-    This function attribute indicates that the function is not optimized
-    by any optimization or code generator passes with the
-    exception of interprocedural optimization passes.
+    This function attribute indicates that most optimization passes will skip
+    this function, with the exception of interprocedural optimization passes.
+    Code generation defaults to the "fast" instruction selector.
     This attribute cannot be used together with the ``alwaysinline``
     attribute; this attribute is also incompatible
     with the ``minsize`` attribute and the ``optsize`` attribute.
@@ -9337,6 +9337,48 @@ Semantics:
 
 See the description for :ref:`llvm.stacksave <int_stacksave>`.
 
+.. _int_get_dynamic_area_offset:
+
+'``llvm.get.dynamic.area.offset``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare i32 @llvm.get.dynamic.area.offset.i32()
+      declare i64 @llvm.get.dynamic.area.offset.i64()
+
+Overview:
+"""""""""
+
+The '``llvm.get.dynamic.area.offset.*``' intrinsic family is used to
+get the offset from the native stack pointer to the address of the most
+recent dynamic alloca on the caller's stack. These intrinsics are
+intended for use in combination with
+:ref:`llvm.stacksave <int_stacksave>` to get a
+pointer to the most recent dynamic alloca. This is useful, for example,
+for AddressSanitizer's stack unpoisoning routines.
+
+Semantics:
+""""""""""
+
+These intrinsics return a non-negative integer value that can be used to
+get the address of the most recent dynamic alloca, allocated by :ref:`alloca <i_alloca>`
+on the caller's stack. In particular, for targets where the stack grows downwards,
+adding this offset to the native stack pointer gives the address of the most
+recent dynamic alloca. For targets where the stack grows upwards, the situation is a bit more
+complicated, because subtracting this value from the stack pointer gives the address
+one past the end of the most recent dynamic alloca.
+
+Although for most targets `llvm.get.dynamic.area.offset <int_get_dynamic_area_offset>`
+returns just a zero, for others, such as PowerPC and PowerPC64, it returns a
+compile-time-known constant value.
+
+The return value type of :ref:`llvm.get.dynamic.area.offset <int_get_dynamic_area_offset>`
+must match the target's generic address space's (address space 0) pointer type.
+
 '``llvm.prefetch``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
diff --git a/include/llvm/ADT/STLExtras.h b/include/llvm/ADT/STLExtras.h
index 1bd3b291e0ef..d4360fa8d218 100644
--- a/include/llvm/ADT/STLExtras.h
+++ b/include/llvm/ADT/STLExtras.h
@@ -371,6 +371,21 @@ bool all_of(R &&Range, UnaryPredicate &&P) {
                      std::forward<UnaryPredicate>(P));
 }
 
+/// Provide wrappers to std::any_of which take ranges instead of having to pass
+/// begin/end explicitly.
+template <typename R, typename UnaryPredicate>
+bool any_of(R &&Range, UnaryPredicate &&P) {
+  return std::any_of(Range.begin(), Range.end(),
+                     std::forward<UnaryPredicate>(P));
+}
+
+/// Provide wrappers to std::find which take ranges instead of having to pass
+/// begin/end explicitly.
+template +auto find(R &&Range, const T &val) -> decltype(Range.begin()) { + return std::find(Range.begin(), Range.end(), val); +} + //===----------------------------------------------------------------------===// // Extra additions to //===----------------------------------------------------------------------===// diff --git a/include/llvm/ADT/StringMap.h b/include/llvm/ADT/StringMap.h index 194235fac570..700bb9e10ef7 100644 --- a/include/llvm/ADT/StringMap.h +++ b/include/llvm/ADT/StringMap.h @@ -232,6 +232,13 @@ class StringMap : public StringMapImpl { : StringMapImpl(InitialSize, static_cast(sizeof(MapEntryTy))), Allocator(A) {} + StringMap(std::initializer_list> List) + : StringMapImpl(static_cast(sizeof(MapEntryTy))) { + for (const auto &P : List) { + insert(P); + } + } + StringMap(StringMap &&RHS) : StringMapImpl(std::move(RHS)), Allocator(std::move(RHS.Allocator)) {} diff --git a/include/llvm/ADT/StringSet.h b/include/llvm/ADT/StringSet.h index 3e0cc200b6dd..08626dc7af84 100644 --- a/include/llvm/ADT/StringSet.h +++ b/include/llvm/ADT/StringSet.h @@ -23,6 +23,11 @@ namespace llvm { class StringSet : public llvm::StringMap { typedef llvm::StringMap base; public: + StringSet() = default; + StringSet(std::initializer_list S) { + for (StringRef X : S) + insert(X); + } std::pair insert(StringRef Key) { assert(!Key.empty()); diff --git a/include/llvm/ADT/Triple.h b/include/llvm/ADT/Triple.h index e50cec1f5e80..e01db0a61fd5 100644 --- a/include/llvm/ADT/Triple.h +++ b/include/llvm/ADT/Triple.h @@ -93,6 +93,7 @@ class Triple { enum SubArchType { NoSubArch, + ARMSubArch_v8_2a, ARMSubArch_v8_1a, ARMSubArch_v8, ARMSubArch_v7, diff --git a/include/llvm/Analysis/BranchProbabilityInfo.h b/include/llvm/Analysis/BranchProbabilityInfo.h index 89dec14b2b3e..69dae5e90785 100644 --- a/include/llvm/Analysis/BranchProbabilityInfo.h +++ b/include/llvm/Analysis/BranchProbabilityInfo.h @@ -61,6 +61,9 @@ class BranchProbabilityInfo { BranchProbability getEdgeProbability(const BasicBlock *Src, const BasicBlock *Dst) const; + BranchProbability getEdgeProbability(const BasicBlock *Src, + succ_const_iterator Dst) const; + /// \brief Test if an edge is hot relative to other out-edges of the Src. /// /// Check whether this edge out of the source block is 'hot'. We define hot diff --git a/include/llvm/Analysis/EHPersonalities.h b/include/llvm/Analysis/EHPersonalities.h new file mode 100644 index 000000000000..4a56728fbb4a --- /dev/null +++ b/include/llvm/Analysis/EHPersonalities.h @@ -0,0 +1,83 @@ +//===- EHPersonalities.h - Compute EH-related information -----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_ANALYSIS_EHPERSONALITIES_H +#define LLVM_ANALYSIS_EHPERSONALITIES_H + +#include "llvm/Support/ErrorHandling.h" + +namespace llvm { +class Function; +class Value; + +enum class EHPersonality { + Unknown, + GNU_Ada, + GNU_C, + GNU_CXX, + GNU_ObjC, + MSVC_X86SEH, + MSVC_Win64SEH, + MSVC_CXX, + CoreCLR +}; + +/// \brief See if the given exception handling personality function is one +/// that we understand. If so, return a description of it; otherwise return +/// Unknown. +EHPersonality classifyEHPersonality(const Value *Pers); + +/// \brief Returns true if this personality function catches asynchronous +/// exceptions. 
+inline bool isAsynchronousEHPersonality(EHPersonality Pers) { + // The two SEH personality functions can catch asynch exceptions. We assume + // unknown personalities don't catch asynch exceptions. + switch (Pers) { + case EHPersonality::MSVC_X86SEH: + case EHPersonality::MSVC_Win64SEH: + return true; + default: + return false; + } + llvm_unreachable("invalid enum"); +} + +/// \brief Returns true if this is a personality function that invokes +/// handler funclets (which must return to it). +inline bool isFuncletEHPersonality(EHPersonality Pers) { + switch (Pers) { + case EHPersonality::MSVC_CXX: + case EHPersonality::MSVC_X86SEH: + case EHPersonality::MSVC_Win64SEH: + case EHPersonality::CoreCLR: + return true; + default: + return false; + } + llvm_unreachable("invalid enum"); +} + +/// \brief Return true if this personality may be safely removed if there +/// are no invoke instructions remaining in the current function. +inline bool isNoOpWithoutInvoke(EHPersonality Pers) { + switch (Pers) { + case EHPersonality::Unknown: + return false; + // All known personalities currently have this behavior + default: + return true; + } + llvm_unreachable("invalid enum"); +} + +bool canSimplifyInvokeNoUnwind(const Function *F); + +} // end namespace llvm + +#endif diff --git a/include/llvm/Analysis/LibCallSemantics.h b/include/llvm/Analysis/LibCallSemantics.h deleted file mode 100644 index 14ecb55f340b..000000000000 --- a/include/llvm/Analysis/LibCallSemantics.h +++ /dev/null @@ -1,84 +0,0 @@ -//===- LibCallSemantics.h - Describe library semantics --------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file defines interfaces that can be used to describe language specific -// runtime library interfaces (e.g. libc, libm, etc) to LLVM optimizers. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_ANALYSIS_LIBCALLSEMANTICS_H -#define LLVM_ANALYSIS_LIBCALLSEMANTICS_H - -#include "llvm/Analysis/AliasAnalysis.h" - -namespace llvm { -class InvokeInst; - - enum class EHPersonality { - Unknown, - GNU_Ada, - GNU_C, - GNU_CXX, - GNU_ObjC, - MSVC_X86SEH, - MSVC_Win64SEH, - MSVC_CXX, - CoreCLR - }; - - /// \brief See if the given exception handling personality function is one - /// that we understand. If so, return a description of it; otherwise return - /// Unknown. - EHPersonality classifyEHPersonality(const Value *Pers); - - /// \brief Returns true if this personality function catches asynchronous - /// exceptions. - inline bool isAsynchronousEHPersonality(EHPersonality Pers) { - // The two SEH personality functions can catch asynch exceptions. We assume - // unknown personalities don't catch asynch exceptions. - switch (Pers) { - case EHPersonality::MSVC_X86SEH: - case EHPersonality::MSVC_Win64SEH: - return true; - default: return false; - } - llvm_unreachable("invalid enum"); - } - - /// \brief Returns true if this is a personality function that invokes - /// handler funclets (which must return to it). 
- inline bool isFuncletEHPersonality(EHPersonality Pers) { - switch (Pers) { - case EHPersonality::MSVC_CXX: - case EHPersonality::MSVC_X86SEH: - case EHPersonality::MSVC_Win64SEH: - case EHPersonality::CoreCLR: - return true; - default: return false; - } - llvm_unreachable("invalid enum"); - } - - /// \brief Return true if this personality may be safely removed if there - /// are no invoke instructions remaining in the current function. - inline bool isNoOpWithoutInvoke(EHPersonality Pers) { - switch (Pers) { - case EHPersonality::Unknown: - return false; - // All known personalities currently have this behavior - default: return true; - } - llvm_unreachable("invalid enum"); - } - - bool canSimplifyInvokeNoUnwind(const Function *F); - -} // end namespace llvm - -#endif diff --git a/include/llvm/Analysis/ScalarEvolution.h b/include/llvm/Analysis/ScalarEvolution.h index da8c68aa6832..f674cc7ee56f 100644 --- a/include/llvm/Analysis/ScalarEvolution.h +++ b/include/llvm/Analysis/ScalarEvolution.h @@ -183,7 +183,7 @@ namespace llvm { protected: SCEVPredicateKind Kind; - ~SCEVPredicate() = default; + virtual ~SCEVPredicate(); SCEVPredicate(const SCEVPredicate&) = default; SCEVPredicate &operator=(const SCEVPredicate&) = default; diff --git a/include/llvm/Analysis/TargetTransformInfoImpl.h b/include/llvm/Analysis/TargetTransformInfoImpl.h index 415a85e99069..eedf1a61ba82 100644 --- a/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -22,6 +22,7 @@ #include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/Operator.h" #include "llvm/IR/Type.h" +#include "llvm/Analysis/VectorUtils.h" namespace llvm { @@ -415,21 +416,28 @@ class TargetTransformInfoImplCRTPBase : public TargetTransformInfoImplBase { (Ptr == nullptr ? 0 : Ptr->getType()->getPointerAddressSpace()); auto GTI = gep_type_begin(PointerType::get(PointeeType, AS), Operands); for (auto I = Operands.begin(); I != Operands.end(); ++I, ++GTI) { + // We assume that the cost of Scalar GEP with constant index and the + // cost of Vector GEP with splat constant index are the same. + const ConstantInt *ConstIdx = dyn_cast(*I); + if (!ConstIdx) + if (auto Splat = getSplatValue(*I)) + ConstIdx = dyn_cast(Splat); if (isa(*GTI)) { int64_t ElementSize = DL.getTypeAllocSize(GTI.getIndexedType()); - if (const ConstantInt *ConstIdx = dyn_cast(*I)) { + if (ConstIdx) BaseOffset += ConstIdx->getSExtValue() * ElementSize; - } else { + else { // Needs scale register. - if (Scale != 0) { + if (Scale != 0) // No addressing mode takes two scale registers. return TTI::TCC_Basic; - } Scale = ElementSize; } } else { StructType *STy = cast(*GTI); - uint64_t Field = cast(*I)->getZExtValue(); + // For structures the index is always splat or scalar constant + assert(ConstIdx && "Unexpected GEP index"); + uint64_t Field = ConstIdx->getZExtValue(); BaseOffset += DL.getStructLayout(STy)->getElementOffset(Field); } } diff --git a/include/llvm/Analysis/VectorUtils.h b/include/llvm/Analysis/VectorUtils.h index 48ef76a9c8da..531803adf5e4 100644 --- a/include/llvm/Analysis/VectorUtils.h +++ b/include/llvm/Analysis/VectorUtils.h @@ -86,7 +86,7 @@ Value *findScalarElement(Value *V, unsigned EltNo); /// \brief Get splat value if the input is a splat vector or return nullptr. /// The value may be extracted from a splat constants vector or from /// a sequence of instructions that broadcast a single value into a vector. 
-Value *getSplatValue(Value *V); +const Value *getSplatValue(const Value *V); /// \brief Compute a map of integer instructions to their minimum legal type /// size. diff --git a/include/llvm/Bitcode/ReaderWriter.h b/include/llvm/Bitcode/ReaderWriter.h index 1c08ded1656a..3e127290f378 100644 --- a/include/llvm/Bitcode/ReaderWriter.h +++ b/include/llvm/Bitcode/ReaderWriter.h @@ -71,15 +71,13 @@ namespace llvm { DiagnosticHandlerFunction DiagnosticHandler); /// Parse the specified bitcode buffer, returning the function info index. - /// If ExportingModule is true, check for functions in the index from this - /// module when the combined index is built during parsing and set flag. /// If IsLazy is true, parse the entire function summary into /// the index. Otherwise skip the function summary section, and only create /// an index object with a map from function name to function summary offset. /// The index is used to perform lazy function summary reading later. ErrorOr> getFunctionInfoIndex( MemoryBufferRef Buffer, DiagnosticHandlerFunction DiagnosticHandler, - const Module *ExportingModule = nullptr, bool IsLazy = false); + bool IsLazy = false); /// This method supports lazy reading of function summary data from the /// combined index during function importing. When reading the combined index diff --git a/include/llvm/CodeGen/ISDOpcodes.h b/include/llvm/CodeGen/ISDOpcodes.h index aaf08e14f57d..4be993a9fbbb 100644 --- a/include/llvm/CodeGen/ISDOpcodes.h +++ b/include/llvm/CodeGen/ISDOpcodes.h @@ -754,6 +754,12 @@ namespace ISD { GC_TRANSITION_START, GC_TRANSITION_END, + /// GET_DYNAMIC_AREA_OFFSET - get offset from native SP to the address of + /// the most recent dynamic alloca. For most targets that would be 0, but + /// for some others (e.g. PowerPC, PowerPC64) that would be compile-time + /// known nonzero constant. The only operand here is the chain. + GET_DYNAMIC_AREA_OFFSET, + /// BUILTIN_OP_END - This must be the last enum value in this list. /// The target-specific pre-isel opcode values start here. BUILTIN_OP_END diff --git a/include/llvm/CodeGen/MachineBasicBlock.h b/include/llvm/CodeGen/MachineBasicBlock.h index a2b1a850ec76..ac87f4f901f5 100644 --- a/include/llvm/CodeGen/MachineBasicBlock.h +++ b/include/llvm/CodeGen/MachineBasicBlock.h @@ -91,13 +91,6 @@ class MachineBasicBlock std::vector Predecessors; std::vector Successors; - /// Keep track of the weights to the successors. This vector has the same - /// order as Successors, or it is empty if we don't use it (disable - /// optimization). - std::vector Weights; - typedef std::vector::iterator weight_iterator; - typedef std::vector::const_iterator const_weight_iterator; - /// Keep track of the probabilities to the successors. This vector has the /// same order as Successors, or it is empty if we don't use it (disable /// optimization). @@ -440,26 +433,16 @@ class MachineBasicBlock // Machine-CFG mutators - /// Add Succ as a successor of this MachineBasicBlock. The Predecessors list - /// of Succ is automatically updated. WEIGHT parameter is stored in Weights - /// list and it may be used by MachineBranchProbabilityInfo analysis to - /// calculate branch probability. - /// - /// Note that duplicate Machine CFG edges are not allowed. - void addSuccessor(MachineBasicBlock *Succ, uint32_t Weight = 0); - - /// Add Succ as a successor of this MachineBasicBlock. The Predecessors list - /// of Succ is automatically updated. The weight is not provided because BPI - /// is not available (e.g. 
-O0 is used), in which case edge weights won't be - /// used. Using this interface can save some space. - void addSuccessorWithoutWeight(MachineBasicBlock *Succ); - /// Add Succ as a successor of this MachineBasicBlock. The Predecessors list /// of Succ is automatically updated. PROB parameter is stored in - /// Probabilities list. + /// Probabilities list. The default probability is set as unknown. Mixing + /// known and unknown probabilities in successor list is not allowed. When all + /// successors have unknown probabilities, 1 / N is returned as the + /// probability for each successor, where N is the number of successors. /// /// Note that duplicate Machine CFG edges are not allowed. - void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob); + void addSuccessor(MachineBasicBlock *Succ, + BranchProbability Prob = BranchProbability::getUnknown()); /// Add Succ as a successor of this MachineBasicBlock. The Predecessors list /// of Succ is automatically updated. The probability is not provided because @@ -467,9 +450,6 @@ class MachineBasicBlock /// won't be used. Using this interface can save some space. void addSuccessorWithoutProb(MachineBasicBlock *Succ); - /// Set successor weight of a given iterator. - void setSuccWeight(succ_iterator I, uint32_t Weight); - /// Set successor probability of a given iterator. void setSuccProbability(succ_iterator I, BranchProbability Prob); @@ -488,7 +468,7 @@ class MachineBasicBlock /// Return the iterator to the element after the one removed. succ_iterator removeSuccessor(succ_iterator I); - /// Replace successor OLD with NEW and update weight info. + /// Replace successor OLD with NEW and update probability info. void replaceSuccessor(MachineBasicBlock *Old, MachineBasicBlock *New); /// Transfers all the successors from MBB to this machine basic block (i.e., @@ -500,9 +480,6 @@ class MachineBasicBlock /// operands in the successor blocks which refer to FromMBB to refer to this. void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB); - /// Return true if any of the successors have weights attached to them. - bool hasSuccessorWeights() const { return !Weights.empty(); } - /// Return true if any of the successors have probabilities attached to them. bool hasSuccessorProbabilities() const { return !Probs.empty(); } @@ -759,10 +736,6 @@ class MachineBasicBlock private: - /// Return weight iterator corresponding to the I successor iterator. - weight_iterator getWeightIterator(succ_iterator I); - const_weight_iterator getWeightIterator(const_succ_iterator I) const; - /// Return probability iterator corresponding to the I successor iterator. probability_iterator getProbabilityIterator(succ_iterator I); const_probability_iterator @@ -771,11 +744,6 @@ class MachineBasicBlock friend class MachineBranchProbabilityInfo; friend class MIPrinter; - /// Return weight of the edge from this block to MBB. This method should NOT - /// be called directly, but by using getEdgeWeight method from - /// MachineBranchProbabilityInfo class. - uint32_t getSuccWeight(const_succ_iterator Succ) const; - /// Return probability of the edge from this block to MBB. This method should /// NOT be called directly, but by using getEdgeProbability method from /// MachineBranchProbabilityInfo class. 
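A minimal sketch of how a pass might use the reworked addSuccessor() interface above, assuming a two-way conditional branch; the helper name, the blocks, and the 3/4 probability are illustrative and not part of the patch:

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/Support/BranchProbability.h"
using namespace llvm;

// Attach both outgoing edges of a conditional branch. Either every edge gets
// an explicit probability, or none does: mixing known and unknown
// probabilities in the successor list is not allowed.
void addCondBrSuccessors(MachineBasicBlock &MBB, MachineBasicBlock &Taken,
                         MachineBasicBlock &Fallthrough, bool HaveEstimate) {
  if (HaveEstimate) {
    BranchProbability TakenProb(3, 4); // e.g. branch assumed 75% taken
    MBB.addSuccessor(&Taken, TakenProb);
    MBB.addSuccessor(&Fallthrough, TakenProb.getCompl());
  } else {
    // The default argument leaves the probabilities unknown; queries then
    // report 1 / N for each of the N successors.
    MBB.addSuccessor(&Taken);
    MBB.addSuccessor(&Fallthrough);
  }
}

Keeping the two edges consistent (both known or both unknown) is what lets the probability list stay either fully populated or empty, mirroring the old weight-list behaviour without storing raw weights.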
diff --git a/include/llvm/CodeGen/MachineBranchProbabilityInfo.h b/include/llvm/CodeGen/MachineBranchProbabilityInfo.h index 058ab32f3aa9..608e8d257874 100644 --- a/include/llvm/CodeGen/MachineBranchProbabilityInfo.h +++ b/include/llvm/CodeGen/MachineBranchProbabilityInfo.h @@ -55,10 +55,15 @@ class MachineBranchProbabilityInfo : public ImmutablePass { uint32_t getEdgeWeight(const MachineBasicBlock *Src, MachineBasicBlock::const_succ_iterator Dst) const; - // Get sum of the block successors' weights, potentially scaling them to fit - // within 32-bits. If scaling is required, sets Scale based on the necessary - // adjustment. Any edge weights used with the sum should be divided by Scale. - uint32_t getSumForBlock(const MachineBasicBlock *MBB, uint32_t &Scale) const; + // Return edge probability. + BranchProbability getEdgeProbability(const MachineBasicBlock *Src, + const MachineBasicBlock *Dst) const; + + // Same as above, but using a const_succ_iterator from Src. This is faster + // when the iterator is already available. + BranchProbability + getEdgeProbability(const MachineBasicBlock *Src, + MachineBasicBlock::const_succ_iterator Dst) const; // A 'Hot' edge is an edge which probability is >= 80%. bool isEdgeHot(const MachineBasicBlock *Src, @@ -68,15 +73,6 @@ class MachineBranchProbabilityInfo : public ImmutablePass { // NB: This routine's complexity is linear on the number of successors. MachineBasicBlock *getHotSucc(MachineBasicBlock *MBB) const; - // Return a probability as a fraction between 0 (0% probability) and - // 1 (100% probability), however the value is never equal to 0, and can be 1 - // only iff SRC block has only one successor. - // NB: This routine's complexity is linear on the number of successors of - // Src. Querying sequentially for each successor's probability is a quadratic - // query pattern. - BranchProbability getEdgeProbability(const MachineBasicBlock *Src, - const MachineBasicBlock *Dst) const; - // Print value between 0 (0% probability) and 1 (100% probability), // however the value is never equal to 0, and can be 1 only iff SRC block // has only one successor. diff --git a/include/llvm/CodeGen/MachineModuleInfo.h b/include/llvm/CodeGen/MachineModuleInfo.h index fd42b46476c5..43b9f5203c50 100644 --- a/include/llvm/CodeGen/MachineModuleInfo.h +++ b/include/llvm/CodeGen/MachineModuleInfo.h @@ -35,7 +35,7 @@ #include "llvm/ADT/PointerIntPair.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/Analysis/LibCallSemantics.h" +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/ValueHandle.h" diff --git a/include/llvm/CodeGen/MachineRegisterInfo.h b/include/llvm/CodeGen/MachineRegisterInfo.h index cbfd8a37eaa6..0a1f62006327 100644 --- a/include/llvm/CodeGen/MachineRegisterInfo.h +++ b/include/llvm/CodeGen/MachineRegisterInfo.h @@ -73,7 +73,7 @@ class MachineRegisterInfo { /// PhysRegUseDefLists - This is an array of the head of the use/def list for /// physical registers. - std::vector PhysRegUseDefLists; + std::unique_ptr PhysRegUseDefLists; /// getRegUseDefListHead - Return the head pointer for the register use/def /// list for the specified virtual or physical register. 
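To illustrate the iterator-based getEdgeProbability() overload added above, here is a rough sketch, loosely modelled on what getHotSucc() does; the function name and selection policy are made up for illustration:

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
#include "llvm/Support/BranchProbability.h"
using namespace llvm;

// Walk MBB's successors once, querying each edge through the successor
// iterator so the analysis does not have to re-search the successor list
// for every query.
static MachineBasicBlock *
findLikeliestSuccessor(const MachineBasicBlock &MBB,
                       const MachineBranchProbabilityInfo &MBPI) {
  MachineBasicBlock *Best = nullptr;
  BranchProbability BestProb = BranchProbability::getZero();
  for (auto I = MBB.succ_begin(), E = MBB.succ_end(); I != E; ++I) {
    BranchProbability Prob = MBPI.getEdgeProbability(&MBB, I);
    if (!Best || BestProb < Prob) {
      Best = *I;
      BestProb = Prob;
    }
  }
  return Best;
}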
diff --git a/include/llvm/CodeGen/RegisterPressure.h b/include/llvm/CodeGen/RegisterPressure.h index 166bd8686891..e296701d8e8c 100644 --- a/include/llvm/CodeGen/RegisterPressure.h +++ b/include/llvm/CodeGen/RegisterPressure.h @@ -328,16 +328,12 @@ class RegPressureTracker { // position changes while pressure does not. void setPos(MachineBasicBlock::const_iterator Pos) { CurrPos = Pos; } - /// \brief Get the SlotIndex for the first nondebug instruction including or - /// after the current position. - SlotIndex getCurrSlot() const; - /// Recede across the previous instruction. - bool recede(SmallVectorImpl *LiveUses = nullptr, + void recede(SmallVectorImpl *LiveUses = nullptr, PressureDiff *PDiff = nullptr); /// Advance across the current instruction. - bool advance(); + void advance(); /// Finalize the region boundaries and recored live ins and live outs. void closeRegion(); @@ -354,8 +350,7 @@ class RegPressureTracker { ArrayRef getLiveThru() const { return LiveThruPressure; } /// Get the resulting register pressure over the traversed region. - /// This result is complete if either advance() or recede() has returned true, - /// or if closeRegion() was explicitly invoked. + /// This result is complete if closeRegion() was explicitly invoked. RegisterPressure &getPressure() { return P; } const RegisterPressure &getPressure() const { return P; } @@ -365,9 +360,6 @@ class RegPressureTracker { return CurrSetPressure; } - void discoverLiveOut(unsigned Reg); - void discoverLiveIn(unsigned Reg); - bool isTopClosed() const; bool isBottomClosed() const; @@ -442,6 +434,13 @@ class RegPressureTracker { void dump() const; protected: + void discoverLiveOut(unsigned Reg); + void discoverLiveIn(unsigned Reg); + + /// \brief Get the SlotIndex for the first nondebug instruction including or + /// after the current position. + SlotIndex getCurrSlot() const; + const LiveRange *getLiveRange(unsigned Reg) const; void increaseRegPressure(ArrayRef Regs); diff --git a/include/llvm/CodeGen/RegisterScavenging.h b/include/llvm/CodeGen/RegisterScavenging.h index 6517a58f5c07..122c78534253 100644 --- a/include/llvm/CodeGen/RegisterScavenging.h +++ b/include/llvm/CodeGen/RegisterScavenging.h @@ -74,10 +74,6 @@ class RegScavenger { /// Start tracking liveness from the begin of the specific basic block. void enterBasicBlock(MachineBasicBlock *mbb); - /// Allow resetting register state info for multiple - /// passes over/within the same function. - void initRegState(); - /// Move the internal MBB iterator and update register states. void forward(); @@ -180,6 +176,9 @@ class RegScavenger { unsigned InstrLimit, MachineBasicBlock::iterator &UseMI); + /// Allow resetting register state info for multiple + /// passes over/within the same function. 
+ void initRegState(); }; } // End llvm namespace diff --git a/include/llvm/CodeGen/WinEHFuncInfo.h b/include/llvm/CodeGen/WinEHFuncInfo.h index 5def70692ba5..5e8bb56eb617 100644 --- a/include/llvm/CodeGen/WinEHFuncInfo.h +++ b/include/llvm/CodeGen/WinEHFuncInfo.h @@ -22,6 +22,7 @@ namespace llvm { class AllocaInst; class BasicBlock; +class CatchReturnInst; class Constant; class Function; class GlobalVariable; diff --git a/include/llvm/DebugInfo/DWARF/DWARFUnitIndex.h b/include/llvm/DebugInfo/DWARF/DWARFUnitIndex.h index 0d49d7c0cf82..a85c2f9f0a23 100644 --- a/include/llvm/DebugInfo/DWARF/DWARFUnitIndex.h +++ b/include/llvm/DebugInfo/DWARF/DWARFUnitIndex.h @@ -41,12 +41,15 @@ class DWARFUnitIndex { public: class Entry { - const DWARFUnitIndex *Index; - uint64_t Signature; + public: struct SectionContribution { uint32_t Offset; uint32_t Length; }; + + private: + const DWARFUnitIndex *Index; + uint64_t Signature; std::unique_ptr Contributions; friend class DWARFUnitIndex; @@ -55,16 +58,21 @@ class DWARFUnitIndex { const SectionContribution *getOffset() const; }; +private: struct Header Header; + DWARFSectionKind InfoColumnKind; int InfoColumn = -1; std::unique_ptr ColumnKinds; std::unique_ptr Rows; static StringRef getColumnHeader(DWARFSectionKind DS); + bool parseImpl(DataExtractor IndexData); public: bool parse(DataExtractor IndexData); + DWARFUnitIndex(DWARFSectionKind InfoColumnKind) + : InfoColumnKind(InfoColumnKind) {} void dump(raw_ostream &OS) const; const Entry *getFromOffset(uint32_t Offset) const; }; diff --git a/include/llvm/IR/Attributes.h b/include/llvm/IR/Attributes.h index 52a9ca83013b..286a6dae37d8 100644 --- a/include/llvm/IR/Attributes.h +++ b/include/llvm/IR/Attributes.h @@ -227,6 +227,11 @@ class AttributeSet { AttributeSet addAttribute(LLVMContext &C, unsigned Index, StringRef Kind, StringRef Value) const; + /// Add an attribute to the attribute set at the given indices. Because + /// attribute sets are immutable, this returns a new set. + AttributeSet addAttribute(LLVMContext &C, ArrayRef Indices, + Attribute A) const; + /// \brief Add attributes to the attribute set at the given index. Because /// attribute sets are immutable, this returns a new set. AttributeSet addAttributes(LLVMContext &C, unsigned Index, diff --git a/include/llvm/IR/FunctionInfo.h b/include/llvm/IR/FunctionInfo.h index b8801693ab51..eba088a61bc0 100644 --- a/include/llvm/IR/FunctionInfo.h +++ b/include/llvm/IR/FunctionInfo.h @@ -165,19 +165,8 @@ class FunctionInfoIndex { /// Holds strings for combined index, mapping to the corresponding module ID. ModulePathStringTableTy ModulePathStringTable; - /// The main module being compiled, that we are importing into, if applicable. - /// Used to check if any of its functions are in the index and therefore - /// potentially exported. - const Module *ExportingModule; - - /// Flag indicating whether the exporting module has any functions in the - /// index and therefore potentially exported (imported into another module). - bool HasExportedFunctions; - public: - FunctionInfoIndex(const Module *M = nullptr) - : ExportingModule(M), HasExportedFunctions(false){}; - ~FunctionInfoIndex() = default; + FunctionInfoIndex() = default; // Disable the copy constructor and assignment operators, so // no unexpected copying/moving occurs. @@ -201,14 +190,6 @@ class FunctionInfoIndex { /// Add a function info for a function of the given name. 
void addFunctionInfo(StringRef FuncName, std::unique_ptr Info) { - // Update the HasExportedFunctions flag, but only if we had a function - // summary (i.e. we aren't parsing them lazily or have a bitcode file - // without a function summary section). - if (ExportingModule && Info->functionSummary()) { - if (ExportingModule->getModuleIdentifier() == - Info->functionSummary()->modulePath()) - HasExportedFunctions = true; - } FunctionMap[FuncName].push_back(std::move(Info)); } @@ -248,11 +229,10 @@ class FunctionInfoIndex { } /// Check if the given Module has any functions available for exporting - /// in the index. - bool hasExportedFunctions(const Module *M) const { - assert(M == ExportingModule && - "Checking for exported functions on unexpected module"); - return HasExportedFunctions; + /// in the index. We consider any module present in the ModulePathStringTable + /// to have exported functions. + bool hasExportedFunctions(const Module &M) const { + return ModulePathStringTable.count(M.getModuleIdentifier()); } }; diff --git a/include/llvm/IR/Intrinsics.td b/include/llvm/IR/Intrinsics.td index 917cf56e2e88..e838fb332de9 100644 --- a/include/llvm/IR/Intrinsics.td +++ b/include/llvm/IR/Intrinsics.td @@ -304,6 +304,8 @@ def int_stacksave : Intrinsic<[llvm_ptr_ty]>, def int_stackrestore : Intrinsic<[], [llvm_ptr_ty]>, GCCBuiltin<"__builtin_stack_restore">; +def int_get_dynamic_area_offset : Intrinsic<[llvm_anyint_ty]>; + // IntrReadWriteArgMem is more pessimistic than strictly necessary for prefetch, // however it does conveniently prevent the prefetch from being reordered // with respect to nearby accesses to the same memory. diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td index 7ec0f4b86f11..57ad278a68bd 100644 --- a/include/llvm/IR/IntrinsicsX86.td +++ b/include/llvm/IR/IntrinsicsX86.td @@ -6304,6 +6304,12 @@ let TargetPrefix = "x86" in { // Compares let TargetPrefix = "x86" in { // 512-bit + def int_x86_avx512_vcomi_sd : GCCBuiltin<"__builtin_ia32_vcomisd">, + Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty, + llvm_v2f64_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_x86_avx512_vcomi_ss : GCCBuiltin<"__builtin_ia32_vcomiss">, + Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty, + llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_x86_avx512_mask_pcmpeq_b_512 : GCCBuiltin<"__builtin_ia32_pcmpeqb512_mask">, Intrinsic<[llvm_i64_ty], [llvm_v64i8_ty, llvm_v64i8_ty, llvm_i64_ty], [IntrNoMem]>; diff --git a/include/llvm/LTO/LTOCodeGenerator.h b/include/llvm/LTO/LTOCodeGenerator.h index 0d3c79bf5e84..c322288a1ae9 100644 --- a/include/llvm/LTO/LTOCodeGenerator.h +++ b/include/llvm/LTO/LTOCodeGenerator.h @@ -39,7 +39,6 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/StringMap.h" -#include "llvm/Linker/Linker.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" #include @@ -49,6 +48,7 @@ namespace llvm { class LLVMContext; class DiagnosticInfo; class GlobalValue; + class Linker; class Mangler; class MemoryBuffer; class TargetLibraryInfo; @@ -171,7 +171,7 @@ struct LTOCodeGenerator { std::unique_ptr OwnedContext; LLVMContext &Context; std::unique_ptr MergedModule; - Linker IRLinker; + std::unique_ptr IRLinker; std::unique_ptr TargetMach; bool EmitDwarfDebugInfo = false; bool ScopeRestrictionsDone = false; diff --git a/include/llvm/Linker/Linker.h b/include/llvm/Linker/Linker.h index 610b1ddf9893..0c7dc910a65c 100644 --- a/include/llvm/Linker/Linker.h +++ b/include/llvm/Linker/Linker.h @@ -68,36 
+68,33 @@ class Linker { InternalizeLinkedSymbols = (1 << 2) }; - Linker(Module *M, DiagnosticHandlerFunction DiagnosticHandler); - Linker(Module *M); + Linker(Module &M, DiagnosticHandlerFunction DiagnosticHandler); + Linker(Module &M); - Module *getModule() const { return Composite; } - void deleteModule(); + Module &getModule() const { return Composite; } /// \brief Link \p Src into the composite. The source is destroyed. + /// /// Passing OverrideSymbols as true will have symbols from Src /// shadow those in the Dest. /// For ThinLTO function importing/exporting the \p FunctionInfoIndex - /// is passed. If a \p FuncToImport is provided, only that single - /// function is imported from the source module. + /// is passed. If \p FunctionsToImport is provided, only the functions that + /// are part of the set will be imported from the source module. + /// /// Returns true on error. - bool linkInModule(Module *Src, unsigned Flags = Flags::None, + bool linkInModule(Module &Src, unsigned Flags = Flags::None, const FunctionInfoIndex *Index = nullptr, - Function *FuncToImport = nullptr); + DenseSet *FunctionsToImport = nullptr); - /// \brief Set the composite to the passed-in module. - void setModule(Module *Dst); - - static bool LinkModules(Module *Dest, Module *Src, + static bool linkModules(Module &Dest, Module &Src, DiagnosticHandlerFunction DiagnosticHandler, unsigned Flags = Flags::None); - static bool LinkModules(Module *Dest, Module *Src, + static bool linkModules(Module &Dest, Module &Src, unsigned Flags = Flags::None); private: - void init(Module *M, DiagnosticHandlerFunction DiagnosticHandler); - Module *Composite; + Module &Composite; IdentifiedStructTypeSet IdentifiedStructTypes; diff --git a/include/llvm/MC/MCObjectFileInfo.h b/include/llvm/MC/MCObjectFileInfo.h index eaca3833dc89..388a208fb4a0 100644 --- a/include/llvm/MC/MCObjectFileInfo.h +++ b/include/llvm/MC/MCObjectFileInfo.h @@ -116,6 +116,9 @@ class MCObjectFileInfo { MCSection *DwarfStrOffDWOSection; MCSection *DwarfAddrSection; + // These are for Fission DWP files. + MCSection *DwarfCUIndexSection; + /// Section for newer gnu pubnames. MCSection *DwarfGnuPubNamesSection; /// Section for newer gnu pubtypes. @@ -262,6 +265,7 @@ class MCObjectFileInfo { MCSection *getDwarfLocDWOSection() const { return DwarfLocDWOSection; } MCSection *getDwarfStrOffDWOSection() const { return DwarfStrOffDWOSection; } MCSection *getDwarfAddrSection() const { return DwarfAddrSection; } + MCSection *getDwarfCUIndexSection() const { return DwarfCUIndexSection; } MCSection *getCOFFDebugSymbolsSection() const { return COFFDebugSymbolsSection; diff --git a/include/llvm/Object/FunctionIndexObjectFile.h b/include/llvm/Object/FunctionIndexObjectFile.h index 511a237881ed..74b461dc7cc7 100644 --- a/include/llvm/Object/FunctionIndexObjectFile.h +++ b/include/llvm/Object/FunctionIndexObjectFile.h @@ -88,7 +88,7 @@ class FunctionIndexObjectFile : public SymbolicFile { /// summary/index. static ErrorOr> create(MemoryBufferRef Object, DiagnosticHandlerFunction DiagnosticHandler, - const Module *ExportingModule = nullptr, bool IsLazy = false); + bool IsLazy = false); /// \brief Parse the function summary information for function with the /// given name out of the given buffer. Parsed information is @@ -104,8 +104,7 @@ class FunctionIndexObjectFile : public SymbolicFile { /// index object if found, or nullptr if not. 
ErrorOr> getFunctionIndexForFile(StringRef Path, - DiagnosticHandlerFunction DiagnosticHandler, - const Module *ExportingModule = nullptr); + DiagnosticHandlerFunction DiagnosticHandler); } #endif diff --git a/include/llvm/ProfileData/InstrProf.h b/include/llvm/ProfileData/InstrProf.h index 13f6c70b3e83..956485119102 100644 --- a/include/llvm/ProfileData/InstrProf.h +++ b/include/llvm/ProfileData/InstrProf.h @@ -428,19 +428,22 @@ instrprof_error InstrProfRecord::merge(InstrProfRecord &Other) { if (Counts.size() != Other.Counts.size()) return instrprof_error::count_mismatch; + instrprof_error Result = instrprof_error::success; + for (size_t I = 0, E = Other.Counts.size(); I < E; ++I) { - if (Counts[I] + Other.Counts[I] < Counts[I]) - return instrprof_error::counter_overflow; - Counts[I] += Other.Counts[I]; + bool ResultOverflowed; + Counts[I] = SaturatingAdd(Counts[I], Other.Counts[I], ResultOverflowed); + if (ResultOverflowed) + Result = instrprof_error::counter_overflow; } for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind) { - instrprof_error result = mergeValueProfData(Kind, Other); - if (result != instrprof_error::success) - return result; + instrprof_error MergeValueResult = mergeValueProfData(Kind, Other); + if (MergeValueResult != instrprof_error::success) + Result = MergeValueResult; } - return instrprof_error::success; + return Result; } inline support::endianness getHostEndianness() { diff --git a/include/llvm/ProfileData/InstrProfData.inc b/include/llvm/ProfileData/InstrProfData.inc index 8ff7003a0b2d..aefdbc1b3e47 100644 --- a/include/llvm/ProfileData/InstrProfData.inc +++ b/include/llvm/ProfileData/InstrProfData.inc @@ -291,6 +291,7 @@ typedef struct ValueProfData { */ void deserializeTo(InstrProfRecord &Record, InstrProfRecord::ValueMapType *VMap); + void operator delete(void *ptr) { ::operator delete(ptr); } #endif } ValueProfData; @@ -537,12 +538,13 @@ int initializeValueProfRuntimeRecord(ValueProfRuntimeRecord *RuntimeRecord, } NumValueKinds++; RuntimeRecord->SiteCountArray[I] = (uint8_t *)calloc(N, 1); - RuntimeRecord->NodesKind[I] = &RuntimeRecord->Nodes[S]; - if (!RuntimeRecord->NodesKind[I]) + if (!RuntimeRecord->SiteCountArray[I]) return 1; + RuntimeRecord->NodesKind[I] = Nodes ? &Nodes[S] : NULL; for (J = 0; J < N; J++) { + /* Compute value count for each site. */ uint32_t C = 0; - ValueProfNode *Site = RuntimeRecord->Nodes[S + J]; + ValueProfNode *Site = Nodes ? RuntimeRecord->NodesKind[I][J] : NULL; while (Site) { C++; Site = Site->Next; @@ -595,6 +597,8 @@ void getValueForSiteRT(const void *R, InstrProfValueData *Dst, uint32_t VK, unsigned I, N = 0; const ValueProfRuntimeRecord *Record = (const ValueProfRuntimeRecord *)R; N = getNumValueDataForSiteRT(R, VK, S); + if (N == 0) + return; ValueProfNode *VNode = Record->NodesKind[VK][S]; for (I = 0; I < N; I++) { Dst[I] = VNode->VData; diff --git a/include/llvm/ProfileData/InstrProfReader.h b/include/llvm/ProfileData/InstrProfReader.h index e0a93cec3208..49233366e164 100644 --- a/include/llvm/ProfileData/InstrProfReader.h +++ b/include/llvm/ProfileData/InstrProfReader.h @@ -259,36 +259,50 @@ class InstrProfLookupTrait { } }; -class InstrProfReaderIndex { - private: - typedef OnDiskIterableChainedHashTable IndexType; +struct InstrProfReaderIndexBase { + // Read all the profile records with the same key pointed to the current + // iterator. 
+ virtual std::error_code getRecords(ArrayRef &Data) = 0; + // Read all the profile records with the key equal to FuncName + virtual std::error_code getRecords(StringRef FuncName, + ArrayRef &Data) = 0; + virtual void advanceToNextKey() = 0; + virtual bool atEnd() const = 0; + virtual void setValueProfDataEndianness(support::endianness Endianness) = 0; + virtual ~InstrProfReaderIndexBase() {} +}; + +typedef OnDiskIterableChainedHashTable + OnDiskHashTableImplV3; - std::unique_ptr Index; - IndexType::data_iterator RecordIterator; +template +class InstrProfReaderIndex : public InstrProfReaderIndexBase { + +private: + std::unique_ptr HashTable; + typename HashTableImpl::data_iterator RecordIterator; uint64_t FormatVersion; // String table for holding a unique copy of all the strings in the profile. InstrProfStringTable StringTable; - public: - InstrProfReaderIndex() : Index(nullptr) {} - void Init(const unsigned char *Buckets, const unsigned char *const Payload, - const unsigned char *const Base, IndexedInstrProf::HashT HashType, - uint64_t Version); +public: + InstrProfReaderIndex(const unsigned char *Buckets, + const unsigned char *const Payload, + const unsigned char *const Base, + IndexedInstrProf::HashT HashType, uint64_t Version); - // Read all the pofile records with the same key pointed to the current - // iterator. - std::error_code getRecords(ArrayRef &Data); - // Read all the profile records with the key equal to FuncName + std::error_code getRecords(ArrayRef &Data) override; std::error_code getRecords(StringRef FuncName, - ArrayRef &Data); - - void advanceToNextKey() { RecordIterator++; } - bool atEnd() const { return RecordIterator == Index->data_end(); } - // Used for testing purpose only. - void setValueProfDataEndianness(support::endianness Endianness) { - Index->getInfoObj().setValueProfDataEndianness(Endianness); + ArrayRef &Data) override; + void advanceToNextKey() override { RecordIterator++; } + bool atEnd() const override { + return RecordIterator == HashTable->data_end(); + } + void setValueProfDataEndianness(support::endianness Endianness) override { + HashTable->getInfoObj().setValueProfDataEndianness(Endianness); } + ~InstrProfReaderIndex() override {} }; /// Reader for the indexed binary instrprof format. @@ -297,16 +311,16 @@ class IndexedInstrProfReader : public InstrProfReader { /// The profile data file contents. std::unique_ptr DataBuffer; /// The index into the profile data. - InstrProfReaderIndex Index; + std::unique_ptr Index; /// The maximal execution count among all functions. uint64_t MaxFunctionCount; IndexedInstrProfReader(const IndexedInstrProfReader &) = delete; IndexedInstrProfReader &operator=(const IndexedInstrProfReader &) = delete; - public: +public: IndexedInstrProfReader(std::unique_ptr DataBuffer) - : DataBuffer(std::move(DataBuffer)), Index() {} + : DataBuffer(std::move(DataBuffer)), Index(nullptr) {} /// Return true if the given buffer is in an indexed instrprof format. static bool hasFormat(const MemoryBuffer &DataBuffer); @@ -337,7 +351,7 @@ class IndexedInstrProfReader : public InstrProfReader { // Used for testing purpose only. 
void setValueProfDataEndianness(support::endianness Endianness) { - Index.setValueProfDataEndianness(Endianness); + Index->setValueProfDataEndianness(Endianness); } }; diff --git a/include/llvm/Support/ARMTargetParser.def b/include/llvm/Support/ARMTargetParser.def index f76ac8899359..2f99b0717adf 100644 --- a/include/llvm/Support/ARMTargetParser.def +++ b/include/llvm/Support/ARMTargetParser.def @@ -88,6 +88,9 @@ ARM_ARCH("armv8-a", AK_ARMV8A, "8-A", "v8", ARMBuildAttrs::CPUArch::v8, ARM_ARCH("armv8.1-a", AK_ARMV8_1A, "8.1-A", "v8.1a", ARMBuildAttrs::CPUArch::v8, FK_CRYPTO_NEON_FP_ARMV8, (AEK_SEC | AEK_MP | AEK_VIRT | AEK_HWDIVARM | AEK_HWDIV | AEK_DSP | AEK_CRC)) +ARM_ARCH("armv8.2-a", AK_ARMV8_2A, "8.2-A", "v8.2a", ARMBuildAttrs::CPUArch::v8, + FK_CRYPTO_NEON_FP_ARMV8, (AEK_SEC | AEK_MP | AEK_VIRT | AEK_HWDIVARM | + AEK_HWDIV | AEK_DSP | AEK_CRC)) // Non-standard Arch names. ARM_ARCH("iwmmxt", AK_IWMMXT, "iwmmxt", "", ARMBuildAttrs::CPUArch::v5TE, FK_NONE, AEK_NONE) @@ -115,6 +118,7 @@ ARM_ARCH_EXT_NAME("mp", AEK_MP, nullptr, nullptr) ARM_ARCH_EXT_NAME("simd", AEK_SIMD, nullptr, nullptr) ARM_ARCH_EXT_NAME("sec", AEK_SEC, nullptr, nullptr) ARM_ARCH_EXT_NAME("virt", AEK_VIRT, nullptr, nullptr) +ARM_ARCH_EXT_NAME("fp16", AEK_FP16, "+fullfp16", "-fullfp16") ARM_ARCH_EXT_NAME("os", AEK_OS, nullptr, nullptr) ARM_ARCH_EXT_NAME("iwmmxt", AEK_IWMMXT, nullptr, nullptr) ARM_ARCH_EXT_NAME("iwmmxt2", AEK_IWMMXT2, nullptr, nullptr) @@ -204,6 +208,7 @@ ARM_CPU_NAME("sc300", AK_ARMV7M, FK_NONE, false, AEK_NONE) ARM_CPU_NAME("cortex-m3", AK_ARMV7M, FK_NONE, true, AEK_NONE) ARM_CPU_NAME("cortex-m4", AK_ARMV7EM, FK_FPV4_SP_D16, true, AEK_NONE) ARM_CPU_NAME("cortex-m7", AK_ARMV7EM, FK_FPV5_D16, false, AEK_NONE) +ARM_CPU_NAME("cortex-a35", AK_ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false, AEK_CRC) ARM_CPU_NAME("cortex-a53", AK_ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, true, AEK_CRC) ARM_CPU_NAME("cortex-a57", AK_ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false, AEK_CRC) ARM_CPU_NAME("cortex-a72", AK_ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false, AEK_CRC) diff --git a/include/llvm/Support/BranchProbability.h b/include/llvm/Support/BranchProbability.h index 3620d4d5d772..2548384f346b 100644 --- a/include/llvm/Support/BranchProbability.h +++ b/include/llvm/Support/BranchProbability.h @@ -53,6 +53,9 @@ class BranchProbability { // Create a BranchProbability object with the given numerator and 1<<31 // as denominator. static BranchProbability getRaw(uint32_t N) { return BranchProbability(N); } + // Create a BranchProbability object from 64-bit integers. + static BranchProbability getBranchProbability(uint64_t Numerator, + uint64_t Denominator); // Normalize given probabilties so that the sum of them becomes approximate // one. 
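A small sketch of where the new 64-bit getBranchProbability() factory above might be used, assuming raw profile counters as input; the helper name is hypothetical:

#include "llvm/Support/BranchProbability.h"
#include <cstdint>
using namespace llvm;

// Profile counters are 64-bit and may exceed UINT32_MAX, so they are scaled
// into BranchProbability's fixed 1<<31 denominator instead of being truncated.
BranchProbability probabilityFromCounts(uint64_t EdgeCount,
                                        uint64_t TotalCount) {
  if (TotalCount == 0)
    return BranchProbability::getUnknown(); // no data recorded for this branch
  return BranchProbability::getBranchProbability(EdgeCount, TotalCount);
}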
@@ -131,10 +134,30 @@ class BranchProbability { bool operator==(BranchProbability RHS) const { return N == RHS.N; } bool operator!=(BranchProbability RHS) const { return !(*this == RHS); } - bool operator<(BranchProbability RHS) const { return N < RHS.N; } - bool operator>(BranchProbability RHS) const { return RHS < *this; } - bool operator<=(BranchProbability RHS) const { return !(RHS < *this); } - bool operator>=(BranchProbability RHS) const { return !(*this < RHS); } + + bool operator<(BranchProbability RHS) const { + assert(N != UnknownN && RHS.N != UnknownN && + "Unknown probability cannot participate in comparisons."); + return N < RHS.N; + } + + bool operator>(BranchProbability RHS) const { + assert(N != UnknownN && RHS.N != UnknownN && + "Unknown probability cannot participate in comparisons."); + return RHS < *this; + } + + bool operator<=(BranchProbability RHS) const { + assert(N != UnknownN && RHS.N != UnknownN && + "Unknown probability cannot participate in comparisons."); + return !(RHS < *this); + } + + bool operator>=(BranchProbability RHS) const { + assert(N != UnknownN && RHS.N != UnknownN && + "Unknown probability cannot participate in comparisons."); + return !(*this < RHS); + } }; inline raw_ostream &operator<<(raw_ostream &OS, BranchProbability Prob) { diff --git a/include/llvm/Support/OnDiskHashTable.h b/include/llvm/Support/OnDiskHashTable.h index c47134f46c8d..ac978d4c242c 100644 --- a/include/llvm/Support/OnDiskHashTable.h +++ b/include/llvm/Support/OnDiskHashTable.h @@ -263,11 +263,12 @@ template class OnDiskChainedHashTable { Info InfoObj; public: + typedef Info InfoType; typedef typename Info::internal_key_type internal_key_type; typedef typename Info::external_key_type external_key_type; - typedef typename Info::data_type data_type; - typedef typename Info::hash_value_type hash_value_type; - typedef typename Info::offset_type offset_type; + typedef typename Info::data_type data_type; + typedef typename Info::hash_value_type hash_value_type; + typedef typename Info::offset_type offset_type; OnDiskChainedHashTable(offset_type NumBuckets, offset_type NumEntries, const unsigned char *Buckets, diff --git a/include/llvm/Support/TargetParser.h b/include/llvm/Support/TargetParser.h index 6ca0281515e2..c21019d0c5b8 100644 --- a/include/llvm/Support/TargetParser.h +++ b/include/llvm/Support/TargetParser.h @@ -82,6 +82,7 @@ enum ArchExtKind : unsigned { AEK_SEC = 0x100, AEK_VIRT = 0x200, AEK_DSP = 0x400, + AEK_FP16 = 0x800, // Unsupported extensions. AEK_OS = 0x8000000, AEK_IWMMXT = 0x10000000, diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h index cb5a5796e983..819458dbb0f0 100644 --- a/include/llvm/Target/TargetLowering.h +++ b/include/llvm/Target/TargetLowering.h @@ -130,10 +130,12 @@ class TargetLoweringBase { /// support for these atomic instructions, and also have different options /// w.r.t. what they should expand to. enum class AtomicExpansionKind { - None, // Don't expand the instruction. - LLSC, // Expand the instruction into loadlinked/storeconditional; used - // by ARM/AArch64. - CmpXChg, // Expand the instruction into cmpxchg; used by at least X86. + None, // Don't expand the instruction. + LLSC, // Expand the instruction into loadlinked/storeconditional; used + // by ARM/AArch64. + LLOnly, // Expand the (load) instruction into just a load-linked, which has + // greater atomic guarantees than a normal load. + CmpXChg, // Expand the instruction into cmpxchg; used by at least X86. 
}; static ISD::NodeType getExtendForContent(BooleanContent Content) { diff --git a/include/llvm/Target/TargetMachine.h b/include/llvm/Target/TargetMachine.h index b40e4a69a4d2..b7760a61806f 100644 --- a/include/llvm/Target/TargetMachine.h +++ b/include/llvm/Target/TargetMachine.h @@ -102,6 +102,7 @@ class TargetMachine { const MCSubtargetInfo *STI; unsigned RequireStructuredCFG : 1; + unsigned O0WantsFastISel : 1; /// This API is here to support the C API, deprecated in 3.7 release. /// This should never be used outside of legacy existing client. @@ -190,6 +191,8 @@ class TargetMachine { void setOptLevel(CodeGenOpt::Level Level) const; void setFastISel(bool Enable) { Options.EnableFastISel = Enable; } + bool getO0WantsFastISel() { return O0WantsFastISel; } + void setO0WantsFastISel(bool Enable) { O0WantsFastISel = Enable; } bool shouldPrintMachineCode() const { return Options.PrintMachineCode; } diff --git a/include/llvm/Transforms/IPO/FunctionImport.h b/include/llvm/Transforms/IPO/FunctionImport.h index f06a19021750..0315c72811c1 100644 --- a/include/llvm/Transforms/IPO/FunctionImport.h +++ b/include/llvm/Transforms/IPO/FunctionImport.h @@ -18,15 +18,26 @@ class LLVMContext; class Module; class FunctionInfoIndex; -/// The function importer is automatically importing function from other modules -/// based on the provided summary informations. -class FunctionImporter { +/// Helper to load on demand a Module from file and cache it for subsequent +/// queries. It can be used with the FunctionImporter. +class ModuleLazyLoaderCache { + /// The context that will be used for importing. + LLVMContext &Context; /// Cache of lazily loaded module for import. StringMap> ModuleMap; - /// The context that will be used for importing. - LLVMContext &Context; +public: + /// Create the loader, Module will be initialized in \p Context. + ModuleLazyLoaderCache(LLVMContext &Context) : Context(Context) {} + + /// Retrieve a Module from the cache or lazily load it on demand. + Module &operator()(StringRef FileName); +}; + +/// The function importer is automatically importing function from other modules +/// based on the provided summary informations. +class FunctionImporter { /// The summaries index used to trigger importing. const FunctionInfoIndex &Index; @@ -35,13 +46,15 @@ class FunctionImporter { DiagnosticHandlerFunction DiagnosticHandler; /// Retrieve a Module from the cache or lazily load it on demand. - Module &getOrLoadModule(StringRef FileName); + std::function getLazyModule; public: /// Create a Function Importer. - FunctionImporter(LLVMContext &Context, const FunctionInfoIndex &Index, - DiagnosticHandlerFunction DiagnosticHandler) - : Context(Context), Index(Index), DiagnosticHandler(DiagnosticHandler) {} + FunctionImporter(const FunctionInfoIndex &Index, + DiagnosticHandlerFunction DiagnosticHandler, + std::function ModuleLoader) + : Index(Index), DiagnosticHandler(DiagnosticHandler), + getLazyModule(ModuleLoader) {} /// Import functions in Module \p M based on the summary informations. bool importFunctions(Module &M); diff --git a/include/llvm/Transforms/Utils/Local.h b/include/llvm/Transforms/Utils/Local.h index b7d67eaea3a0..1d707a1e5307 100644 --- a/include/llvm/Transforms/Utils/Local.h +++ b/include/llvm/Transforms/Utils/Local.h @@ -271,10 +271,20 @@ bool LowerDbgDeclare(Function &F); /// an alloca, if any. DbgDeclareInst *FindAllocaDbgDeclare(Value *V); -/// \brief Replaces llvm.dbg.declare instruction when an alloca is replaced with -/// a new value. 
If Deref is true, an additional DW_OP_deref is prepended to the -/// expression. If Offset is non-zero, a constant displacement is added to the -/// expression (after the optional Deref). Offset can be negative. +/// \brief Replaces llvm.dbg.declare instruction when the address it describes +/// is replaced with a new value. If Deref is true, an additional DW_OP_deref is +/// prepended to the expression. If Offset is non-zero, a constant displacement +/// is added to the expression (after the optional Deref). Offset can be +/// negative. +bool replaceDbgDeclare(Value *Address, Value *NewAddress, + Instruction *InsertBefore, DIBuilder &Builder, + bool Deref, int Offset); + +/// \brief Replaces llvm.dbg.declare instruction when the alloca it describes +/// is replaced with a new value. If Deref is true, an additional DW_OP_deref is +/// prepended to the expression. If Offset is non-zero, a constant displacement +/// is added to the expression (after the optional Deref). Offset can be +/// negative. New llvm.dbg.declare is inserted immediately before AI. bool replaceDbgDeclareForAlloca(AllocaInst *AI, Value *NewAllocaAddress, DIBuilder &Builder, bool Deref, int Offset = 0); diff --git a/lib/Analysis/BranchProbabilityInfo.cpp b/lib/Analysis/BranchProbabilityInfo.cpp index f48394698699..6cdf43a06a9f 100644 --- a/lib/Analysis/BranchProbabilityInfo.cpp +++ b/lib/Analysis/BranchProbabilityInfo.cpp @@ -647,6 +647,12 @@ getEdgeProbability(const BasicBlock *Src, const BasicBlock *Dst) const { return BranchProbability(N, D); } +BranchProbability +BranchProbabilityInfo::getEdgeProbability(const BasicBlock *Src, + succ_const_iterator Dst) const { + return getEdgeProbability(Src, Dst.getSuccessorIndex()); +} + raw_ostream & BranchProbabilityInfo::printEdgeProbability(raw_ostream &OS, const BasicBlock *Src, diff --git a/lib/Analysis/CMakeLists.txt b/lib/Analysis/CMakeLists.txt index cb5cd07493b6..69623619a8b0 100644 --- a/lib/Analysis/CMakeLists.txt +++ b/lib/Analysis/CMakeLists.txt @@ -25,6 +25,7 @@ add_llvm_library(LLVMAnalysis DivergenceAnalysis.cpp DomPrinter.cpp DominanceFrontier.cpp + EHPersonalities.cpp GlobalsModRef.cpp IVUsers.cpp InlineCost.cpp @@ -35,7 +36,6 @@ add_llvm_library(LLVMAnalysis IteratedDominanceFrontier.cpp LazyCallGraph.cpp LazyValueInfo.cpp - LibCallSemantics.cpp Lint.cpp Loads.cpp LoopAccessAnalysis.cpp diff --git a/lib/Analysis/LibCallSemantics.cpp b/lib/Analysis/EHPersonalities.cpp similarity index 83% rename from lib/Analysis/LibCallSemantics.cpp rename to lib/Analysis/EHPersonalities.cpp index b91ff20aee25..1d1b5fe11f67 100644 --- a/lib/Analysis/LibCallSemantics.cpp +++ b/lib/Analysis/EHPersonalities.cpp @@ -1,4 +1,4 @@ -//===- LibCallSemantics.cpp - Describe library semantics ------------------===// +//===- EHPersonalities.cpp - Compute EH-related information ---------------===// // // The LLVM Compiler Infrastructure // @@ -6,14 +6,8 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// -// -// This file implements interfaces that can be used to describe language -// specific runtime library interfaces (e.g. libc, libm, etc) to LLVM -// optimizers. 
-// -//===----------------------------------------------------------------------===// -#include "llvm/Analysis/LibCallSemantics.h" +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/IR/Function.h" using namespace llvm; diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp index 23daeb67d653..9a0570d47f02 100644 --- a/lib/Analysis/ScalarEvolution.cpp +++ b/lib/Analysis/ScalarEvolution.cpp @@ -7964,8 +7964,7 @@ static bool IsMaxConsistingOf(const SCEV *MaybeMaxExpr, const MaxExprType *MaxExpr = dyn_cast(MaybeMaxExpr); if (!MaxExpr) return false; - auto It = std::find(MaxExpr->op_begin(), MaxExpr->op_end(), Candidate); - return It != MaxExpr->op_end(); + return find(MaxExpr->operands(), Candidate) != MaxExpr->op_end(); } @@ -8403,8 +8402,7 @@ const SCEV *SCEVAddRecExpr::getNumIterationsInRange(ConstantRange Range, // The only time we can solve this is when we have all constant indices. // Otherwise, we cannot determine the overflow conditions. - if (std::any_of(op_begin(), op_end(), - [](const SCEV *Op) { return !isa(Op);})) + if (any_of(operands(), [](const SCEV *Op) { return !isa(Op); })) return SE.getCouldNotCompute(); // Okay at this point we know that all elements of the chrec are constants and @@ -9645,6 +9643,8 @@ SCEVPredicate::SCEVPredicate(const FoldingSetNodeIDRef ID, SCEVPredicateKind Kind) : FastID(ID), Kind(Kind) {} +SCEVPredicate::~SCEVPredicate() {} + SCEVEqualPredicate::SCEVEqualPredicate(const FoldingSetNodeIDRef ID, const SCEVUnknown *LHS, const SCEVConstant *RHS) @@ -9694,8 +9694,8 @@ bool SCEVUnionPredicate::implies(const SCEVPredicate *N) const { return false; auto &SCEVPreds = ScevPredsIt->second; - return std::any_of(SCEVPreds.begin(), SCEVPreds.end(), - [N](const SCEVPredicate *I) { return I->implies(N); }); + return any_of(SCEVPreds, + [N](const SCEVPredicate *I) { return I->implies(N); }); } const SCEV *SCEVUnionPredicate::getExpr() const { return nullptr; } diff --git a/lib/Analysis/VectorUtils.cpp b/lib/Analysis/VectorUtils.cpp index fbf387b3ee20..5fb517e8edb5 100644 --- a/lib/Analysis/VectorUtils.cpp +++ b/lib/Analysis/VectorUtils.cpp @@ -417,9 +417,11 @@ Value *llvm::findScalarElement(Value *V, unsigned EltNo) { /// the input value is (1) a splat constants vector or (2) a sequence /// of instructions that broadcast a single value into a vector. /// -llvm::Value *llvm::getSplatValue(Value *V) { - if (auto *CV = dyn_cast(V)) - return CV->getSplatValue(); +const llvm::Value *llvm::getSplatValue(const Value *V) { + + if (auto *C = dyn_cast(V)) + if (isa(V->getType())) + return C->getSplatValue(); auto *ShuffleInst = dyn_cast(V); if (!ShuffleInst) diff --git a/lib/Bitcode/Reader/BitcodeReader.cpp b/lib/Bitcode/Reader/BitcodeReader.cpp index 11c9b131da70..e95aba771b9c 100644 --- a/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/lib/Bitcode/Reader/BitcodeReader.cpp @@ -5991,12 +5991,11 @@ llvm::getBitcodeProducerString(MemoryBufferRef Buffer, LLVMContext &Context, ErrorOr> llvm::getFunctionInfoIndex(MemoryBufferRef Buffer, DiagnosticHandlerFunction DiagnosticHandler, - const Module *ExportingModule, bool IsLazy) { + bool IsLazy) { std::unique_ptr Buf = MemoryBuffer::getMemBuffer(Buffer, false); FunctionIndexBitcodeReader R(Buf.get(), DiagnosticHandler, IsLazy); - std::unique_ptr Index = - llvm::make_unique(ExportingModule); + auto Index = llvm::make_unique(); auto cleanupOnError = [&](std::error_code EC) { R.releaseBuffer(); // Never take ownership on error. 
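The ScalarEvolution changes above rely on the range-based any_of/find wrappers added to STLExtras.h earlier in this patch. A self-contained sketch of how they read in isolation; the functions and data below are illustrative only:

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
using namespace llvm;

// any_of over the whole range, without spelling out begin()/end().
bool containsNegative(const SmallVectorImpl<int> &Values) {
  return any_of(Values, [](int V) { return V < 0; });
}

// find returns the range's own iterator type, so it composes with end().
bool containsValue(const SmallVectorImpl<int> &Values, int Target) {
  return find(Values, Target) != Values.end();
}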
diff --git a/lib/CodeGen/AggressiveAntiDepBreaker.cpp b/lib/CodeGen/AggressiveAntiDepBreaker.cpp index 845408f15f5c..4060db74a9b7 100644 --- a/lib/CodeGen/AggressiveAntiDepBreaker.cpp +++ b/lib/CodeGen/AggressiveAntiDepBreaker.cpp @@ -364,9 +364,11 @@ void AggressiveAntiDepBreaker::PrescanInstruction(MachineInstr *MI, // If MI's defs have a special allocation requirement, don't allow // any def registers to be changed. Also assume all registers - // defined in a call must not be changed (ABI). + // defined in a call must not be changed (ABI). Inline assembly may + // reference either system calls or the register directly. Skip it until we + // can tell user specified registers from compiler-specified. if (MI->isCall() || MI->hasExtraDefRegAllocReq() || - TII->isPredicated(MI)) { + TII->isPredicated(MI) || MI->isInlineAsm()) { DEBUG(if (State->GetGroup(Reg) != 0) dbgs() << "->g0(alloc-req)"); State->UnionGroups(Reg, 0); } @@ -428,6 +430,7 @@ void AggressiveAntiDepBreaker::ScanInstruction(MachineInstr *MI, // If MI's uses have special allocation requirement, don't allow // any use registers to be changed. Also assume all registers // used in a call must not be changed (ABI). + // Inline Assembly register uses also cannot be safely changed. // FIXME: The issue with predicated instruction is more complex. We are being // conservatively here because the kill markers cannot be trusted after // if-conversion: @@ -443,7 +446,7 @@ void AggressiveAntiDepBreaker::ScanInstruction(MachineInstr *MI, // changed. bool Special = MI->isCall() || MI->hasExtraSrcRegAllocReq() || - TII->isPredicated(MI); + TII->isPredicated(MI) || MI->isInlineAsm(); // Scan the register uses for this instruction and update // live-ranges, groups and RegRefs. diff --git a/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.cpp b/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.cpp index 8b1bea8049e4..c2c0f84e5c92 100644 --- a/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.cpp +++ b/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.cpp @@ -27,15 +27,15 @@ StringRef WinCodeViewLineTables::getFullFilepath(const MDNode *S) { auto *Scope = cast(S); StringRef Dir = Scope->getDirectory(), Filename = Scope->getFilename(); - char *&Result = DirAndFilenameToFilepathMap[std::make_pair(Dir, Filename)]; - if (Result) - return Result; + std::string &Filepath = + DirAndFilenameToFilepathMap[std::make_pair(Dir, Filename)]; + if (!Filepath.empty()) + return Filepath; // Clang emits directory and relative filename info into the IR, but CodeView // operates on full paths. We could change Clang to emit full paths too, but // that would increase the IR size and probably not needed for other users. // For now, just concatenate and canonicalize the path here. 
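The WinCodeViewLineTables change that starts above replaces the strdup'd char* cache entries with std::string values owned by the map, which is why the manual free() loop in the destructor can be dropped in the header hunk that follows. A self-contained sketch of the ownership pattern (the canonicalization step is simplified to a plain join):

    #include <iostream>
    #include <map>
    #include <string>
    #include <utility>

    // The cache maps (directory, filename) to a std::string it owns, so a
    // reference to the cached entry can be handed out with no strdup()/free()
    // pair and no custom destructor.
    std::map<std::pair<std::string, std::string>, std::string> FilepathCache;

    const std::string &getFullFilepath(const std::string &Dir,
                                       const std::string &Filename) {
      std::string &Filepath = FilepathCache[{Dir, Filename}];
      if (!Filepath.empty())
        return Filepath; // already computed and cached
      // Simplified canonicalization: just join the two components.
      Filepath = Dir + "\\" + Filename;
      return Filepath;
    }

    int main() {
      std::cout << getFullFilepath("c:\\src", "a.cpp") << "\n";
      std::cout << getFullFilepath("c:\\src", "a.cpp") << "\n"; // cache hit
    }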
- std::string Filepath; if (Filename.find(':') == 1) Filepath = Filename; else @@ -74,8 +74,7 @@ StringRef WinCodeViewLineTables::getFullFilepath(const MDNode *S) { while ((Cursor = Filepath.find("\\\\", Cursor)) != std::string::npos) Filepath.erase(Cursor, 1); - Result = strdup(Filepath.c_str()); - return StringRef(Result); + return Filepath; } void WinCodeViewLineTables::maybeRecordLocation(DebugLoc DL, diff --git a/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.h b/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.h index 43d1a432712e..78068e07c16f 100644 --- a/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.h +++ b/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.h @@ -98,7 +98,7 @@ class LLVM_LIBRARY_VISIBILITY WinCodeViewLineTables : public AsmPrinterHandler { } } FileNameRegistry; - typedef std::map, char *> + typedef std::map, std::string> DirAndFilenameToFilepathMapTy; DirAndFilenameToFilepathMapTy DirAndFilenameToFilepathMap; StringRef getFullFilepath(const MDNode *S); @@ -116,14 +116,6 @@ class LLVM_LIBRARY_VISIBILITY WinCodeViewLineTables : public AsmPrinterHandler { public: WinCodeViewLineTables(AsmPrinter *Asm); - ~WinCodeViewLineTables() override { - for (DirAndFilenameToFilepathMapTy::iterator - I = DirAndFilenameToFilepathMap.begin(), - E = DirAndFilenameToFilepathMap.end(); - I != E; ++I) - free(I->second); - } - void setSymbolSize(const llvm::MCSymbol *, uint64_t) override {} /// \brief Emit the COFF section that holds the line table information. diff --git a/lib/CodeGen/AtomicExpandPass.cpp b/lib/CodeGen/AtomicExpandPass.cpp index e7998db4a7c1..e4b7c5a62780 100644 --- a/lib/CodeGen/AtomicExpandPass.cpp +++ b/lib/CodeGen/AtomicExpandPass.cpp @@ -51,7 +51,9 @@ namespace { bool expandAtomicLoadToCmpXchg(LoadInst *LI); bool expandAtomicStore(StoreInst *SI); bool tryExpandAtomicRMW(AtomicRMWInst *AI); - bool expandAtomicRMWToLLSC(AtomicRMWInst *AI); + bool expandAtomicOpToLLSC( + Instruction *I, Value *Addr, AtomicOrdering MemOpOrder, + std::function &, Value *)> PerformOp); bool expandAtomicCmpXchg(AtomicCmpXchgInst *CI); bool isIdempotentRMW(AtomicRMWInst *AI); bool simplifyIdempotentRMW(AtomicRMWInst *AI); @@ -174,13 +176,15 @@ bool AtomicExpand::tryExpandAtomicLoad(LoadInst *LI) { switch (TLI->shouldExpandAtomicLoadInIR(LI)) { case TargetLoweringBase::AtomicExpansionKind::None: return false; - case TargetLoweringBase::AtomicExpansionKind::LLSC: { + case TargetLoweringBase::AtomicExpansionKind::LLSC: + return expandAtomicOpToLLSC( + LI, LI->getPointerOperand(), LI->getOrdering(), + [](IRBuilder<> &Builder, Value *Loaded) { return Loaded; }); + case TargetLoweringBase::AtomicExpansionKind::LLOnly: return expandAtomicLoadToLL(LI); - } - case TargetLoweringBase::AtomicExpansionKind::CmpXChg: { + case TargetLoweringBase::AtomicExpansionKind::CmpXChg: return expandAtomicLoadToCmpXchg(LI); } - } llvm_unreachable("Unhandled case in tryExpandAtomicLoad"); } @@ -192,6 +196,7 @@ bool AtomicExpand::expandAtomicLoadToLL(LoadInst *LI) { // to be single-copy atomic by ARM is an ldrexd (A3.5.3). 
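The AtomicExpandPass refactoring above replaces the RMW-specific expandAtomicRMWToLLSC with expandAtomicOpToLLSC, which takes a PerformOp callback so that atomic loads and atomicrmw share one load-linked/store-conditional loop. A conceptual model in portable C++, with compare_exchange emulating the LL/SC loop (real targets emit ldrex/strex-style instructions, and the real code builds IR rather than executing the loop):

    #include <atomic>
    #include <functional>
    #include <iostream>

    // The loop structure is shared; only the operation applied to the loaded
    // value differs, and it is supplied as a callback.
    int expandAtomicOpToLLSC(std::atomic<int> &Addr,
                             const std::function<int(int)> &PerformOp) {
      int Loaded = Addr.load();
      // On failure, compare_exchange_weak reloads Loaded, mirroring the
      // "store-conditional failed, branch back to the loop header" edge.
      while (!Addr.compare_exchange_weak(Loaded, PerformOp(Loaded))) {
      }
      return Loaded; // the instruction's result is the previously loaded value
    }

    int main() {
      std::atomic<int> X{5};
      // An atomic load is the identity callback ...
      std::cout << expandAtomicOpToLLSC(X, [](int L) { return L; }) << "\n";
      // ... while an atomicrmw add supplies the arithmetic.
      std::cout << expandAtomicOpToLLSC(X, [](int L) { return L + 3; }) << "\n";
      std::cout << X.load() << "\n"; // 8
    }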
Value *Val = TLI->emitLoadLinked(Builder, LI->getPointerOperand(), LI->getOrdering()); + TLI->emitAtomicCmpXchgNoStoreLLBalance(Builder); LI->replaceAllUsesWith(Val); LI->eraseFromParent(); @@ -245,20 +250,6 @@ static void createCmpXchgInstFun(IRBuilder<> &Builder, Value *Addr, NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded"); } -bool AtomicExpand::tryExpandAtomicRMW(AtomicRMWInst *AI) { - switch (TLI->shouldExpandAtomicRMWInIR(AI)) { - case TargetLoweringBase::AtomicExpansionKind::None: - return false; - case TargetLoweringBase::AtomicExpansionKind::LLSC: { - return expandAtomicRMWToLLSC(AI); - } - case TargetLoweringBase::AtomicExpansionKind::CmpXChg: { - return expandAtomicRMWToCmpXchg(AI, createCmpXchgInstFun); - } - } - llvm_unreachable("Unhandled case in tryExpandAtomicRMW"); -} - /// Emit IR to implement the given atomicrmw operation on values in registers, /// returning the new value. static Value *performAtomicOp(AtomicRMWInst::BinOp Op, IRBuilder<> &Builder, @@ -296,10 +287,28 @@ static Value *performAtomicOp(AtomicRMWInst::BinOp Op, IRBuilder<> &Builder, } } -bool AtomicExpand::expandAtomicRMWToLLSC(AtomicRMWInst *AI) { - AtomicOrdering MemOpOrder = AI->getOrdering(); - Value *Addr = AI->getPointerOperand(); - BasicBlock *BB = AI->getParent(); +bool AtomicExpand::tryExpandAtomicRMW(AtomicRMWInst *AI) { + switch (TLI->shouldExpandAtomicRMWInIR(AI)) { + case TargetLoweringBase::AtomicExpansionKind::None: + return false; + case TargetLoweringBase::AtomicExpansionKind::LLSC: + return expandAtomicOpToLLSC(AI, AI->getPointerOperand(), AI->getOrdering(), + [&](IRBuilder<> &Builder, Value *Loaded) { + return performAtomicOp(AI->getOperation(), + Builder, Loaded, + AI->getValOperand()); + }); + case TargetLoweringBase::AtomicExpansionKind::CmpXChg: + return expandAtomicRMWToCmpXchg(AI, createCmpXchgInstFun); + default: + llvm_unreachable("Unhandled case in tryExpandAtomicRMW"); + } +} + +bool AtomicExpand::expandAtomicOpToLLSC( + Instruction *I, Value *Addr, AtomicOrdering MemOpOrder, + std::function &, Value *)> PerformOp) { + BasicBlock *BB = I->getParent(); Function *F = BB->getParent(); LLVMContext &Ctx = F->getContext(); @@ -317,11 +326,11 @@ bool AtomicExpand::expandAtomicRMWToLLSC(AtomicRMWInst *AI) { // atomicrmw.end: // fence? // [...] - BasicBlock *ExitBB = BB->splitBasicBlock(AI->getIterator(), "atomicrmw.end"); + BasicBlock *ExitBB = BB->splitBasicBlock(I->getIterator(), "atomicrmw.end"); BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB); - // This grabs the DebugLoc from AI. - IRBuilder<> Builder(AI); + // This grabs the DebugLoc from I. + IRBuilder<> Builder(I); // The split call above "helpfully" added a branch at the end of BB (to the // wrong place), but we might want a fence too. 
It's easiest to just remove @@ -334,8 +343,7 @@ bool AtomicExpand::expandAtomicRMWToLLSC(AtomicRMWInst *AI) { Builder.SetInsertPoint(LoopBB); Value *Loaded = TLI->emitLoadLinked(Builder, Addr, MemOpOrder); - Value *NewVal = - performAtomicOp(AI->getOperation(), Builder, Loaded, AI->getValOperand()); + Value *NewVal = PerformOp(Builder, Loaded); Value *StoreSuccess = TLI->emitStoreConditional(Builder, NewVal, Addr, MemOpOrder); @@ -345,8 +353,8 @@ bool AtomicExpand::expandAtomicRMWToLLSC(AtomicRMWInst *AI) { Builder.SetInsertPoint(ExitBB, ExitBB->begin()); - AI->replaceAllUsesWith(Loaded); - AI->eraseFromParent(); + I->replaceAllUsesWith(Loaded); + I->eraseFromParent(); return true; } diff --git a/lib/CodeGen/BranchFolding.cpp b/lib/CodeGen/BranchFolding.cpp index 0b2495cc996e..54d92ad67a97 100644 --- a/lib/CodeGen/BranchFolding.cpp +++ b/lib/CodeGen/BranchFolding.cpp @@ -1099,13 +1099,19 @@ void BranchFolder::setCommonTailEdgeWeights(MachineBasicBlock &TailMBB) { if (TailMBB.succ_size() <= 1) return; - auto MaxEdgeFreq = *std::max_element(EdgeFreqLs.begin(), EdgeFreqLs.end()); - uint64_t Scale = MaxEdgeFreq.getFrequency() / UINT32_MAX + 1; + auto SumEdgeFreq = + std::accumulate(EdgeFreqLs.begin(), EdgeFreqLs.end(), BlockFrequency(0)) + .getFrequency(); auto EdgeFreq = EdgeFreqLs.begin(); - for (auto SuccI = TailMBB.succ_begin(), SuccE = TailMBB.succ_end(); - SuccI != SuccE; ++SuccI, ++EdgeFreq) - TailMBB.setSuccWeight(SuccI, EdgeFreq->getFrequency() / Scale); + if (SumEdgeFreq > 0) { + for (auto SuccI = TailMBB.succ_begin(), SuccE = TailMBB.succ_end(); + SuccI != SuccE; ++SuccI, ++EdgeFreq) { + auto Prob = BranchProbability::getBranchProbability( + EdgeFreq->getFrequency(), SumEdgeFreq); + TailMBB.setSuccProbability(SuccI, Prob); + } + } } //===----------------------------------------------------------------------===// diff --git a/lib/CodeGen/DwarfEHPrepare.cpp b/lib/CodeGen/DwarfEHPrepare.cpp index 0f6e1463f10f..eae78a950d9a 100644 --- a/lib/CodeGen/DwarfEHPrepare.cpp +++ b/lib/CodeGen/DwarfEHPrepare.cpp @@ -16,7 +16,7 @@ #include "llvm/ADT/BitVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/CFG.h" -#include "llvm/Analysis/LibCallSemantics.h" +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" diff --git a/lib/CodeGen/IfConversion.cpp b/lib/CodeGen/IfConversion.cpp index 0b2f3ea165f8..e90cb02bd280 100644 --- a/lib/CodeGen/IfConversion.cpp +++ b/lib/CodeGen/IfConversion.cpp @@ -32,6 +32,7 @@ #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" +#include using namespace llvm; @@ -1151,28 +1152,6 @@ bool IfConverter::IfConvertSimple(BBInfo &BBI, IfcvtKind Kind) { return true; } -/// Scale down weights to fit into uint32_t. NewTrue is the new weight -/// for successor TrueBB, and NewFalse is the new weight for successor -/// FalseBB. -static void ScaleWeights(uint64_t NewTrue, uint64_t NewFalse, - MachineBasicBlock *MBB, - const MachineBasicBlock *TrueBB, - const MachineBasicBlock *FalseBB, - const MachineBranchProbabilityInfo *MBPI) { - uint64_t NewMax = (NewTrue > NewFalse) ? 
NewTrue : NewFalse; - uint32_t Scale = (NewMax / UINT32_MAX) + 1; - for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(), - SE = MBB->succ_end(); - SI != SE; ++SI) { - if (*SI == TrueBB) - MBB->setSuccWeight(SI, (uint32_t)(NewTrue / Scale)); - else if (*SI == FalseBB) - MBB->setSuccWeight(SI, (uint32_t)(NewFalse / Scale)); - else - MBB->setSuccWeight(SI, MBPI->getEdgeWeight(MBB, SI) / Scale); - } -} - /// IfConvertTriangle - If convert a triangle sub-CFG. /// bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) { @@ -1229,16 +1208,14 @@ bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) { DontKill.clear(); bool HasEarlyExit = CvtBBI->FalseBB != nullptr; - uint64_t CvtNext = 0, CvtFalse = 0, BBNext = 0, BBCvt = 0, SumWeight = 0; - uint32_t WeightScale = 0; + BranchProbability CvtNext, CvtFalse, BBNext, BBCvt; if (HasEarlyExit) { - // Get weights before modifying CvtBBI->BB and BBI.BB. - CvtNext = MBPI->getEdgeWeight(CvtBBI->BB, NextBBI->BB); - CvtFalse = MBPI->getEdgeWeight(CvtBBI->BB, CvtBBI->FalseBB); - BBNext = MBPI->getEdgeWeight(BBI.BB, NextBBI->BB); - BBCvt = MBPI->getEdgeWeight(BBI.BB, CvtBBI->BB); - SumWeight = MBPI->getSumForBlock(CvtBBI->BB, WeightScale); + // Get probabilities before modifying CvtBBI->BB and BBI.BB. + CvtNext = MBPI->getEdgeProbability(CvtBBI->BB, NextBBI->BB); + CvtFalse = MBPI->getEdgeProbability(CvtBBI->BB, CvtBBI->FalseBB); + BBNext = MBPI->getEdgeProbability(BBI.BB, NextBBI->BB); + BBCvt = MBPI->getEdgeProbability(BBI.BB, CvtBBI->BB); } if (CvtBBI->BB->pred_size() > 1) { @@ -1266,22 +1243,23 @@ bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) { CvtBBI->BrCond.end()); if (TII->ReverseBranchCondition(RevCond)) llvm_unreachable("Unable to reverse branch condition!"); + + // Update the edge probability for both CvtBBI->FalseBB and NextBBI. + // NewNext = New_Prob(BBI.BB, NextBBI->BB) = + // Prob(BBI.BB, NextBBI->BB) + + // Prob(BBI.BB, CvtBBI->BB) * Prob(CvtBBI->BB, NextBBI->BB) + // NewFalse = New_Prob(BBI.BB, CvtBBI->FalseBB) = + // Prob(BBI.BB, CvtBBI->BB) * Prob(CvtBBI->BB, CvtBBI->FalseBB) + auto NewTrueBB = getNextBlock(BBI.BB); + auto NewNext = BBNext + BBCvt * CvtNext; + auto NewTrueBBIter = + std::find(BBI.BB->succ_begin(), BBI.BB->succ_end(), NewTrueBB); + if (NewTrueBBIter != BBI.BB->succ_end()) + BBI.BB->setSuccProbability(NewTrueBBIter, NewNext); + + auto NewFalse = BBCvt * CvtFalse; TII->InsertBranch(*BBI.BB, CvtBBI->FalseBB, nullptr, RevCond, dl); - BBI.BB->addSuccessor(CvtBBI->FalseBB); - // Update the edge weight for both CvtBBI->FalseBB and NextBBI. - // New_Weight(BBI.BB, NextBBI->BB) = - // Weight(BBI.BB, NextBBI->BB) * getSumForBlock(CvtBBI->BB) + - // Weight(BBI.BB, CvtBBI->BB) * Weight(CvtBBI->BB, NextBBI->BB) - // New_Weight(BBI.BB, CvtBBI->FalseBB) = - // Weight(BBI.BB, CvtBBI->BB) * Weight(CvtBBI->BB, CvtBBI->FalseBB) - - uint64_t NewNext = BBNext * SumWeight + (BBCvt * CvtNext) / WeightScale; - uint64_t NewFalse = (BBCvt * CvtFalse) / WeightScale; - // We need to scale down all weights of BBI.BB to fit uint32_t. - // Here BBI.BB is connected to CvtBBI->FalseBB and will fall through to - // the next block. 
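The IfConvertTriangle hunk above switches from scaled integer weights to BranchProbability arithmetic. A numeric sketch of the update, with doubles standing in for BranchProbability:

    #include <cassert>
    #include <iostream>

    // Once CvtBBI is merged into BBI, the edge to NextBBI absorbs the path
    // through CvtBBI, and the early-exit edge keeps only the path that went
    // through CvtBBI:
    //   NewNext  = P(BB->Next) + P(BB->Cvt) * P(Cvt->Next)
    //   NewFalse = P(BB->Cvt) * P(Cvt->False)
    int main() {
      double BBNext = 0.4, BBCvt = 0.6;     // BB's original out-edges
      double CvtNext = 0.7, CvtFalse = 0.3; // Cvt's original out-edges

      double NewNext = BBNext + BBCvt * CvtNext;
      double NewFalse = BBCvt * CvtFalse;

      std::cout << "NewNext = " << NewNext << ", NewFalse = " << NewFalse << "\n";
      // The out-probabilities of the merged block still sum to one.
      assert(NewNext + NewFalse > 0.999 && NewNext + NewFalse < 1.001);
    }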
- ScaleWeights(NewNext, NewFalse, BBI.BB, getNextBlock(BBI.BB), - CvtBBI->FalseBB, MBPI); + BBI.BB->addSuccessor(CvtBBI->FalseBB, NewFalse); } // Merge in the 'false' block if the 'false' block has no other @@ -1524,7 +1502,7 @@ bool IfConverter::IfConvertDiamond(BBInfo &BBI, IfcvtKind Kind, MergeBlocks(BBI, TailBBI); TailBBI.IsDone = true; } else { - BBI.BB->addSuccessor(TailBB); + BBI.BB->addSuccessor(TailBB, BranchProbability::getOne()); InsertUncondBranch(BBI.BB, TailBB, TII); BBI.HasFallThrough = false; } @@ -1688,21 +1666,26 @@ void IfConverter::MergeBlocks(BBInfo &ToBBI, BBInfo &FromBBI, bool AddEdges) { FromBBI.BB->succ_end()); MachineBasicBlock *NBB = getNextBlock(FromBBI.BB); MachineBasicBlock *FallThrough = FromBBI.HasFallThrough ? NBB : nullptr; - - // The edge weight from ToBBI.BB to FromBBI.BB, which is only needed when + // The edge probability from ToBBI.BB to FromBBI.BB, which is only needed when // AddEdges is true and FromBBI.BB is a successor of ToBBI.BB. - uint32_t To2FromWeight = 0; - // WeightScale and SumWeight are for calculating successor probabilities of - // FromBBI.BB. - uint32_t WeightScale = 0; - uint32_t SumWeight = 0; + auto To2FromProb = BranchProbability::getZero(); if (AddEdges && ToBBI.BB->isSuccessor(FromBBI.BB)) { - To2FromWeight = MBPI->getEdgeWeight(ToBBI.BB, FromBBI.BB); - // Set the edge weight from ToBBI.BB to FromBBI.BB to zero to avoid the edge - // weight being merged to other edges when this edge is removed later. - ToBBI.BB->setSuccWeight( - std::find(ToBBI.BB->succ_begin(), ToBBI.BB->succ_end(), FromBBI.BB), 0); - SumWeight = MBPI->getSumForBlock(FromBBI.BB, WeightScale); + To2FromProb = MBPI->getEdgeProbability(ToBBI.BB, FromBBI.BB); + // Set the edge probability from ToBBI.BB to FromBBI.BB to zero to avoid the + // edge probability being merged to other edges when this edge is removed + // later. + ToBBI.BB->setSuccProbability( + std::find(ToBBI.BB->succ_begin(), ToBBI.BB->succ_end(), FromBBI.BB), + BranchProbability::getZero()); + } + + if (AddEdges && ToBBI.BB->isSuccessor(FromBBI.BB)) { + // Set the edge probability from ToBBI.BB to FromBBI.BB to zero to avoid the + // edge probability being merged to other edges when this edge is removed + // later. + ToBBI.BB->setSuccProbability( + std::find(ToBBI.BB->succ_begin(), ToBBI.BB->succ_end(), FromBBI.BB), + BranchProbability::getZero()); } for (unsigned i = 0, e = FromSuccs.size(); i != e; ++i) { @@ -1711,39 +1694,38 @@ void IfConverter::MergeBlocks(BBInfo &ToBBI, BBInfo &FromBBI, bool AddEdges) { if (Succ == FallThrough) continue; - uint32_t NewWeight = 0; + auto NewProb = BranchProbability::getZero(); if (AddEdges) { - // Calculate the edge weight for the edge from ToBBI.BB to Succ, which is - // a portion of the edge weight from FromBBI.BB to Succ. The portion ratio - // is the edge probability from ToBBI.BB to FromBBI.BB (if FromBBI is a - // successor of ToBBI.BB. See comment below for excepion). - NewWeight = MBPI->getEdgeWeight(FromBBI.BB, Succ); + // Calculate the edge probability for the edge from ToBBI.BB to Succ, + // which is a portion of the edge probability from FromBBI.BB to Succ. The + // portion ratio is the edge probability from ToBBI.BB to FromBBI.BB (if + // FromBBI is a successor of ToBBI.BB. See comment below for excepion). + NewProb = MBPI->getEdgeProbability(FromBBI.BB, Succ); - // To2FromWeight is 0 when FromBBI.BB is not a successor of ToBBI.BB. This + // To2FromProb is 0 when FromBBI.BB is not a successor of ToBBI.BB. 
This // only happens when if-converting a diamond CFG and FromBBI.BB is the // tail BB. In this case FromBBI.BB post-dominates ToBBI.BB and hence we - // could just use the weights on FromBBI.BB's out-edges when adding new - // successors. - if (To2FromWeight > 0) { - BranchProbability Prob(NewWeight / WeightScale, SumWeight); - NewWeight = Prob.scale(To2FromWeight); - } + // could just use the probabilities on FromBBI.BB's out-edges when adding + // new successors. + if (!To2FromProb.isZero()) + NewProb *= To2FromProb; } FromBBI.BB->removeSuccessor(Succ); if (AddEdges) { - // If the edge from ToBBI.BB to Succ already exists, update the weight of - // this edge by adding NewWeight to it. An example is shown below, in - // which A is ToBBI.BB and B is FromBBI.BB. In this case we don't have to - // set C as A's successor as it already is. We only need to update the - // edge weight on A->C. Note that B will not be immediately removed from - // A's successors. It is possible that B->D is not removed either if D is - // a fallthrough of B. Later the edge A->D (generated here) and B->D will - // be combined into one edge. To maintain correct edge weight of this - // combined edge, we need to set the edge weight of A->B to zero, which is - // already done above. The edge weight on A->D is calculated by scaling - // the original weight on A->B by the probability of B->D. + // If the edge from ToBBI.BB to Succ already exists, update the + // probability of this edge by adding NewWeight to it. An example is shown + // below, in which A is ToBBI.BB and B is FromBBI.BB. In this case we + // don't have to set C as A's successor as it already is. We only need to + // update the edge probability on A->C. Note that B will not be + // immediately removed from A's successors. It is possible that B->D is + // not removed either if D is a fallthrough of B. Later the edge A->D + // (generated here) and B->D will be combined into one edge. To maintain + // correct edge probability of this combined edge, we need to set the edge + // probability of A->B to zero, which is already done above. The edge + // probability on A->D is calculated by scaling the original probability + // on A->B by the probability of B->D. // // Before ifcvt: After ifcvt (assume B->D is kept): // @@ -1755,11 +1737,11 @@ void IfConverter::MergeBlocks(BBInfo &ToBBI, BBInfo &FromBBI, bool AddEdges) { // C D C D // if (ToBBI.BB->isSuccessor(Succ)) - ToBBI.BB->setSuccWeight( + ToBBI.BB->setSuccProbability( std::find(ToBBI.BB->succ_begin(), ToBBI.BB->succ_end(), Succ), - MBPI->getEdgeWeight(ToBBI.BB, Succ) + NewWeight); + MBPI->getEdgeProbability(ToBBI.BB, Succ) + NewProb); else - ToBBI.BB->addSuccessor(Succ, NewWeight); + ToBBI.BB->addSuccessor(Succ, NewProb); } } diff --git a/lib/CodeGen/IntrinsicLowering.cpp b/lib/CodeGen/IntrinsicLowering.cpp index 5b895fff5c43..47a9f64e9080 100644 --- a/lib/CodeGen/IntrinsicLowering.cpp +++ b/lib/CodeGen/IntrinsicLowering.cpp @@ -424,6 +424,13 @@ void IntrinsicLowering::LowerIntrinsicCall(CallInst *CI) { break; } + case Intrinsic::get_dynamic_area_offset: + errs() << "WARNING: this target does not support the custom llvm.get." + "dynamic.area.offset. It is being lowered to a constant 0\n"; + // Just lower it to a constant 0 because for most targets + // @llvm.get.dynamic.area.offset is lowered to zero. + CI->replaceAllUsesWith(ConstantInt::get(CI->getType(), 0)); + break; case Intrinsic::returnaddress: case Intrinsic::frameaddress: errs() << "WARNING: this target does not support the llvm." 
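The MergeBlocks hunk above re-attributes FromBBI.BB's out-edges to ToBBI.BB by scaling each probability with To2FromProb and summing with any edge that already exists. A numeric sketch with doubles standing in for BranchProbability:

    #include <cassert>

    // When FromBB is folded into ToBB, each edge FromBB->Succ is scaled by the
    // probability that ToBB reached FromBB (To2FromProb). If ToBB already had
    // an edge to Succ, the two contributions are added together.
    int main() {
      double To2FromProb = 0.5;    // P(ToBB -> FromBB) before the merge
      double FromToSucc = 0.6;     // P(FromBB -> Succ)
      double ExistingToSucc = 0.2; // P(ToBB -> Succ) that already exists

      double NewProb = FromToSucc * To2FromProb; // portion re-attributed to ToBB
      double Merged = ExistingToSucc + NewProb;  // combined edge ToBB -> Succ

      assert(Merged > 0.499 && Merged < 0.501);
    }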
diff --git a/lib/CodeGen/LLVMTargetMachine.cpp b/lib/CodeGen/LLVMTargetMachine.cpp index 5b8c8258b285..da24cb17918b 100644 --- a/lib/CodeGen/LLVMTargetMachine.cpp +++ b/lib/CodeGen/LLVMTargetMachine.cpp @@ -125,9 +125,10 @@ addPassesToGenerateCode(LLVMTargetMachine *TM, PassManagerBase &PM, PM.add(new MachineFunctionAnalysis(*TM, MFInitializer)); // Enable FastISel with -fast, but allow that to be overridden. + TM->setO0WantsFastISel(EnableFastISelOption != cl::BOU_FALSE); if (EnableFastISelOption == cl::BOU_TRUE || (TM->getOptLevel() == CodeGenOpt::None && - EnableFastISelOption != cl::BOU_FALSE)) + TM->getO0WantsFastISel())) TM->setFastISel(true); // Ask the target for an isel. diff --git a/lib/CodeGen/MIRParser/MIParser.cpp b/lib/CodeGen/MIRParser/MIParser.cpp index 5a8e96df7603..c9c2d62cec30 100644 --- a/lib/CodeGen/MIRParser/MIParser.cpp +++ b/lib/CodeGen/MIRParser/MIParser.cpp @@ -459,8 +459,9 @@ bool MIParser::parseBasicBlockSuccessors(MachineBasicBlock &MBB) { if (expectAndConsume(MIToken::rparen)) return true; } - MBB.addSuccessor(SuccMBB, Weight); + MBB.addSuccessor(SuccMBB, BranchProbability::getRaw(Weight)); } while (consumeIfPresent(MIToken::comma)); + MBB.normalizeSuccProbs(); return false; } diff --git a/lib/CodeGen/MIRPrinter.cpp b/lib/CodeGen/MIRPrinter.cpp index 0be7807064fb..175cb0d51437 100644 --- a/lib/CodeGen/MIRPrinter.cpp +++ b/lib/CodeGen/MIRPrinter.cpp @@ -461,8 +461,8 @@ void MIPrinter::print(const MachineBasicBlock &MBB) { if (I != MBB.succ_begin()) OS << ", "; printMBBReference(**I); - if (MBB.hasSuccessorWeights()) - OS << '(' << MBB.getSuccWeight(I) << ')'; + if (MBB.hasSuccessorProbabilities()) + OS << '(' << MBB.getSuccProbability(I) << ')'; } OS << "\n"; HasLineAttributes = true; diff --git a/lib/CodeGen/MachineBasicBlock.cpp b/lib/CodeGen/MachineBasicBlock.cpp index 602b75182fca..de91f0db75a8 100644 --- a/lib/CodeGen/MachineBasicBlock.cpp +++ b/lib/CodeGen/MachineBasicBlock.cpp @@ -319,8 +319,8 @@ void MachineBasicBlock::print(raw_ostream &OS, ModuleSlotTracker &MST, OS << " Successors according to CFG:"; for (const_succ_iterator SI = succ_begin(), E = succ_end(); SI != E; ++SI) { OS << " BB#" << (*SI)->getNumber(); - if (!Weights.empty()) - OS << '(' << *getWeightIterator(SI) << ')'; + if (!Probs.empty()) + OS << '(' << *getProbabilityIterator(SI) << ')'; } OS << '\n'; } @@ -506,35 +506,12 @@ void MachineBasicBlock::updateTerminator() { } } -void MachineBasicBlock::addSuccessor(MachineBasicBlock *Succ, uint32_t Weight) { - // Weight list is either empty (if successor list isn't empty, this means - // disabled optimization) or has the same size as successor list. - if (!(Weights.empty() && !Successors.empty())) - Weights.push_back(Weight); - Successors.push_back(Succ); - Succ->addPredecessor(this); -} - -void MachineBasicBlock::addSuccessorWithoutWeight(MachineBasicBlock *Succ) { - // We need to make sure weight list is either empty or has the same size of - // successor list. When this function is called, we can safely delete all - // weight in the list. - Weights.clear(); - Successors.push_back(Succ); - Succ->addPredecessor(this); -} - void MachineBasicBlock::addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob) { // Probability list is either empty (if successor list isn't empty, this means // disabled optimization) or has the same size as successor list. 
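The MIRParser hunk above feeds raw successor weights in as BranchProbability::getRaw() values and then calls normalizeSuccProbs() so the stored probabilities sum to one. A minimal sketch of what that normalization amounts to, with doubles standing in for the fixed-point representation:

    #include <cstdint>
    #include <iostream>
    #include <numeric>
    #include <vector>

    // Rescale raw weights so the resulting probabilities sum to one; with no
    // usable weights, fall back to an even split.
    std::vector<double> normalize(const std::vector<uint64_t> &Weights) {
      uint64_t Sum = std::accumulate(Weights.begin(), Weights.end(), uint64_t(0));
      std::vector<double> Probs;
      for (uint64_t W : Weights)
        Probs.push_back(Sum ? double(W) / double(Sum) : 1.0 / Weights.size());
      return Probs;
    }

    int main() {
      for (double P : normalize({16, 16, 32}))
        std::cout << P << " "; // 0.25 0.25 0.5
      std::cout << "\n";
    }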
- if (!(Probs.empty() && !Successors.empty())) { + if (!(Probs.empty() && !Successors.empty())) Probs.push_back(Prob); - // FIXME: Temporarily use the numerator of the probability to represent edge - // weight. This will be removed once all weight-version interfaces in MBB - // are replaced with probability-version interfaces. - Weights.push_back(Prob.getNumerator()); - } Successors.push_back(Succ); Succ->addPredecessor(this); } @@ -544,7 +521,6 @@ void MachineBasicBlock::addSuccessorWithoutProb(MachineBasicBlock *Succ) { // of successor list. When this function is called, we can safely delete all // probability in the list. Probs.clear(); - Weights.clear(); Successors.push_back(Succ); Succ->addPredecessor(this); } @@ -558,23 +534,12 @@ MachineBasicBlock::succ_iterator MachineBasicBlock::removeSuccessor(succ_iterator I) { assert(I != Successors.end() && "Not a current successor!"); - // If Weight list is empty it means we don't use it (disabled optimization). - if (!Weights.empty()) { - weight_iterator WI = getWeightIterator(I); - Weights.erase(WI); - } - - // FIXME: Temporarily comment the following code as probabilities are now only - // used during instruction lowering, but this interface is called in later - // passes. Uncomment it once all edge weights are replaced with probabilities. -#if 0 // If probability list is empty it means we don't use it (disabled // optimization). if (!Probs.empty()) { probability_iterator WI = getProbabilityIterator(I); Probs.erase(WI); } -#endif (*I)->removePredecessor(this); return Successors.erase(I); @@ -611,17 +576,12 @@ void MachineBasicBlock::replaceSuccessor(MachineBasicBlock *Old, } // New is already a successor. - // Update its weight instead of adding a duplicate edge. - if (!Weights.empty()) - *getWeightIterator(NewI) += *getWeightIterator(OldI); - // FIXME: Temporarily comment the following code as probabilities are now only - // used during instruction lowering, but this interface is called in later - // passes. Uncomment it once all edge weights are replaced with probabilities. -#if 0 // Update its probability instead of adding a duplicate edge. - if (!Probs.empty()) - *getProbabilityIterator(NewI) += *getProbabilityIterator(OldI); -#endif + if (!Probs.empty()) { + auto ProbIter = getProbabilityIterator(NewI); + if (!ProbIter->isUnknown()) + *ProbIter += *getProbabilityIterator(OldI); + } removeSuccessor(OldI); } @@ -641,13 +601,14 @@ void MachineBasicBlock::transferSuccessors(MachineBasicBlock *FromMBB) { while (!FromMBB->succ_empty()) { MachineBasicBlock *Succ = *FromMBB->succ_begin(); - uint32_t Weight = 0; - // If Weight list is empty it means we don't use it (disabled optimization). - if (!FromMBB->Weights.empty()) - Weight = *FromMBB->Weights.begin(); + // If probability list is empty it means we don't use it (disabled optimization). 
+ if (!FromMBB->Probs.empty()) { + auto Prob = *FromMBB->Probs.begin(); + addSuccessor(Succ, Prob); + } else + addSuccessorWithoutProb(Succ); - addSuccessor(Succ, Weight); FromMBB->removeSuccessor(Succ); } } @@ -659,10 +620,11 @@ MachineBasicBlock::transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB) { while (!FromMBB->succ_empty()) { MachineBasicBlock *Succ = *FromMBB->succ_begin(); - uint32_t Weight = 0; - if (!FromMBB->Weights.empty()) - Weight = *FromMBB->Weights.begin(); - addSuccessor(Succ, Weight); + if (!FromMBB->Probs.empty()) { + auto Prob = *FromMBB->Probs.begin(); + addSuccessor(Succ, Prob); + } else + addSuccessorWithoutProb(Succ); FromMBB->removeSuccessor(Succ); // Fix up any PHI nodes in the successor. @@ -1146,80 +1108,51 @@ MachineBasicBlock::findDebugLoc(instr_iterator MBBI) { return DL; } -/// Return weight of the edge from this block to MBB. -uint32_t MachineBasicBlock::getSuccWeight(const_succ_iterator Succ) const { - if (Weights.empty()) - return 0; - - return *getWeightIterator(Succ); -} - -/// Return probability of the edge from this block to MBB. If probability list -/// is empty, return a default probability which is 1/N, where N is the number -/// of successors. If the probability of the given successor is unknown, then -/// sum up all known probabilities and return the complement of the sum divided -/// by the number of unknown probabilities. +/// Return probability of the edge from this block to MBB. BranchProbability MachineBasicBlock::getSuccProbability(const_succ_iterator Succ) const { if (Probs.empty()) return BranchProbability(1, succ_size()); - auto Prob = *getProbabilityIterator(Succ); - assert(!Prob.isUnknown()); - return Prob; -} - -/// Set successor weight of a given iterator. -void MachineBasicBlock::setSuccWeight(succ_iterator I, uint32_t Weight) { - if (Weights.empty()) - return; - *getWeightIterator(I) = Weight; + const auto &Prob = *getProbabilityIterator(Succ); + if (Prob.isUnknown()) { + // For unknown probabilities, collect the sum of all known ones, and evenly + // ditribute the complemental of the sum to each unknown probability. + unsigned KnownProbNum = 0; + auto Sum = BranchProbability::getZero(); + for (auto &P : Probs) { + if (!P.isUnknown()) { + Sum += P; + KnownProbNum++; + } + } + return Sum.getCompl() / (Probs.size() - KnownProbNum); + } else + return Prob; } /// Set successor probability of a given iterator. void MachineBasicBlock::setSuccProbability(succ_iterator I, BranchProbability Prob) { assert(!Prob.isUnknown()); - if (Probs.empty() || Weights.empty()) + if (Probs.empty()) return; *getProbabilityIterator(I) = Prob; - // FIXME: Temporarily use the numerator of the probability to represent edge - // weight. This will be removed once all weight-version interfaces in MBB - // are replaces with probability-version interfaces. - *getWeightIterator(I) = Prob.getNumerator(); } -/// Return wight iterator corresonding to the I successor iterator. -MachineBasicBlock::weight_iterator MachineBasicBlock:: -getWeightIterator(MachineBasicBlock::succ_iterator I) { - assert(Weights.size() == Successors.size() && "Async weight list!"); - size_t index = std::distance(Successors.begin(), I); - assert(index < Weights.size() && "Not a current successor!"); - return Weights.begin() + index; -} - -/// Return wight iterator corresonding to the I successor iterator. 
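The new getSuccProbability() above handles edges whose probability is unknown by summing the known ones and splitting the remainder evenly across the unknown edges. A numeric sketch (here -1.0 marks "unknown"; the real code uses BranchProbability::isUnknown()):

    #include <cassert>
    #include <cmath>
    #include <vector>

    double getSuccProbability(const std::vector<double> &Probs, unsigned Idx) {
      if (Probs[Idx] >= 0.0)
        return Probs[Idx]; // known: return it directly
      double Sum = 0.0;
      unsigned Known = 0;
      for (double P : Probs)
        if (P >= 0.0) {
          Sum += P;
          ++Known;
        }
      // Distribute the complement of the known sum evenly over the unknowns.
      return (1.0 - Sum) / (Probs.size() - Known);
    }

    int main() {
      std::vector<double> Probs = {0.5, -1.0, -1.0}; // one known, two unknown
      assert(std::fabs(getSuccProbability(Probs, 1) - 0.25) < 1e-9);
    }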
-MachineBasicBlock::const_weight_iterator MachineBasicBlock:: -getWeightIterator(MachineBasicBlock::const_succ_iterator I) const { - assert(Weights.size() == Successors.size() && "Async weight list!"); - const size_t index = std::distance(Successors.begin(), I); - assert(index < Weights.size() && "Not a current successor!"); - return Weights.begin() + index; -} - -/// Return probability iterator corresonding to the I successor iterator. -MachineBasicBlock::probability_iterator -MachineBasicBlock::getProbabilityIterator(MachineBasicBlock::succ_iterator I) { +/// Return probability iterator corresonding to the I successor iterator +MachineBasicBlock::const_probability_iterator +MachineBasicBlock::getProbabilityIterator( + MachineBasicBlock::const_succ_iterator I) const { assert(Probs.size() == Successors.size() && "Async probability list!"); const size_t index = std::distance(Successors.begin(), I); assert(index < Probs.size() && "Not a current successor!"); return Probs.begin() + index; } -/// Return probability iterator corresonding to the I successor iterator -MachineBasicBlock::const_probability_iterator -MachineBasicBlock::getProbabilityIterator( - MachineBasicBlock::const_succ_iterator I) const { +/// Return probability iterator corresonding to the I successor iterator. +MachineBasicBlock::probability_iterator +MachineBasicBlock::getProbabilityIterator(MachineBasicBlock::succ_iterator I) { assert(Probs.size() == Successors.size() && "Async probability list!"); const size_t index = std::distance(Successors.begin(), I); assert(index < Probs.size() && "Not a current successor!"); diff --git a/lib/CodeGen/MachineBlockPlacement.cpp b/lib/CodeGen/MachineBlockPlacement.cpp index fba33eb93d5f..fcddf346cf68 100644 --- a/lib/CodeGen/MachineBlockPlacement.cpp +++ b/lib/CodeGen/MachineBlockPlacement.cpp @@ -380,19 +380,11 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB, const BranchProbability HotProb(4, 5); // 80% MachineBasicBlock *BestSucc = nullptr; - // FIXME: Due to the performance of the probability and weight routines in - // the MBPI analysis, we manually compute probabilities using the edge - // weights. This is suboptimal as it means that the somewhat subtle - // definition of edge weight semantics is encoded here as well. We should - // improve the MBPI interface to efficiently support query patterns such as - // this. - uint32_t BestWeight = 0; - uint32_t WeightScale = 0; - uint32_t SumWeight = MBPI->getSumForBlock(BB, WeightScale); - - // Adjust sum of weights by excluding weights on edges pointing to blocks that - // is either not in BlockFilter or is already in the current chain. Consider - // the following CFG: + auto BestProb = BranchProbability::getZero(); + + // Adjust edge probabilities by excluding edges pointing to blocks that is + // either not in BlockFilter or is already in the current chain. Consider the + // following CFG: // // --->A // | / \ @@ -406,7 +398,7 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB, // HotProb). If we exclude E that is not in BlockFilter when calculating the // probability of C->D, D will be selected and we will get A C D B as the // layout of this loop. 
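The selectBestSuccessor() comment above (and the code in the following hunk) excludes filtered-out successors from the probability mass before comparing candidates. A numeric sketch of that adjustment, with doubles standing in for BranchProbability:

    #include <cassert>
    #include <cmath>

    // Successors that are not in the block filter, or are already placed in
    // the current chain, are removed from the probability mass; the remaining
    // successors are then compared against the adjusted sum.
    int main() {
      // P(A->B), P(A->C), P(A->E); assume E is filtered out.
      double ProbB = 0.3, ProbC = 0.3, ProbE = 0.4;

      double AdjustedSum = 1.0 - ProbE; // mass that is still eligible
      double AdjProbB = ProbB / AdjustedSum;
      double AdjProbC = ProbC / AdjustedSum;

      // After the adjustment, B and C split the eligible mass evenly, and the
      // "hot successor" threshold is applied to these renormalized values.
      assert(std::fabs(AdjProbB - 0.5) < 1e-9 && std::fabs(AdjProbC - 0.5) < 1e-9);
    }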
- uint32_t AdjustedSumWeight = SumWeight; + auto AdjustedSumProb = BranchProbability::getOne(); SmallVector Successors; for (MachineBasicBlock *Succ : BB->successors()) { bool SkipSucc = false; @@ -424,15 +416,20 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB, } } if (SkipSucc) - AdjustedSumWeight -= MBPI->getEdgeWeight(BB, Succ) / WeightScale; + AdjustedSumProb -= MBPI->getEdgeProbability(BB, Succ); else Successors.push_back(Succ); } DEBUG(dbgs() << "Attempting merge from: " << getBlockName(BB) << "\n"); for (MachineBasicBlock *Succ : Successors) { - uint32_t SuccWeight = MBPI->getEdgeWeight(BB, Succ); - BranchProbability SuccProb(SuccWeight / WeightScale, AdjustedSumWeight); + BranchProbability SuccProb; + uint32_t SuccProbN = MBPI->getEdgeProbability(BB, Succ).getNumerator(); + uint32_t SuccProbD = AdjustedSumProb.getNumerator(); + if (SuccProbN >= SuccProbD) + SuccProb = BranchProbability::getOne(); + else + SuccProb = BranchProbability(SuccProbN, SuccProbD); // If we outline optional branches, look whether Succ is unavoidable, i.e. // dominates all terminators of the MachineFunction. If it does, other @@ -470,7 +467,7 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB, // Make sure that a hot successor doesn't have a globally more // important predecessor. - BranchProbability RealSuccProb(SuccWeight / WeightScale, SumWeight); + auto RealSuccProb = MBPI->getEdgeProbability(BB, Succ); BlockFrequency CandidateEdgeFreq = MBFI->getBlockFreq(BB) * RealSuccProb * HotProb.getCompl(); bool BadCFGConflict = false; @@ -496,10 +493,10 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB, << " (prob)" << (SuccChain.LoopPredecessors != 0 ? " (CFG break)" : "") << "\n"); - if (BestSucc && BestWeight >= SuccWeight) + if (BestSucc && BestProb >= SuccProb) continue; BestSucc = Succ; - BestWeight = SuccWeight; + BestProb = SuccProb; } return BestSucc; } @@ -728,11 +725,6 @@ MachineBlockPlacement::findBestLoopExit(MachineFunction &F, MachineLoop &L, MachineBasicBlock *OldExitingBB = ExitingBB; BlockFrequency OldBestExitEdgeFreq = BestExitEdgeFreq; bool HasLoopingSucc = false; - // FIXME: Due to the performance of the probability and weight routines in - // the MBPI analysis, we use the internal weights and manually compute the - // probabilities to avoid quadratic behavior. - uint32_t WeightScale = 0; - uint32_t SumWeight = MBPI->getSumForBlock(MBB, WeightScale); for (MachineBasicBlock *Succ : MBB->successors()) { if (Succ->isEHPad()) continue; @@ -746,10 +738,10 @@ MachineBlockPlacement::findBestLoopExit(MachineFunction &F, MachineLoop &L, continue; } - uint32_t SuccWeight = MBPI->getEdgeWeight(MBB, Succ); + auto SuccProb = MBPI->getEdgeProbability(MBB, Succ); if (LoopBlockSet.count(Succ)) { DEBUG(dbgs() << " looping: " << getBlockName(MBB) << " -> " - << getBlockName(Succ) << " (" << SuccWeight << ")\n"); + << getBlockName(Succ) << " (" << SuccProb << ")\n"); HasLoopingSucc = true; continue; } @@ -761,7 +753,6 @@ MachineBlockPlacement::findBestLoopExit(MachineFunction &F, MachineLoop &L, BlocksExitingToOuterLoop.insert(MBB); } - BranchProbability SuccProb(SuccWeight / WeightScale, SumWeight); BlockFrequency ExitEdgeFreq = MBFI->getBlockFreq(MBB) * SuccProb; DEBUG(dbgs() << " exiting: " << getBlockName(MBB) << " -> " << getBlockName(Succ) << " [L:" << SuccLoopDepth << "] ("; @@ -904,21 +895,17 @@ void MachineBlockPlacement::rotateLoopWithProfile( // edge from the tail of the loop chain. 
SmallVector, 4> ExitsWithFreq; for (auto BB : LoopChain) { - uint32_t LargestExitEdgeWeight = 0; + auto LargestExitEdgeProb = BranchProbability::getZero(); for (auto *Succ : BB->successors()) { BlockChain *SuccChain = BlockToChain[Succ]; if (!LoopBlockSet.count(Succ) && (!SuccChain || Succ == *SuccChain->begin())) { - uint32_t SuccWeight = MBPI->getEdgeWeight(BB, Succ); - LargestExitEdgeWeight = std::max(LargestExitEdgeWeight, SuccWeight); + auto SuccProb = MBPI->getEdgeProbability(BB, Succ); + LargestExitEdgeProb = std::max(LargestExitEdgeProb, SuccProb); } } - if (LargestExitEdgeWeight > 0) { - uint32_t WeightScale = 0; - uint32_t SumWeight = MBPI->getSumForBlock(BB, WeightScale); - auto ExitFreq = - MBFI->getBlockFreq(BB) * - BranchProbability(LargestExitEdgeWeight / WeightScale, SumWeight); + if (LargestExitEdgeProb > BranchProbability::getZero()) { + auto ExitFreq = MBFI->getBlockFreq(BB) * LargestExitEdgeProb; ExitsWithFreq.emplace_back(BB, ExitFreq); } } @@ -1290,14 +1277,16 @@ void MachineBlockPlacement::buildCFGChains(MachineFunction &F) { } // If PrevBB has a two-way branch, try to re-order the branches - // such that we branch to the successor with higher weight first. + // such that we branch to the successor with higher probability first. if (TBB && !Cond.empty() && FBB && - MBPI->getEdgeWeight(PrevBB, FBB) > MBPI->getEdgeWeight(PrevBB, TBB) && + MBPI->getEdgeProbability(PrevBB, FBB) > + MBPI->getEdgeProbability(PrevBB, TBB) && !TII->ReverseBranchCondition(Cond)) { DEBUG(dbgs() << "Reverse order of the two branches: " << getBlockName(PrevBB) << "\n"); - DEBUG(dbgs() << " Edge weight: " << MBPI->getEdgeWeight(PrevBB, FBB) - << " vs " << MBPI->getEdgeWeight(PrevBB, TBB) << "\n"); + DEBUG(dbgs() << " Edge probability: " + << MBPI->getEdgeProbability(PrevBB, FBB) << " vs " + << MBPI->getEdgeProbability(PrevBB, TBB) << "\n"); DebugLoc dl; // FIXME: this is nowhere TII->RemoveBranch(*PrevBB); TII->InsertBranch(*PrevBB, FBB, TBB, Cond, dl); diff --git a/lib/CodeGen/MachineBranchProbabilityInfo.cpp b/lib/CodeGen/MachineBranchProbabilityInfo.cpp index 6fbc2be70486..5478dcba261a 100644 --- a/lib/CodeGen/MachineBranchProbabilityInfo.cpp +++ b/lib/CodeGen/MachineBranchProbabilityInfo.cpp @@ -28,91 +28,61 @@ char MachineBranchProbabilityInfo::ID = 0; void MachineBranchProbabilityInfo::anchor() { } -uint32_t MachineBranchProbabilityInfo:: -getSumForBlock(const MachineBasicBlock *MBB, uint32_t &Scale) const { - // First we compute the sum with 64-bits of precision, ensuring that cannot - // overflow by bounding the number of weights considered. Hopefully no one - // actually needs 2^32 successors. - assert(MBB->succ_size() < UINT32_MAX); - uint64_t Sum = 0; - Scale = 1; - for (MachineBasicBlock::const_succ_iterator I = MBB->succ_begin(), - E = MBB->succ_end(); I != E; ++I) { - uint32_t Weight = getEdgeWeight(MBB, I); - Sum += Weight; - } - - // If the computed sum fits in 32-bits, we're done. - if (Sum <= UINT32_MAX) - return Sum; +uint32_t MachineBranchProbabilityInfo::getEdgeWeight( + const MachineBasicBlock *Src, + MachineBasicBlock::const_succ_iterator Dst) const { + return Src->getSuccProbability(Dst).getNumerator(); +} - // Otherwise, compute the scale necessary to cause the weights to fit, and - // re-sum with that scale applied. 
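In the rotateLoopWithProfile() hunk above, the frequency of a loop-exit edge is now the exiting block's frequency scaled directly by the edge's branch probability, instead of being rebuilt from raw edge weights and a per-block sum. A tiny numeric sketch, with an integer and a double standing in for BlockFrequency and BranchProbability:

    #include <cassert>
    #include <cstdint>

    int main() {
      uint64_t BlockFreq = 1000;          // frequency of the exiting block
      double LargestExitEdgeProb = 0.125; // most likely exit edge of that block

      uint64_t ExitFreq = uint64_t(BlockFreq * LargestExitEdgeProb);
      assert(ExitFreq == 125);
    }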
- assert((Sum / UINT32_MAX) < UINT32_MAX); - Scale = (Sum / UINT32_MAX) + 1; - Sum = 0; - for (MachineBasicBlock::const_succ_iterator I = MBB->succ_begin(), - E = MBB->succ_end(); I != E; ++I) { - uint32_t Weight = getEdgeWeight(MBB, I); - Sum += Weight / Scale; - } - assert(Sum <= UINT32_MAX); - return Sum; +uint32_t MachineBranchProbabilityInfo::getEdgeWeight( + const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const { + // This is a linear search. Try to use the const_succ_iterator version when + // possible. + return getEdgeWeight(Src, std::find(Src->succ_begin(), Src->succ_end(), Dst)); } -uint32_t MachineBranchProbabilityInfo:: -getEdgeWeight(const MachineBasicBlock *Src, - MachineBasicBlock::const_succ_iterator Dst) const { - uint32_t Weight = Src->getSuccWeight(Dst); - if (!Weight) - return DEFAULT_WEIGHT; - return Weight; +BranchProbability MachineBranchProbabilityInfo::getEdgeProbability( + const MachineBasicBlock *Src, + MachineBasicBlock::const_succ_iterator Dst) const { + return Src->getSuccProbability(Dst); } -uint32_t MachineBranchProbabilityInfo:: -getEdgeWeight(const MachineBasicBlock *Src, - const MachineBasicBlock *Dst) const { +BranchProbability MachineBranchProbabilityInfo::getEdgeProbability( + const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const { // This is a linear search. Try to use the const_succ_iterator version when // possible. - return getEdgeWeight(Src, std::find(Src->succ_begin(), Src->succ_end(), Dst)); + return getEdgeProbability(Src, + std::find(Src->succ_begin(), Src->succ_end(), Dst)); } bool MachineBranchProbabilityInfo::isEdgeHot(const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const { // Hot probability is at least 4/5 = 80% - // FIXME: Compare against a static "hot" BranchProbability. 
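The isEdgeHot() change above compares against a static 4/5 threshold instead of constructing the probability on every call. A minimal fixed-point model of that comparison (the struct is a stand-in for BranchProbability, using cross-multiplication to avoid overflow and rounding):

    #include <cassert>
    #include <cstdint>

    struct Prob {
      uint32_t N, D; // value is N/D
      friend bool operator>(Prob L, Prob R) {
        return uint64_t(L.N) * R.D > uint64_t(R.N) * L.D;
      }
    };

    bool isEdgeHot(Prob EdgeProb) {
      static const Prob HotProb{4, 5}; // 80%
      return EdgeProb > HotProb;       // "hot" means strictly greater than 4/5
    }

    int main() {
      assert(isEdgeHot({9, 10}));  // 90% is hot
      assert(!isEdgeHot({4, 5}));  // exactly 80% is not (strict comparison)
      assert(!isEdgeHot({1, 2}));  // 50% is not
    }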
- return getEdgeProbability(Src, Dst) > BranchProbability(4, 5); + static BranchProbability HotProb(4, 5); + return getEdgeProbability(Src, Dst) > HotProb; } MachineBasicBlock * MachineBranchProbabilityInfo::getHotSucc(MachineBasicBlock *MBB) const { - uint32_t MaxWeight = 0; + auto MaxProb = BranchProbability::getZero(); MachineBasicBlock *MaxSucc = nullptr; for (MachineBasicBlock::const_succ_iterator I = MBB->succ_begin(), E = MBB->succ_end(); I != E; ++I) { - uint32_t Weight = getEdgeWeight(MBB, I); - if (Weight > MaxWeight) { - MaxWeight = Weight; + auto Prob = getEdgeProbability(MBB, I); + if (Prob > MaxProb) { + MaxProb = Prob; MaxSucc = *I; } } - if (getEdgeProbability(MBB, MaxSucc) >= BranchProbability(4, 5)) + static BranchProbability HotProb(4, 5); + if (getEdgeProbability(MBB, MaxSucc) >= HotProb) return MaxSucc; return nullptr; } -BranchProbability MachineBranchProbabilityInfo::getEdgeProbability( - const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const { - uint32_t Scale = 1; - uint32_t D = getSumForBlock(Src, Scale); - uint32_t N = getEdgeWeight(Src, Dst) / Scale; - - return BranchProbability(N, D); -} - raw_ostream &MachineBranchProbabilityInfo::printEdgeProbability( raw_ostream &OS, const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const { diff --git a/lib/CodeGen/MachineFunction.cpp b/lib/CodeGen/MachineFunction.cpp index 7e6af1c9c41f..80d30a5b131a 100644 --- a/lib/CodeGen/MachineFunction.cpp +++ b/lib/CodeGen/MachineFunction.cpp @@ -17,7 +17,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallString.h" #include "llvm/Analysis/ConstantFolding.h" -#include "llvm/Analysis/LibCallSemantics.h" +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunctionInitializer.h" diff --git a/lib/CodeGen/MachineModuleInfo.cpp b/lib/CodeGen/MachineModuleInfo.cpp index 18efcf39c453..1956a701d8e6 100644 --- a/lib/CodeGen/MachineModuleInfo.cpp +++ b/lib/CodeGen/MachineModuleInfo.cpp @@ -10,7 +10,7 @@ #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/ADT/PointerUnion.h" #include "llvm/ADT/TinyPtrVector.h" -#include "llvm/Analysis/LibCallSemantics.h" +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" diff --git a/lib/CodeGen/MachineRegisterInfo.cpp b/lib/CodeGen/MachineRegisterInfo.cpp index abf9b4d67696..03c82f46da63 100644 --- a/lib/CodeGen/MachineRegisterInfo.cpp +++ b/lib/CodeGen/MachineRegisterInfo.cpp @@ -27,12 +27,11 @@ void MachineRegisterInfo::Delegate::anchor() {} MachineRegisterInfo::MachineRegisterInfo(const MachineFunction *MF) : MF(MF), TheDelegate(nullptr), IsSSA(true), TracksLiveness(true), TracksSubRegLiveness(false) { + unsigned NumRegs = getTargetRegisterInfo()->getNumRegs(); VRegInfo.reserve(256); RegAllocHints.reserve(256); - UsedPhysRegMask.resize(getTargetRegisterInfo()->getNumRegs()); - - // Create the physreg use/def lists. - PhysRegUseDefLists.resize(getTargetRegisterInfo()->getNumRegs(), nullptr); + UsedPhysRegMask.resize(NumRegs); + PhysRegUseDefLists.reset(new MachineOperand*[NumRegs]()); } /// setRegClass - Set the register class of the specified virtual register. 
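The MachineRegisterInfo hunk above swaps a never-resized std::vector for a heap array owned by std::unique_ptr<T[]>, value-initialized by the trailing "()" in the new-expression. A small sketch of that allocation idiom:

    #include <cassert>
    #include <memory>

    int main() {
      const unsigned NumRegs = 8;
      // new T[n]() value-initializes every element, so each list head starts
      // out null, matching the old vector's (n, nullptr) construction.
      std::unique_ptr<int *[]> UseDefLists(new int *[NumRegs]());
      for (unsigned I = 0; I != NumRegs; ++I)
        assert(UseDefLists[I] == nullptr);
    }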
diff --git a/lib/CodeGen/MachineVerifier.cpp b/lib/CodeGen/MachineVerifier.cpp index e1020772629c..cdcd8eb4fbdf 100644 --- a/lib/CodeGen/MachineVerifier.cpp +++ b/lib/CodeGen/MachineVerifier.cpp @@ -28,7 +28,7 @@ #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/SetOperations.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/Analysis/LibCallSemantics.h" +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "llvm/CodeGen/LiveStackAnalysis.h" #include "llvm/CodeGen/LiveVariables.h" diff --git a/lib/CodeGen/RegisterPressure.cpp b/lib/CodeGen/RegisterPressure.cpp index 6ff16e551be2..6e7feb5178ee 100644 --- a/lib/CodeGen/RegisterPressure.cpp +++ b/lib/CodeGen/RegisterPressure.cpp @@ -59,12 +59,12 @@ void RegisterPressure::dump(const TargetRegisterInfo *TRI) const { dbgs() << "Max Pressure: "; dumpRegSetPressure(MaxSetPressure, TRI); dbgs() << "Live In: "; - for (unsigned i = 0, e = LiveInRegs.size(); i < e; ++i) - dbgs() << PrintVRegOrUnit(LiveInRegs[i], TRI) << " "; + for (unsigned Reg : LiveInRegs) + dbgs() << PrintVRegOrUnit(Reg, TRI) << " "; dbgs() << '\n'; dbgs() << "Live Out: "; - for (unsigned i = 0, e = LiveOutRegs.size(); i < e; ++i) - dbgs() << PrintVRegOrUnit(LiveOutRegs[i], TRI) << " "; + for (unsigned Reg : LiveOutRegs) + dbgs() << PrintVRegOrUnit(Reg, TRI) << " "; dbgs() << '\n'; } @@ -92,8 +92,8 @@ void PressureDiff::dump(const TargetRegisterInfo &TRI) const { /// Increase the current pressure as impacted by these registers and bump /// the high water mark if needed. void RegPressureTracker::increaseRegPressure(ArrayRef RegUnits) { - for (unsigned i = 0, e = RegUnits.size(); i != e; ++i) { - PSetIterator PSetI = MRI->getPressureSets(RegUnits[i]); + for (unsigned RegUnit : RegUnits) { + PSetIterator PSetI = MRI->getPressureSets(RegUnit); unsigned Weight = PSetI.getWeight(); for (; PSetI.isValid(); ++PSetI) { CurrSetPressure[*PSetI] += Weight; @@ -106,8 +106,8 @@ void RegPressureTracker::increaseRegPressure(ArrayRef RegUnits) { /// Simply decrease the current pressure as impacted by these registers. void RegPressureTracker::decreaseRegPressure(ArrayRef RegUnits) { - for (unsigned I = 0, E = RegUnits.size(); I != E; ++I) - decreaseSetPressure(CurrSetPressure, MRI->getPressureSets(RegUnits[I])); + for (unsigned RegUnit : RegUnits) + decreaseSetPressure(CurrSetPressure, MRI->getPressureSets(RegUnit)); } /// Clear the result so it can be used for another round of pressure tracking. @@ -298,8 +298,7 @@ void RegPressureTracker::closeRegion() { void RegPressureTracker::initLiveThru(const RegPressureTracker &RPTracker) { LiveThruPressure.assign(TRI->getNumRegPressureSets(), 0); assert(isBottomClosed() && "need bottom-up tracking to intialize."); - for (unsigned i = 0, e = P.LiveOutRegs.size(); i < e; ++i) { - unsigned Reg = P.LiveOutRegs[i]; + for (unsigned Reg : P.LiveOutRegs) { if (TargetRegisterInfo::isVirtualRegister(Reg) && !RPTracker.hasUntiedDef(Reg)) { increaseSetPressure(LiveThruPressure, MRI->getPressureSets(Reg)); @@ -314,71 +313,88 @@ static bool containsReg(ArrayRef RegUnits, unsigned RegUnit) { } namespace { -/// Collect this instruction's unique uses and defs into SmallVectors for -/// processing defs and uses in order. -/// -/// FIXME: always ignore tied opers -class RegisterOperands { - const TargetRegisterInfo *TRI; - const MachineRegisterInfo *MRI; - bool IgnoreDead; +/// List of register defined and used by a machine instruction. 
+class RegisterOperands { public: SmallVector Uses; SmallVector Defs; SmallVector DeadDefs; - RegisterOperands(const TargetRegisterInfo *tri, - const MachineRegisterInfo *mri, bool ID = false): - TRI(tri), MRI(mri), IgnoreDead(ID) {} + void collect(const MachineInstr &MI, const TargetRegisterInfo &TRI, + const MachineRegisterInfo &MRI, bool IgnoreDead = false); +}; + +/// Collect this instruction's unique uses and defs into SmallVectors for +/// processing defs and uses in order. +/// +/// FIXME: always ignore tied opers +class RegisterOperandsCollector { + RegisterOperands &RegOpers; + const TargetRegisterInfo &TRI; + const MachineRegisterInfo &MRI; + bool IgnoreDead; - /// Push this operand's register onto the correct vector. - void collect(const MachineOperand &MO) { + RegisterOperandsCollector(RegisterOperands &RegOpers, + const TargetRegisterInfo &TRI, + const MachineRegisterInfo &MRI, + bool IgnoreDead) + : RegOpers(RegOpers), TRI(TRI), MRI(MRI), IgnoreDead(IgnoreDead) {} + + void collectInstr(const MachineInstr &MI) const { + for (ConstMIBundleOperands OperI(&MI); OperI.isValid(); ++OperI) + collectOperand(*OperI); + + // Remove redundant physreg dead defs. + SmallVectorImpl::iterator I = + std::remove_if(RegOpers.DeadDefs.begin(), RegOpers.DeadDefs.end(), + std::bind1st(std::ptr_fun(containsReg), RegOpers.Defs)); + RegOpers.DeadDefs.erase(I, RegOpers.DeadDefs.end()); + } + + /// Push this operand's register onto the correct vectors. + void collectOperand(const MachineOperand &MO) const { if (!MO.isReg() || !MO.getReg()) return; + unsigned Reg = MO.getReg(); if (MO.readsReg()) - pushRegUnits(MO.getReg(), Uses); + pushRegUnits(Reg, RegOpers.Uses); if (MO.isDef()) { if (MO.isDead()) { if (!IgnoreDead) - pushRegUnits(MO.getReg(), DeadDefs); - } - else - pushRegUnits(MO.getReg(), Defs); + pushRegUnits(Reg, RegOpers.DeadDefs); + } else + pushRegUnits(Reg, RegOpers.Defs); } } -protected: - void pushRegUnits(unsigned Reg, SmallVectorImpl &RegUnits) { + void pushRegUnits(unsigned Reg, SmallVectorImpl &RegUnits) const { if (TargetRegisterInfo::isVirtualRegister(Reg)) { if (containsReg(RegUnits, Reg)) return; RegUnits.push_back(Reg); - } - else if (MRI->isAllocatable(Reg)) { - for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units) { + } else if (MRI.isAllocatable(Reg)) { + for (MCRegUnitIterator Units(Reg, &TRI); Units.isValid(); ++Units) { if (containsReg(RegUnits, *Units)) continue; RegUnits.push_back(*Units); } } } + + friend class RegisterOperands; }; -} // namespace -/// Collect physical and virtual register operands. -static void collectOperands(const MachineInstr *MI, - RegisterOperands &RegOpers) { - for (ConstMIBundleOperands OperI(MI); OperI.isValid(); ++OperI) - RegOpers.collect(*OperI); - - // Remove redundant physreg dead defs. - SmallVectorImpl::iterator I = - std::remove_if(RegOpers.DeadDefs.begin(), RegOpers.DeadDefs.end(), - std::bind1st(std::ptr_fun(containsReg), RegOpers.Defs)); - RegOpers.DeadDefs.erase(I, RegOpers.DeadDefs.end()); +void RegisterOperands::collect(const MachineInstr &MI, + const TargetRegisterInfo &TRI, + const MachineRegisterInfo &MRI, + bool IgnoreDead) { + RegisterOperandsCollector Collector(*this, TRI, MRI, IgnoreDead); + Collector.collectInstr(MI); } +} // namespace + /// Initialize an array of N PressureDiffs. 
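The RegisterPressure hunk above splits the old RegisterOperands class into a plain result object and a private collector that fills it in. A structural sketch of that split with stand-in operand/instruction types (the real collector also carries TRI/MRI and the IgnoreDead flag):

    #include <cassert>
    #include <vector>

    struct Operand {
      unsigned Reg;
      bool IsDef;
    };

    // Result object: only holds the collected registers.
    struct RegisterOperands {
      std::vector<unsigned> Uses;
      std::vector<unsigned> Defs;
      void collect(const std::vector<Operand> &MI);
    };

    namespace {
    // Collector: walks the instruction's operands and pushes them onto the
    // right vectors of the result object.
    class RegisterOperandsCollector {
      RegisterOperands &RegOpers;

    public:
      explicit RegisterOperandsCollector(RegisterOperands &RO) : RegOpers(RO) {}
      void collectInstr(const std::vector<Operand> &MI) const {
        for (const Operand &MO : MI)
          (MO.IsDef ? RegOpers.Defs : RegOpers.Uses).push_back(MO.Reg);
      }
    };
    } // namespace

    void RegisterOperands::collect(const std::vector<Operand> &MI) {
      RegisterOperandsCollector(*this).collectInstr(MI);
    }

    int main() {
      RegisterOperands RO;
      RO.collect({{1, false}, {2, true}});
      assert(RO.Uses.size() == 1 && RO.Defs.size() == 1);
    }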
void PressureDiffs::init(unsigned N) { Size = N; @@ -432,18 +448,18 @@ static void collectPDiff(PressureDiff &PDiff, RegisterOperands &RegOpers, const MachineRegisterInfo *MRI) { assert(!PDiff.begin()->isValid() && "stale PDiff"); - for (unsigned i = 0, e = RegOpers.Defs.size(); i != e; ++i) - PDiff.addPressureChange(RegOpers.Defs[i], true, MRI); + for (unsigned Reg : RegOpers.Defs) + PDiff.addPressureChange(Reg, true, MRI); - for (unsigned i = 0, e = RegOpers.Uses.size(); i != e; ++i) - PDiff.addPressureChange(RegOpers.Uses[i], false, MRI); + for (unsigned Reg : RegOpers.Uses) + PDiff.addPressureChange(Reg, false, MRI); } /// Force liveness of registers. void RegPressureTracker::addLiveRegs(ArrayRef Regs) { - for (unsigned i = 0, e = Regs.size(); i != e; ++i) { - if (LiveRegs.insert(Regs[i])) - increaseRegPressure(Regs[i]); + for (unsigned Reg : Regs) { + if (LiveRegs.insert(Reg)) + increaseRegPressure(Reg); } } @@ -474,13 +490,9 @@ void RegPressureTracker::discoverLiveOut(unsigned Reg) { /// registers that are both defined and used by the instruction. If a pressure /// difference pointer is provided record the changes is pressure caused by this /// instruction independent of liveness. -bool RegPressureTracker::recede(SmallVectorImpl *LiveUses, +void RegPressureTracker::recede(SmallVectorImpl *LiveUses, PressureDiff *PDiff) { - // Check for the top of the analyzable region. - if (CurrPos == MBB->begin()) { - closeRegion(); - return false; - } + assert(CurrPos != MBB->begin()); if (!isBottomClosed()) closeBottom(); @@ -492,11 +504,8 @@ bool RegPressureTracker::recede(SmallVectorImpl *LiveUses, do --CurrPos; while (CurrPos != MBB->begin() && CurrPos->isDebugValue()); + assert(!CurrPos->isDebugValue()); - if (CurrPos->isDebugValue()) { - closeRegion(); - return false; - } SlotIndex SlotIdx; if (RequireIntervals) SlotIdx = LIS->getInstructionIndex(CurrPos).getRegSlot(); @@ -505,8 +514,8 @@ bool RegPressureTracker::recede(SmallVectorImpl *LiveUses, if (RequireIntervals && isTopClosed()) static_cast(P).openTop(SlotIdx); - RegisterOperands RegOpers(TRI, MRI); - collectOperands(CurrPos, RegOpers); + RegisterOperands RegOpers; + RegOpers.collect(*CurrPos, *TRI, *MRI); if (PDiff) collectPDiff(*PDiff, RegOpers, MRI); @@ -517,8 +526,7 @@ bool RegPressureTracker::recede(SmallVectorImpl *LiveUses, // Kill liveness at live defs. // TODO: consider earlyclobbers? - for (unsigned i = 0, e = RegOpers.Defs.size(); i < e; ++i) { - unsigned Reg = RegOpers.Defs[i]; + for (unsigned Reg : RegOpers.Defs) { bool DeadDef = false; if (RequireIntervals) { const LiveRange *LR = getLiveRange(Reg); @@ -542,8 +550,7 @@ bool RegPressureTracker::recede(SmallVectorImpl *LiveUses, } // Generate liveness for uses. - for (unsigned i = 0, e = RegOpers.Uses.size(); i < e; ++i) { - unsigned Reg = RegOpers.Uses[i]; + for (unsigned Reg : RegOpers.Uses) { if (!LiveRegs.contains(Reg)) { // Adjust liveouts if LiveIntervals are available. if (RequireIntervals) { @@ -561,24 +568,18 @@ bool RegPressureTracker::recede(SmallVectorImpl *LiveUses, } } if (TrackUntiedDefs) { - for (unsigned i = 0, e = RegOpers.Defs.size(); i < e; ++i) { - unsigned Reg = RegOpers.Defs[i]; + for (unsigned Reg : RegOpers.Defs) { if (TargetRegisterInfo::isVirtualRegister(Reg) && !LiveRegs.contains(Reg)) UntiedDefs.insert(Reg); } } - return true; } /// Advance across the current instruction. -bool RegPressureTracker::advance() { +void RegPressureTracker::advance() { assert(!TrackUntiedDefs && "unsupported mode"); - // Check for the bottom of the analyzable region. 
- if (CurrPos == MBB->end()) { - closeRegion(); - return false; - } + assert(CurrPos != MBB->end()); if (!isTopClosed()) closeTop(); @@ -594,11 +595,10 @@ bool RegPressureTracker::advance() { static_cast(P).openBottom(CurrPos); } - RegisterOperands RegOpers(TRI, MRI); - collectOperands(CurrPos, RegOpers); + RegisterOperands RegOpers; + RegOpers.collect(*CurrPos, *TRI, *MRI); - for (unsigned i = 0, e = RegOpers.Uses.size(); i < e; ++i) { - unsigned Reg = RegOpers.Uses[i]; + for (unsigned Reg : RegOpers.Uses) { // Discover live-ins. bool isLive = LiveRegs.contains(Reg); if (!isLive) @@ -608,22 +608,19 @@ bool RegPressureTracker::advance() { if (RequireIntervals) { const LiveRange *LR = getLiveRange(Reg); lastUse = LR && LR->Query(SlotIdx).isKill(); - } - else { + } else { // Allocatable physregs are always single-use before register rewriting. lastUse = !TargetRegisterInfo::isVirtualRegister(Reg); } if (lastUse && isLive) { LiveRegs.erase(Reg); decreaseRegPressure(Reg); - } - else if (!lastUse && !isLive) + } else if (!lastUse && !isLive) increaseRegPressure(Reg); } // Generate liveness for defs. - for (unsigned i = 0, e = RegOpers.Defs.size(); i < e; ++i) { - unsigned Reg = RegOpers.Defs[i]; + for (unsigned Reg : RegOpers.Defs) { if (LiveRegs.insert(Reg)) increaseRegPressure(Reg); } @@ -636,7 +633,6 @@ bool RegPressureTracker::advance() { do ++CurrPos; while (CurrPos != MBB->end() && CurrPos->isDebugValue()); - return true; } /// Find the max change in excess pressure across all sets. @@ -662,8 +658,7 @@ static void computeExcessPressureDelta(ArrayRef OldPressureVec, PDiff = 0; // Under the limit else PDiff = PNew - Limit; // Just exceeded limit. - } - else if (Limit > PNew) + } else if (Limit > PNew) PDiff = Limit - POld; // Just obeyed limit. if (PDiff) { @@ -728,17 +723,12 @@ void RegPressureTracker::bumpUpwardPressure(const MachineInstr *MI) { assert(!MI->isDebugValue() && "Expect a nondebug instruction."); // Account for register pressure similar to RegPressureTracker::recede(). - RegisterOperands RegOpers(TRI, MRI, /*IgnoreDead=*/true); - collectOperands(MI, RegOpers); - - // Boost max pressure for all dead defs together. - // Since CurrSetPressure and MaxSetPressure - increaseRegPressure(RegOpers.DeadDefs); - decreaseRegPressure(RegOpers.DeadDefs); + RegisterOperands RegOpers; + RegOpers.collect(*MI, *TRI, *MRI, /*IgnoreDead=*/true); + assert(RegOpers.DeadDefs.size() == 0); // Kill liveness at live defs. - for (unsigned i = 0, e = RegOpers.Defs.size(); i < e; ++i) { - unsigned Reg = RegOpers.Defs[i]; + for (unsigned Reg : RegOpers.Defs) { bool DeadDef = false; if (RequireIntervals) { const LiveRange *LR = getLiveRange(Reg); @@ -754,8 +744,7 @@ void RegPressureTracker::bumpUpwardPressure(const MachineInstr *MI) { } } // Generate liveness for uses. - for (unsigned i = 0, e = RegOpers.Uses.size(); i < e; ++i) { - unsigned Reg = RegOpers.Uses[i]; + for (unsigned Reg : RegOpers.Uses) { if (!LiveRegs.contains(Reg)) increaseRegPressure(Reg); } @@ -923,8 +912,8 @@ void RegPressureTracker::bumpDownwardPressure(const MachineInstr *MI) { assert(!MI->isDebugValue() && "Expect a nondebug instruction."); // Account for register pressure similar to RegPressureTracker::recede(). - RegisterOperands RegOpers(TRI, MRI); - collectOperands(MI, RegOpers); + RegisterOperands RegOpers; + RegOpers.collect(*MI, *TRI, *MRI); // Kill liveness at last uses. Assume allocatable physregs are single-use // rather than checking LiveIntervals. 
@@ -932,8 +921,7 @@ void RegPressureTracker::bumpDownwardPressure(const MachineInstr *MI) { if (RequireIntervals) SlotIdx = LIS->getInstructionIndex(MI).getRegSlot(); - for (unsigned i = 0, e = RegOpers.Uses.size(); i < e; ++i) { - unsigned Reg = RegOpers.Uses[i]; + for (unsigned Reg : RegOpers.Uses) { if (RequireIntervals) { // FIXME: allow the caller to pass in the list of vreg uses that remain // to be bottom-scheduled to avoid searching uses at each query. @@ -944,8 +932,7 @@ void RegPressureTracker::bumpDownwardPressure(const MachineInstr *MI) { if (LRQ.isKill() && !findUseBetween(Reg, CurrIdx, SlotIdx, *MRI, LIS)) decreaseRegPressure(Reg); } - } - else if (!TargetRegisterInfo::isVirtualRegister(Reg)) { + } else if (!TargetRegisterInfo::isVirtualRegister(Reg)) { // Allocatable physregs are always single-use before register rewriting. decreaseRegPressure(Reg); } diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index c5810525f3c7..8238cdeb59ca 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -1213,6 +1213,10 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) { case ISD::STACKSAVE: Action = TLI.getOperationAction(Node->getOpcode(), MVT::Other); break; + case ISD::GET_DYNAMIC_AREA_OFFSET: + Action = TLI.getOperationAction(Node->getOpcode(), + Node->getValueType(0)); + break; case ISD::VAARG: Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0)); @@ -3295,6 +3299,10 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { Results.push_back(Node->getOperand(0)); } break; + case ISD::GET_DYNAMIC_AREA_OFFSET: + Results.push_back(DAG.getConstant(0, dl, Node->getValueType(0))); + Results.push_back(Results[0].getValue(0)); + break; case ISD::FCOPYSIGN: Results.push_back(ExpandFCOPYSIGN(Node)); break; diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp index e9bd52034ffd..78985e01ef9a 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp @@ -141,8 +141,8 @@ class ScheduleDAGRRList : public ScheduleDAGSDNodes { /// that are "live". These nodes must be scheduled before any other nodes that /// modifies the registers can be scheduled. unsigned NumLiveRegs; - std::vector LiveRegDefs; - std::vector LiveRegGens; + std::unique_ptr LiveRegDefs; + std::unique_ptr LiveRegGens; // Collect interferences between physical register use/defs. // Each interference is an SUnit and set of physical registers. @@ -328,8 +328,8 @@ void ScheduleDAGRRList::Schedule() { NumLiveRegs = 0; // Allocate slots for each physical register, plus one for a special register // to track the virtual resource of a calling sequence. - LiveRegDefs.resize(TRI->getNumRegs() + 1, nullptr); - LiveRegGens.resize(TRI->getNumRegs() + 1, nullptr); + LiveRegDefs.reset(new SUnit*[TRI->getNumRegs() + 1]()); + LiveRegGens.reset(new SUnit*[TRI->getNumRegs() + 1]()); CallSeqEndForStart.clear(); assert(Interferences.empty() && LRegsMap.empty() && "stale Interferences"); @@ -1218,7 +1218,7 @@ static MVT getPhysicalRegisterVT(SDNode *N, unsigned Reg, /// CheckForLiveRegDef - Return true and update live register vector if the /// specified register def of the specified SUnit clobbers any "live" registers. 
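The ScheduleDAGRRList change above replaces std::vector with a std::unique_ptr-owned array; the trailing "()" in new SUnit*[N]() value-initializes every slot to null, matching the old resize(N, nullptr). A standalone sketch of the allocation pattern, with SUnit replaced by a placeholder type:

    #include <cassert>
    #include <memory>

    struct SUnitStub {};  // placeholder for llvm::SUnit

    int main() {
      unsigned NumRegs = 128;

      std::unique_ptr<SUnitStub *[]> LiveRegDefs;

      // The trailing "()" value-initializes the array, so every element starts
      // out as nullptr, the same state the old resize(NumRegs + 1, nullptr)
      // produced, without vector's capacity bookkeeping.
      LiveRegDefs.reset(new SUnitStub *[NumRegs + 1]());

      for (unsigned I = 0; I != NumRegs + 1; ++I)
        assert(LiveRegDefs[I] == nullptr);

      // Consumers that want the raw buffer use get(); bounded views can be
      // layered on top of it (the patch builds an ArrayRef that way).
      SUnitStub **Raw = LiveRegDefs.get();
      (void)Raw;
      return 0;
    }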
static void CheckForLiveRegDef(SUnit *SU, unsigned Reg, - std::vector &LiveRegDefs, + SUnit **LiveRegDefs, SmallSet &RegAdded, SmallVectorImpl &LRegs, const TargetRegisterInfo *TRI) { @@ -1240,7 +1240,7 @@ static void CheckForLiveRegDef(SUnit *SU, unsigned Reg, /// CheckForLiveRegDefMasked - Check for any live physregs that are clobbered /// by RegMask, and add them to LRegs. static void CheckForLiveRegDefMasked(SUnit *SU, const uint32_t *RegMask, - std::vector &LiveRegDefs, + ArrayRef LiveRegDefs, SmallSet &RegAdded, SmallVectorImpl &LRegs) { // Look at all live registers. Skip Reg0 and the special CallResource. @@ -1278,7 +1278,7 @@ DelayForLiveRegsBottomUp(SUnit *SU, SmallVectorImpl &LRegs) { for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end(); I != E; ++I) { if (I->isAssignedRegDep() && LiveRegDefs[I->getReg()] != SU) - CheckForLiveRegDef(I->getSUnit(), I->getReg(), LiveRegDefs, + CheckForLiveRegDef(I->getSUnit(), I->getReg(), LiveRegDefs.get(), RegAdded, LRegs, TRI); } @@ -1302,7 +1302,7 @@ DelayForLiveRegsBottomUp(SUnit *SU, SmallVectorImpl &LRegs) { for (; NumVals; --NumVals, ++i) { unsigned Reg = cast(Node->getOperand(i))->getReg(); if (TargetRegisterInfo::isPhysicalRegister(Reg)) - CheckForLiveRegDef(SU, Reg, LiveRegDefs, RegAdded, LRegs, TRI); + CheckForLiveRegDef(SU, Reg, LiveRegDefs.get(), RegAdded, LRegs, TRI); } } else i += NumVals; @@ -1328,13 +1328,15 @@ DelayForLiveRegsBottomUp(SUnit *SU, SmallVectorImpl &LRegs) { } } if (const uint32_t *RegMask = getNodeRegMask(Node)) - CheckForLiveRegDefMasked(SU, RegMask, LiveRegDefs, RegAdded, LRegs); + CheckForLiveRegDefMasked(SU, RegMask, + makeArrayRef(LiveRegDefs.get(), TRI->getNumRegs()), + RegAdded, LRegs); const MCInstrDesc &MCID = TII->get(Node->getMachineOpcode()); if (!MCID.ImplicitDefs) continue; for (const uint16_t *Reg = MCID.getImplicitDefs(); *Reg; ++Reg) - CheckForLiveRegDef(SU, *Reg, LiveRegDefs, RegAdded, LRegs, TRI); + CheckForLiveRegDef(SU, *Reg, LiveRegDefs.get(), RegAdded, LRegs, TRI); } return !LRegs.empty(); diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index f754e24e3231..85e7e3c1bc8c 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -3301,18 +3301,18 @@ void SelectionDAGBuilder::visitMaskedStore(const CallInst &I) { // extract the spalt value and use it as a uniform base. // In all other cases the function returns 'false'. 
// -static bool getUniformBase(Value *& Ptr, SDValue& Base, SDValue& Index, +static bool getUniformBase(const Value *& Ptr, SDValue& Base, SDValue& Index, SelectionDAGBuilder* SDB) { SelectionDAG& DAG = SDB->DAG; LLVMContext &Context = *DAG.getContext(); assert(Ptr->getType()->isVectorTy() && "Uexpected pointer type"); - GetElementPtrInst *GEP = dyn_cast(Ptr); + const GetElementPtrInst *GEP = dyn_cast(Ptr); if (!GEP || GEP->getNumOperands() > 2) return false; - Value *GEPPtr = GEP->getPointerOperand(); + const Value *GEPPtr = GEP->getPointerOperand(); if (!GEPPtr->getType()->isVectorTy()) Ptr = GEPPtr; else if (!(Ptr = getSplatValue(GEPPtr))) @@ -3348,7 +3348,7 @@ void SelectionDAGBuilder::visitMaskedScatter(const CallInst &I) { SDLoc sdl = getCurSDLoc(); // llvm.masked.scatter.*(Src0, Ptrs, alignemt, Mask) - Value *Ptr = I.getArgOperand(1); + const Value *Ptr = I.getArgOperand(1); SDValue Src0 = getValue(I.getArgOperand(0)); SDValue Mask = getValue(I.getArgOperand(3)); EVT VT = Src0.getValueType(); @@ -3362,10 +3362,10 @@ void SelectionDAGBuilder::visitMaskedScatter(const CallInst &I) { SDValue Base; SDValue Index; - Value *BasePtr = Ptr; + const Value *BasePtr = Ptr; bool UniformBase = getUniformBase(BasePtr, Base, Index, this); - Value *MemOpBasePtr = UniformBase ? BasePtr : nullptr; + const Value *MemOpBasePtr = UniformBase ? BasePtr : nullptr; MachineMemOperand *MMO = DAG.getMachineFunction(). getMachineMemOperand(MachinePointerInfo(MemOpBasePtr), MachineMemOperand::MOStore, VT.getStoreSize(), @@ -3425,7 +3425,7 @@ void SelectionDAGBuilder::visitMaskedGather(const CallInst &I) { SDLoc sdl = getCurSDLoc(); // @llvm.masked.gather.*(Ptrs, alignment, Mask, Src0) - Value *Ptr = I.getArgOperand(0); + const Value *Ptr = I.getArgOperand(0); SDValue Src0 = getValue(I.getArgOperand(3)); SDValue Mask = getValue(I.getArgOperand(2)); @@ -3442,7 +3442,7 @@ void SelectionDAGBuilder::visitMaskedGather(const CallInst &I) { SDValue Root = DAG.getRoot(); SDValue Base; SDValue Index; - Value *BasePtr = Ptr; + const Value *BasePtr = Ptr; bool UniformBase = getUniformBase(BasePtr, Base, Index, this); bool ConstantMemory = false; if (UniformBase && @@ -4463,22 +4463,17 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { Address = BCI->getOperand(0); // Parameters are handled specially. bool isParameter = Variable->isParameter() || isa(Address); - - const AllocaInst *AI = dyn_cast(Address); - - if (isParameter && !AI) { - FrameIndexSDNode *FINode = dyn_cast(N.getNode()); - if (FINode) - // Byval parameter. We have a frame index at this point. - SDV = DAG.getFrameIndexDbgValue( - Variable, Expression, FINode->getIndex(), 0, dl, SDNodeOrder); - else { - // Address is an argument, so try to emit its dbg value using - // virtual register info from the FuncInfo.ValueMap. - EmitFuncArgumentDbgValue(Address, Variable, Expression, dl, 0, false, - N); - return nullptr; - } + auto FINode = dyn_cast(N.getNode()); + if (isParameter && FINode) { + // Byval parameter. We have a frame index at this point. + SDV = DAG.getFrameIndexDbgValue(Variable, Expression, + FINode->getIndex(), 0, dl, SDNodeOrder); + } else if (isa(Address)) { + // Address is an argument, so try to emit its dbg value using + // virtual register info from the FuncInfo.ValueMap. 
+ EmitFuncArgumentDbgValue(Address, Variable, Expression, dl, 0, false, + N); + return nullptr; } else { SDV = DAG.getDbgValue(Variable, Expression, N.getNode(), N.getResNo(), true, 0, dl, SDNodeOrder); @@ -4933,6 +4928,21 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { DAG.setRoot(DAG.getNode(ISD::STACKRESTORE, sdl, MVT::Other, getRoot(), Res)); return nullptr; } + case Intrinsic::get_dynamic_area_offset: { + SDValue Op = getRoot(); + EVT PtrTy = TLI.getPointerTy(DAG.getDataLayout()); + EVT ResTy = TLI.getValueType(DAG.getDataLayout(), I.getType()); + // Result type for @llvm.get.dynamic.area.offset should match PtrTy for the + // target. + if (PtrTy != ResTy) + report_fatal_error("Wrong result type for @llvm.get.dynamic.area.offset" + " intrinsic!"); + Res = DAG.getNode(ISD::GET_DYNAMIC_AREA_OFFSET, sdl, DAG.getVTList(ResTy), + Op); + DAG.setRoot(Op); + setValue(&I, Res); + return nullptr; + } case Intrinsic::stackprotector: { // Emit code into the DAG to store the stack guard onto the stack. MachineFunction &MF = DAG.getMachineFunction(); diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 195b48498605..a6f9699bb29c 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -310,6 +310,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::LIFETIME_END: return "lifetime.end"; case ISD::GC_TRANSITION_START: return "gc_transition.start"; case ISD::GC_TRANSITION_END: return "gc_transition.end"; + case ISD::GET_DYNAMIC_AREA_OFFSET: return "get.dynamic.area.offset"; // Bit manipulation case ISD::BITREVERSE: return "bitreverse"; diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index ebf071cb9946..f6c5d90f47ae 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -19,7 +19,7 @@ #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/CFG.h" -#include "llvm/Analysis/LibCallSemantics.h" +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/FastISel.h" @@ -264,13 +264,17 @@ namespace llvm { return; IS.OptLevel = NewOptLevel; IS.TM.setOptLevel(NewOptLevel); - SavedFastISel = IS.TM.Options.EnableFastISel; - if (NewOptLevel == CodeGenOpt::None) - IS.TM.setFastISel(true); DEBUG(dbgs() << "\nChanging optimization level for Function " << IS.MF->getFunction()->getName() << "\n"); DEBUG(dbgs() << "\tBefore: -O" << SavedOptLevel << " ; After: -O" << NewOptLevel << "\n"); + SavedFastISel = IS.TM.Options.EnableFastISel; + if (NewOptLevel == CodeGenOpt::None) { + IS.TM.setFastISel(IS.TM.getO0WantsFastISel()); + DEBUG(dbgs() << "\tFastISel is " + << (IS.TM.Options.EnableFastISel ?
"enabled" : "disabled") << "\n"); + } } ~OptLevelChanger() { diff --git a/lib/CodeGen/TailDuplication.cpp b/lib/CodeGen/TailDuplication.cpp index ff86dabfac59..1f5b54866ac6 100644 --- a/lib/CodeGen/TailDuplication.cpp +++ b/lib/CodeGen/TailDuplication.cpp @@ -745,12 +745,12 @@ TailDuplicatePass::duplicateSimpleBB(MachineBasicBlock *TailBB, if (PredTBB) TII->InsertBranch(*PredBB, PredTBB, PredFBB, PredCond, DebugLoc()); - uint32_t Weight = MBPI->getEdgeWeight(PredBB, TailBB); + auto Prob = MBPI->getEdgeProbability(PredBB, TailBB); PredBB->removeSuccessor(TailBB); unsigned NumSuccessors = PredBB->succ_size(); assert(NumSuccessors <= 1); if (NumSuccessors == 0 || *PredBB->succ_begin() != NewTarget) - PredBB->addSuccessor(NewTarget, Weight); + PredBB->addSuccessor(NewTarget, Prob); TDBBs.push_back(PredBB); } @@ -858,7 +858,7 @@ TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB, "TailDuplicate called on block with multiple successors!"); for (MachineBasicBlock::succ_iterator I = TailBB->succ_begin(), E = TailBB->succ_end(); I != E; ++I) - PredBB->addSuccessor(*I, MBPI->getEdgeWeight(TailBB, I)); + PredBB->addSuccessor(*I, MBPI->getEdgeProbability(TailBB, I)); Changed = true; ++NumTailDups; diff --git a/lib/CodeGen/TargetLoweringBase.cpp b/lib/CodeGen/TargetLoweringBase.cpp index e348095aa8fc..69c130809bb8 100644 --- a/lib/CodeGen/TargetLoweringBase.cpp +++ b/lib/CodeGen/TargetLoweringBase.cpp @@ -840,6 +840,9 @@ void TargetLoweringBase::initActions() { setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Expand); setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Expand); } + + // For most targets @llvm.get.dynamic.area.offset just returns 0. + setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, VT, Expand); } // Most targets ignore the @llvm.prefetch intrinsic. diff --git a/lib/CodeGen/TwoAddressInstructionPass.cpp b/lib/CodeGen/TwoAddressInstructionPass.cpp index c96c813b0c9b..c6bae2434586 100644 --- a/lib/CodeGen/TwoAddressInstructionPass.cpp +++ b/lib/CodeGen/TwoAddressInstructionPass.cpp @@ -83,21 +83,20 @@ class TwoAddressInstructionPass : public MachineFunctionPass { // The current basic block being processed. MachineBasicBlock *MBB; - // DistanceMap - Keep track the distance of a MI from the start of the - // current basic block. + // Keep track of the distance of a MI from the start of the current basic block. DenseMap DistanceMap; // Set of already processed instructions in the current block. SmallPtrSet Processed; - // SrcRegMap - A map from virtual registers to physical registers which are - // likely targets to be coalesced to due to copies from physical registers to - // virtual registers. e.g. v1024 = move r0. + // A map from virtual registers to physical registers which are likely targets + // to be coalesced to due to copies from physical registers to virtual + // registers. e.g. v1024 = move r0. DenseMap SrcRegMap; - // DstRegMap - A map from virtual registers to physical registers which are - // likely targets to be coalesced to due to copies to physical registers from - // virtual registers. e.g. r1 = move v1024. + // A map from virtual registers to physical registers which are likely targets + // to be coalesced to due to copies to physical registers from virtual + // registers. e.g. r1 = move v1024. DenseMap DstRegMap; bool sink3AddrInstruction(MachineInstr *MI, unsigned Reg, @@ -165,7 +164,7 @@ class TwoAddressInstructionPass : public MachineFunctionPass { MachineFunctionPass::getAnalysisUsage(AU); } - /// runOnMachineFunction - Pass entry point.
+ /// Pass entry point. bool runOnMachineFunction(MachineFunction&) override; }; } // end anonymous namespace @@ -181,10 +180,9 @@ char &llvm::TwoAddressInstructionPassID = TwoAddressInstructionPass::ID; static bool isPlainlyKilled(MachineInstr *MI, unsigned Reg, LiveIntervals *LIS); -/// sink3AddrInstruction - A two-address instruction has been converted to a -/// three-address instruction to avoid clobbering a register. Try to sink it -/// past the instruction that would kill the above mentioned register to reduce -/// register pressure. +/// A two-address instruction has been converted to a three-address instruction +/// to avoid clobbering a register. Try to sink it past the instruction that +/// would kill the above mentioned register to reduce register pressure. bool TwoAddressInstructionPass:: sink3AddrInstruction(MachineInstr *MI, unsigned SavedReg, MachineBasicBlock::iterator OldPos) { @@ -313,8 +311,7 @@ sink3AddrInstruction(MachineInstr *MI, unsigned SavedReg, return true; } -/// getSingleDef -- return the MachineInstr* if it is the single def of the Reg -/// in current BB. +/// Return the MachineInstr* if it is the single def of the Reg in current BB. static MachineInstr *getSingleDef(unsigned Reg, MachineBasicBlock *BB, const MachineRegisterInfo *MRI) { MachineInstr *Ret = nullptr; @@ -352,10 +349,10 @@ bool TwoAddressInstructionPass::isRevCopyChain(unsigned FromReg, unsigned ToReg, return false; } -/// noUseAfterLastDef - Return true if there are no intervening uses between the -/// last instruction in the MBB that defines the specified register and the -/// two-address instruction which is being processed. It also returns the last -/// def location by reference +/// Return true if there are no intervening uses between the last instruction +/// in the MBB that defines the specified register and the two-address +/// instruction which is being processed. It also returns the last def location +/// by reference. bool TwoAddressInstructionPass::noUseAfterLastDef(unsigned Reg, unsigned Dist, unsigned &LastDef) { LastDef = 0; @@ -376,9 +373,9 @@ bool TwoAddressInstructionPass::noUseAfterLastDef(unsigned Reg, unsigned Dist, return !(LastUse > LastDef && LastUse < Dist); } -/// isCopyToReg - Return true if the specified MI is a copy instruction or -/// a extract_subreg instruction. It also returns the source and destination -/// registers and whether they are physical registers by reference. +/// Return true if the specified MI is a copy instruction or an extract_subreg +/// instruction. It also returns the source and destination registers and +/// whether they are physical registers by reference. static bool isCopyToReg(MachineInstr &MI, const TargetInstrInfo *TII, unsigned &SrcReg, unsigned &DstReg, bool &IsSrcPhys, bool &IsDstPhys) { @@ -398,8 +395,8 @@ static bool isCopyToReg(MachineInstr &MI, const TargetInstrInfo *TII, return true; } -/// isPLainlyKilled - Test if the given register value, which is used by the -// given instruction, is killed by the given instruction. +/// Test if the given register value, which is used by the +/// given instruction, is killed by the given instruction. 
static bool isPlainlyKilled(MachineInstr *MI, unsigned Reg, LiveIntervals *LIS) { if (LIS && TargetRegisterInfo::isVirtualRegister(Reg) && @@ -425,7 +422,7 @@ static bool isPlainlyKilled(MachineInstr *MI, unsigned Reg, return MI->killsRegister(Reg); } -/// isKilled - Test if the given register value, which is used by the given +/// Test if the given register value, which is used by the given /// instruction, is killed by the given instruction. This looks through /// coalescable copies to see if the original value is potentially not killed. /// @@ -473,8 +470,8 @@ static bool isKilled(MachineInstr &MI, unsigned Reg, } } -/// isTwoAddrUse - Return true if the specified MI uses the specified register -/// as a two-address use. If so, return the destination register by reference. +/// Return true if the specified MI uses the specified register as a two-address +/// use. If so, return the destination register by reference. static bool isTwoAddrUse(MachineInstr &MI, unsigned Reg, unsigned &DstReg) { for (unsigned i = 0, NumOps = MI.getNumOperands(); i != NumOps; ++i) { const MachineOperand &MO = MI.getOperand(i); @@ -489,8 +486,8 @@ static bool isTwoAddrUse(MachineInstr &MI, unsigned Reg, unsigned &DstReg) { return false; } -/// findOnlyInterestingUse - Given a register, if has a single in-basic block -/// use, return the use instruction if it's a copy or a two-address use. +/// Given a register, if it has a single in-basic block use, return the use +/// instruction if it's a copy or a two-address use. static MachineInstr *findOnlyInterestingUse(unsigned Reg, MachineBasicBlock *MBB, MachineRegisterInfo *MRI, @@ -517,8 +514,8 @@ MachineInstr *findOnlyInterestingUse(unsigned Reg, MachineBasicBlock *MBB, return nullptr; } -/// getMappedReg - Return the physical register the specified virtual register -/// might be mapped to. +/// Return the physical register the specified virtual register might be mapped +/// to. static unsigned getMappedReg(unsigned Reg, DenseMap &RegMap) { while (TargetRegisterInfo::isVirtualRegister(Reg)) { @@ -532,8 +529,7 @@ getMappedReg(unsigned Reg, DenseMap &RegMap) { return 0; } -/// regsAreCompatible - Return true if the two registers are equal or aliased. -/// +/// Return true if the two registers are equal or aliased. static bool regsAreCompatible(unsigned RegA, unsigned RegB, const TargetRegisterInfo *TRI) { if (RegA == RegB) @@ -544,8 +540,8 @@ regsAreCompatible(unsigned RegA, unsigned RegB, const TargetRegisterInfo *TRI) { } -/// isProfitableToCommute - Return true if it's potentially profitable to commute -/// the two-address instruction that's being processed. +/// Return true if it's potentially profitable to commute the two-address +/// instruction that's being processed. bool TwoAddressInstructionPass:: isProfitableToCommute(unsigned regA, unsigned regB, unsigned regC, @@ -643,9 +639,8 @@ isProfitableToCommute(unsigned regA, unsigned regB, unsigned regC, return LastDefB && LastDefC && LastDefC > LastDefB; } -/// commuteInstruction - Commute a two-address instruction and update the basic -/// block, distance map, and live variables if needed. Return true if it is -/// successful. +/// Commute a two-address instruction and update the basic block, distance map, +/// and live variables if needed. Return true if it is successful.
bool TwoAddressInstructionPass::commuteInstruction(MachineInstr *MI, unsigned RegBIdx, unsigned RegCIdx, @@ -674,8 +669,8 @@ bool TwoAddressInstructionPass::commuteInstruction(MachineInstr *MI, return true; } -/// isProfitableToConv3Addr - Return true if it is profitable to convert the -/// given 2-address instruction to a 3-address one. +/// Return true if it is profitable to convert the given 2-address instruction +/// to a 3-address one. bool TwoAddressInstructionPass::isProfitableToConv3Addr(unsigned RegA,unsigned RegB){ // Look for situations like this: @@ -691,8 +686,8 @@ TwoAddressInstructionPass::isProfitableToConv3Addr(unsigned RegA,unsigned RegB){ return (ToRegA && !regsAreCompatible(FromRegB, ToRegA, TRI)); } -/// convertInstTo3Addr - Convert the specified two-address instruction into a -/// three address one. Return true if this transformation was successful. +/// Convert the specified two-address instruction into a three address one. +/// Return true if this transformation was successful. bool TwoAddressInstructionPass::convertInstTo3Addr(MachineBasicBlock::iterator &mi, MachineBasicBlock::iterator &nmi, @@ -733,8 +728,8 @@ TwoAddressInstructionPass::convertInstTo3Addr(MachineBasicBlock::iterator &mi, return true; } -/// scanUses - Scan forward recursively for only uses, update maps if the use -/// is a copy or a two-address instruction. +/// Scan forward recursively for only uses, update maps if the use is a copy or +/// a two-address instruction. void TwoAddressInstructionPass::scanUses(unsigned DstReg) { SmallVector VirtRegPairs; @@ -780,8 +775,8 @@ TwoAddressInstructionPass::scanUses(unsigned DstReg) { } } -/// processCopy - If the specified instruction is not yet processed, process it -/// if it's a copy. For a copy instruction, we find the physical registers the +/// If the specified instruction is not yet processed, process it if it's a +/// copy. For a copy instruction, we find the physical registers the /// source and destination registers might be mapped to. These are kept in /// point-to maps used to determine future optimizations. e.g. /// v1024 = mov r0 @@ -816,9 +811,9 @@ void TwoAddressInstructionPass::processCopy(MachineInstr *MI) { return; } -/// rescheduleMIBelowKill - If there is one more local instruction that reads -/// 'Reg' and it kills 'Reg, consider moving the instruction below the kill -/// instruction in order to eliminate the need for the copy. +/// If there is one more local instruction that reads 'Reg' and it kills 'Reg', +/// consider moving the instruction below the kill instruction in order to +/// eliminate the need for the copy. bool TwoAddressInstructionPass:: rescheduleMIBelowKill(MachineBasicBlock::iterator &mi, MachineBasicBlock::iterator &nmi, @@ -874,8 +869,8 @@ rescheduleMIBelowKill(MachineBasicBlock::iterator &mi, SmallSet Uses; SmallSet Kills; SmallSet Defs; - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); + for (const MachineOperand &MO : MI->operands()) { if (!MO.isReg()) continue; unsigned MOReg = MO.getReg(); @@ -917,8 +911,7 @@ rescheduleMIBelowKill(MachineBasicBlock::iterator &mi, OtherMI->isBranch() || OtherMI->isTerminator()) // Don't move pass calls, etc.
return false; - for (unsigned i = 0, e = OtherMI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = OtherMI->getOperand(i); + for (const MachineOperand &MO : OtherMI->operands()) { if (!MO.isReg()) continue; unsigned MOReg = MO.getReg(); @@ -987,8 +980,8 @@ rescheduleMIBelowKill(MachineBasicBlock::iterator &mi, return true; } -/// isDefTooClose - Return true if the re-scheduling will put the given -/// instruction too close to the defs of its register dependencies. +/// Return true if the re-scheduling will put the given instruction too close +/// to the defs of its register dependencies. bool TwoAddressInstructionPass::isDefTooClose(unsigned Reg, unsigned Dist, MachineInstr *MI) { for (MachineInstr &DefMI : MRI->def_instructions(Reg)) { @@ -1007,10 +1000,9 @@ bool TwoAddressInstructionPass::isDefTooClose(unsigned Reg, unsigned Dist, return false; } -/// rescheduleKillAboveMI - If there is one more local instruction that reads -/// 'Reg' and it kills 'Reg, consider moving the kill instruction above the -/// current two-address instruction in order to eliminate the need for the -/// copy. +/// If there is one more local instruction that reads 'Reg' and it kills 'Reg', +/// consider moving the kill instruction above the current two-address +/// instruction in order to eliminate the need for the copy. bool TwoAddressInstructionPass:: rescheduleKillAboveMI(MachineBasicBlock::iterator &mi, MachineBasicBlock::iterator &nmi, @@ -1058,8 +1050,7 @@ rescheduleKillAboveMI(MachineBasicBlock::iterator &mi, SmallSet Kills; SmallSet Defs; SmallSet LiveDefs; - for (unsigned i = 0, e = KillMI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = KillMI->getOperand(i); + for (const MachineOperand &MO : KillMI->operands()) { if (!MO.isReg()) continue; unsigned MOReg = MO.getReg(); @@ -1097,8 +1088,7 @@ rescheduleKillAboveMI(MachineBasicBlock::iterator &mi, // Don't move pass calls, etc. return false; SmallVector OtherDefs; - for (unsigned i = 0, e = OtherMI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = OtherMI->getOperand(i); + for (const MachineOperand &MO : OtherMI->operands()) { if (!MO.isReg()) continue; unsigned MOReg = MO.getReg(); @@ -1181,7 +1171,7 @@ bool TwoAddressInstructionPass::tryInstructionCommute(MachineInstr *MI, unsigned OtherOpIdx = MI->getDesc().getNumDefs(); for (; OtherOpIdx < OpsNum; OtherOpIdx++) { // The call of findCommutedOpIndices below only checks if BaseOpIdx - // and OtherOpIdx are commutable, it does not really searches for + // and OtherOpIdx are commutable, it does not really search for // other commutable operands and does not change the values of passed // variables. if (OtherOpIdx == BaseOpIdx || @@ -1213,13 +1203,13 @@ bool TwoAddressInstructionPass::tryInstructionCommute(MachineInstr *MI, return false; } -/// tryInstructionTransform - For the case where an instruction has a single -/// pair of tied register operands, attempt some transformations that may -/// either eliminate the tied operands or improve the opportunities for -/// coalescing away the register copy. Returns true if no copy needs to be -/// inserted to untie mi's operands (either because they were untied, or -/// because mi was rescheduled, and will be visited again later). If the -/// shouldOnlyCommute flag is true, only instruction commutation is attempted.
+/// For the case where an instruction has a single pair of tied register +/// operands, attempt some transformations that may either eliminate the tied +/// operands or improve the opportunities for coalescing away the register copy. +/// Returns true if no copy needs to be inserted to untie mi's operands +/// (either because they were untied, or because mi was rescheduled, and will +/// be visited again later). If the shouldOnlyCommute flag is true, only +/// instruction commutation is attempted. bool TwoAddressInstructionPass:: tryInstructionTransform(MachineBasicBlock::iterator &mi, MachineBasicBlock::iterator &nmi, @@ -1570,8 +1560,7 @@ TwoAddressInstructionPass::processTiedPairs(MachineInstr *MI, if (AllUsesCopied) { if (!IsEarlyClobber) { // Replace other (un-tied) uses of regB with LastCopiedReg. - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI->getOperand(i); + for (MachineOperand &MO : MI->operands()) { if (MO.isReg() && MO.getReg() == RegB && MO.getSubReg() == SubRegB && MO.isUse()) { if (MO.isKill()) { @@ -1608,8 +1597,7 @@ TwoAddressInstructionPass::processTiedPairs(MachineInstr *MI, // regB is still used in this instruction, but a kill flag was // removed from a different tied use of regB, so now we need to add // a kill flag to one of the remaining uses of regB. - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI->getOperand(i); + for (MachineOperand &MO : MI->operands()) { if (MO.isReg() && MO.getReg() == RegB && MO.isUse()) { MO.setIsKill(true); break; @@ -1618,8 +1606,7 @@ TwoAddressInstructionPass::processTiedPairs(MachineInstr *MI, } } -/// runOnMachineFunction - Reduce two-address instructions to two operands. -/// +/// Reduce two-address instructions to two operands. 
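Several hunks in this file rewrite getNumOperands()/getOperand(i) index loops as range-based for loops over MI->operands(). A standalone sketch of that rewrite against toy stand-in types (Operand and Instr below are made up; the real MachineInstr and MachineOperand live under include/llvm/CodeGen/):

    #include <vector>

    // Toy stand-ins, just to show the loop shape.
    struct Operand { bool IsReg; unsigned Reg; };

    struct Instr {
      std::vector<Operand> Ops;
      // Range accessor analogous to MachineInstr::operands().
      const std::vector<Operand> &operands() const { return Ops; }
      unsigned getNumOperands() const { return Ops.size(); }
      const Operand &getOperand(unsigned i) const { return Ops[i]; }
    };

    unsigned countRegOperands(const Instr &MI) {
      unsigned N = 0;
      // Before: for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
      //           const Operand &MO = MI.getOperand(i); ... }
      for (const Operand &MO : MI.operands())  // after: iterate operands directly
        if (MO.IsReg)
          ++N;
      return N;
    }

    int main() {
      Instr MI{{{true, 5}, {false, 0}, {true, 7}}};
      return countRegOperands(MI) == 2 ? 0 : 1;
    }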
bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &Func) { MF = &Func; const TargetMachine &TM = MF->getTarget(); diff --git a/lib/CodeGen/WinEHPrepare.cpp b/lib/CodeGen/WinEHPrepare.cpp index f3f4e3be389e..dee4b870434e 100644 --- a/lib/CodeGen/WinEHPrepare.cpp +++ b/lib/CodeGen/WinEHPrepare.cpp @@ -19,7 +19,7 @@ #include "llvm/CodeGen/Passes.h" #include "llvm/ADT/SetVector.h" #include "llvm/Analysis/CFG.h" -#include "llvm/Analysis/LibCallSemantics.h" +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/CodeGen/WinEHFuncInfo.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" diff --git a/lib/DebugInfo/DWARF/DWARFContext.cpp b/lib/DebugInfo/DWARF/DWARFContext.cpp index 2165d353ba09..a4195b75c47d 100644 --- a/lib/DebugInfo/DWARF/DWARFContext.cpp +++ b/lib/DebugInfo/DWARF/DWARFContext.cpp @@ -163,20 +163,12 @@ void DWARFContext::dump(raw_ostream &OS, DIDumpType DumpType) { if (DumpType == DIDT_All || DumpType == DIDT_CUIndex) { OS << "\n.debug_cu_index contents:\n"; - DataExtractor CUIndexData(getCUIndexSection(), isLittleEndian(), - savedAddressByteSize); - DWARFUnitIndex CUIndex; - if (CUIndex.parse(CUIndexData)) - CUIndex.dump(OS); + getCUIndex().dump(OS); } if (DumpType == DIDT_All || DumpType == DIDT_TUIndex) { OS << "\n.debug_tu_index contents:\n"; - DataExtractor TUIndexData(getTUIndexSection(), isLittleEndian(), - savedAddressByteSize); - DWARFUnitIndex TUIndex; - if (TUIndex.parse(TUIndexData)) - TUIndex.dump(OS); + getTUIndex().dump(OS); } if (DumpType == DIDT_All || DumpType == DIDT_LineDwo) { @@ -280,7 +272,7 @@ const DWARFUnitIndex &DWARFContext::getCUIndex() { DataExtractor CUIndexData(getCUIndexSection(), isLittleEndian(), 0); - CUIndex = llvm::make_unique(); + CUIndex = llvm::make_unique(DW_SECT_INFO); CUIndex->parse(CUIndexData); return *CUIndex; } @@ -291,7 +283,7 @@ const DWARFUnitIndex &DWARFContext::getTUIndex() { DataExtractor TUIndexData(getTUIndexSection(), isLittleEndian(), 0); - TUIndex = llvm::make_unique(); + TUIndex = llvm::make_unique(DW_SECT_TYPES); TUIndex->parse(TUIndexData); return *TUIndex; } diff --git a/lib/DebugInfo/DWARF/DWARFUnitIndex.cpp b/lib/DebugInfo/DWARF/DWARFUnitIndex.cpp index baefed30f368..1f1921649b57 100644 --- a/lib/DebugInfo/DWARF/DWARFUnitIndex.cpp +++ b/lib/DebugInfo/DWARF/DWARFUnitIndex.cpp @@ -30,6 +30,18 @@ void DWARFUnitIndex::Header::dump(raw_ostream &OS) const { } bool DWARFUnitIndex::parse(DataExtractor IndexData) { + bool b = parseImpl(IndexData); + if (!b) { + // Make sure we don't try to dump anything + Header.NumBuckets = 0; + // Release any partially initialized data. 
+ ColumnKinds.reset(); + Rows.reset(); + } + return b; +} + +bool DWARFUnitIndex::parseImpl(DataExtractor IndexData) { uint32_t Offset = 0; if (!Header.parse(IndexData, &Offset)) return false; @@ -62,7 +74,7 @@ bool DWARFUnitIndex::parse(DataExtractor IndexData) { // Read the Column Headers for (unsigned i = 0; i != Header.NumColumns; ++i) { ColumnKinds[i] = static_cast(IndexData.getU32(&Offset)); - if (ColumnKinds[i] == DW_SECT_INFO || ColumnKinds[i] == DW_SECT_TYPES) { + if (ColumnKinds[i] == InfoColumnKind) { if (InfoColumn != -1) return false; InfoColumn = i; @@ -107,6 +119,9 @@ StringRef DWARFUnitIndex::getColumnHeader(DWARFSectionKind DS) { } void DWARFUnitIndex::dump(raw_ostream &OS) const { + if (!Header.NumBuckets) + return; + Header.dump(OS); OS << "Index Signature "; for (unsigned i = 0; i != Header.NumColumns; ++i) diff --git a/lib/Fuzzer/FuzzerLoop.cpp b/lib/Fuzzer/FuzzerLoop.cpp index 7aea169b7ae9..9c52a4dbe774 100644 --- a/lib/Fuzzer/FuzzerLoop.cpp +++ b/lib/Fuzzer/FuzzerLoop.cpp @@ -238,7 +238,11 @@ void Fuzzer::RunOneAndUpdateCorpus(Unit &U) { } void Fuzzer::ExecuteCallback(const Unit &U) { - int Res = USF.TargetFunction(U.data(), U.size()); + const uint8_t *Data = U.data(); + uint8_t EmptyData; + if (!Data) + Data = &EmptyData; + int Res = USF.TargetFunction(Data, U.size()); (void)Res; assert(Res == 0); } diff --git a/lib/Fuzzer/test/CMakeLists.txt b/lib/Fuzzer/test/CMakeLists.txt index 1e02af149ad8..85e8706f11cc 100644 --- a/lib/Fuzzer/test/CMakeLists.txt +++ b/lib/Fuzzer/test/CMakeLists.txt @@ -38,6 +38,9 @@ set(UninstrumentedTests UninstrumentedTest ) +set(TraceBBTests + SimpleTest + ) set(TestBinaries) @@ -99,6 +102,11 @@ foreach(Test ${UninstrumentedTests}) set(TestBinaries ${TestBinaries} LLVMFuzzer-${Test}-Uninstrumented) endforeach() +add_subdirectory(trace-bb) + +foreach(Test ${TraceBBTests}) + set(TestBinaries ${TestBinaries} LLVMFuzzer-${Test}-TraceBB) +endforeach() set_target_properties(${TestBinaries} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} diff --git a/lib/Fuzzer/test/SimpleTest.cpp b/lib/Fuzzer/test/SimpleTest.cpp index 6811d115d960..04225a889f5d 100644 --- a/lib/Fuzzer/test/SimpleTest.cpp +++ b/lib/Fuzzer/test/SimpleTest.cpp @@ -1,4 +1,5 @@ // Simple test for a fuzzer. The fuzzer must find the string "Hi!". +#include #include #include #include @@ -7,6 +8,7 @@ static volatile int Sink; extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) { + assert(Data); if (Size > 0 && Data[0] == 'H') { Sink = 1; if (Size > 1 && Data[1] == 'i') { diff --git a/lib/Fuzzer/test/trace-bb/CMakeLists.txt b/lib/Fuzzer/test/trace-bb/CMakeLists.txt new file mode 100644 index 000000000000..99af019565b5 --- /dev/null +++ b/lib/Fuzzer/test/trace-bb/CMakeLists.txt @@ -0,0 +1,14 @@ +# These tests are not instrumented with coverage. + +set(CMAKE_CXX_FLAGS_RELEASE + "${LIBFUZZER_FLAGS_BASE} -fsanitize-coverage=edge,trace-bb") + +foreach(Test ${TraceBBTests}) + add_executable(LLVMFuzzer-${Test}-TraceBB + ../${Test}.cpp + ) + target_link_libraries(LLVMFuzzer-${Test}-TraceBB + LLVMFuzzer + ) +endforeach() + diff --git a/lib/IR/Attributes.cpp b/lib/IR/Attributes.cpp index bdefe5917fef..e9626ba7e3bc 100644 --- a/lib/IR/Attributes.cpp +++ b/lib/IR/Attributes.cpp @@ -770,6 +770,36 @@ AttributeSet AttributeSet::addAttribute(LLVMContext &C, unsigned Index, return addAttributes(C, Index, AttributeSet::get(C, Index, B)); } +AttributeSet AttributeSet::addAttribute(LLVMContext &C, + ArrayRef Indices, + Attribute A) const { + unsigned I = 0, E = pImpl ? 
pImpl->getNumAttributes() : 0; + auto IdxI = Indices.begin(), IdxE = Indices.end(); + SmallVector AttrSet; + + while (I != E && IdxI != IdxE) { + if (getSlotIndex(I) < *IdxI) + AttrSet.emplace_back(getSlotAttributes(I++)); + else if (getSlotIndex(I) > *IdxI) + AttrSet.emplace_back(AttributeSet::get(C, std::make_pair(*IdxI++, A))); + else { + AttrBuilder B(getSlotAttributes(I), *IdxI); + B.addAttribute(A); + AttrSet.emplace_back(AttributeSet::get(C, *IdxI, B)); + ++I; + ++IdxI; + } + } + + while (I != E) + AttrSet.emplace_back(getSlotAttributes(I++)); + + while (IdxI != IdxE) + AttrSet.emplace_back(AttributeSet::get(C, std::make_pair(*IdxI++, A))); + + return get(C, AttrSet); +} + AttributeSet AttributeSet::addAttributes(LLVMContext &C, unsigned Index, AttributeSet Attrs) const { if (!pImpl) return Attrs; diff --git a/lib/IR/Constants.cpp b/lib/IR/Constants.cpp index d79fb3e7b0ba..b4a07a1b6b4a 100644 --- a/lib/IR/Constants.cpp +++ b/lib/IR/Constants.cpp @@ -857,6 +857,57 @@ static bool rangeOnlyContains(ItTy Start, ItTy End, EltTy Elt) { return true; } +template +static Constant *getIntSequenceIfElementsMatch(ArrayRef V) { + assert(!V.empty() && "Cannot get empty int sequence."); + + SmallVector Elts; + for (Constant *C : V) + if (auto *CI = dyn_cast(C)) + Elts.push_back(CI->getZExtValue()); + else + return nullptr; + return SequentialTy::get(V[0]->getContext(), Elts); +} + +template +static Constant *getFPSequenceIfElementsMatch(ArrayRef V) { + assert(!V.empty() && "Cannot get empty FP sequence."); + + SmallVector Elts; + for (Constant *C : V) + if (auto *CFP = dyn_cast(C)) + Elts.push_back(CFP->getValueAPF().bitcastToAPInt().getLimitedValue()); + else + return nullptr; + return SequentialTy::getFP(V[0]->getContext(), Elts); +} + +template +static Constant *getSequenceIfElementsMatch(Constant *C, + ArrayRef V) { + // We speculatively build the elements here even if it turns out that there is + // a constantexpr or something else weird, since it is so uncommon for that to + // happen. + if (ConstantInt *CI = dyn_cast(C)) { + if (CI->getType()->isIntegerTy(8)) + return getIntSequenceIfElementsMatch(V); + else if (CI->getType()->isIntegerTy(16)) + return getIntSequenceIfElementsMatch(V); + else if (CI->getType()->isIntegerTy(32)) + return getIntSequenceIfElementsMatch(V); + else if (CI->getType()->isIntegerTy(64)) + return getIntSequenceIfElementsMatch(V); + } else if (ConstantFP *CFP = dyn_cast(C)) { + if (CFP->getType()->isFloatTy()) + return getFPSequenceIfElementsMatch(V); + else if (CFP->getType()->isDoubleTy()) + return getFPSequenceIfElementsMatch(V); + } + + return nullptr; +} + ConstantArray::ConstantArray(ArrayType *T, ArrayRef V) : Constant(T, ConstantArrayVal, OperandTraits::op_end(this) - V.size(), @@ -874,6 +925,7 @@ Constant *ConstantArray::get(ArrayType *Ty, ArrayRef V) { return C; return Ty->getContext().pImpl->ArrayConstants.getOrCreate(Ty, V); } + Constant *ConstantArray::getImpl(ArrayType *Ty, ArrayRef V) { // Empty arrays are canonicalized to ConstantAggregateZero. if (V.empty()) @@ -896,74 +948,8 @@ Constant *ConstantArray::getImpl(ArrayType *Ty, ArrayRef V) { // Check to see if all of the elements are ConstantFP or ConstantInt and if // the element type is compatible with ConstantDataVector. If so, use it. - if (ConstantDataSequential::isElementTypeCompatible(C->getType())) { - // We speculatively build the elements here even if it turns out that there - // is a constantexpr or something else weird in the array, since it is so - // uncommon for that to happen. 
- if (ConstantInt *CI = dyn_cast(C)) { - if (CI->getType()->isIntegerTy(8)) { - SmallVector Elts; - for (unsigned i = 0, e = V.size(); i != e; ++i) - if (ConstantInt *CI = dyn_cast(V[i])) - Elts.push_back(CI->getZExtValue()); - else - break; - if (Elts.size() == V.size()) - return ConstantDataArray::get(C->getContext(), Elts); - } else if (CI->getType()->isIntegerTy(16)) { - SmallVector Elts; - for (unsigned i = 0, e = V.size(); i != e; ++i) - if (ConstantInt *CI = dyn_cast(V[i])) - Elts.push_back(CI->getZExtValue()); - else - break; - if (Elts.size() == V.size()) - return ConstantDataArray::get(C->getContext(), Elts); - } else if (CI->getType()->isIntegerTy(32)) { - SmallVector Elts; - for (unsigned i = 0, e = V.size(); i != e; ++i) - if (ConstantInt *CI = dyn_cast(V[i])) - Elts.push_back(CI->getZExtValue()); - else - break; - if (Elts.size() == V.size()) - return ConstantDataArray::get(C->getContext(), Elts); - } else if (CI->getType()->isIntegerTy(64)) { - SmallVector Elts; - for (unsigned i = 0, e = V.size(); i != e; ++i) - if (ConstantInt *CI = dyn_cast(V[i])) - Elts.push_back(CI->getZExtValue()); - else - break; - if (Elts.size() == V.size()) - return ConstantDataArray::get(C->getContext(), Elts); - } - } - - if (ConstantFP *CFP = dyn_cast(C)) { - if (CFP->getType()->isFloatTy()) { - SmallVector Elts; - for (unsigned i = 0, e = V.size(); i != e; ++i) - if (ConstantFP *CFP = dyn_cast(V[i])) - Elts.push_back( - CFP->getValueAPF().bitcastToAPInt().getLimitedValue()); - else - break; - if (Elts.size() == V.size()) - return ConstantDataArray::getFP(C->getContext(), Elts); - } else if (CFP->getType()->isDoubleTy()) { - SmallVector Elts; - for (unsigned i = 0, e = V.size(); i != e; ++i) - if (ConstantFP *CFP = dyn_cast(V[i])) - Elts.push_back( - CFP->getValueAPF().bitcastToAPInt().getLimitedValue()); - else - break; - if (Elts.size() == V.size()) - return ConstantDataArray::getFP(C->getContext(), Elts); - } - } - } + if (ConstantDataSequential::isElementTypeCompatible(C->getType())) + return getSequenceIfElementsMatch(C, V); // Otherwise, we really do want to create a ConstantArray. return nullptr; @@ -1059,6 +1045,7 @@ Constant *ConstantVector::get(ArrayRef V) { VectorType *Ty = VectorType::get(V.front()->getType(), V.size()); return Ty->getContext().pImpl->VectorConstants.getOrCreate(Ty, V); } + Constant *ConstantVector::getImpl(ArrayRef V) { assert(!V.empty() && "Vectors can't be empty"); VectorType *T = VectorType::get(V.front()->getType(), V.size()); @@ -1084,74 +1071,8 @@ Constant *ConstantVector::getImpl(ArrayRef V) { // Check to see if all of the elements are ConstantFP or ConstantInt and if // the element type is compatible with ConstantDataVector. If so, use it. - if (ConstantDataSequential::isElementTypeCompatible(C->getType())) { - // We speculatively build the elements here even if it turns out that there - // is a constantexpr or something else weird in the array, since it is so - // uncommon for that to happen. 
- if (ConstantInt *CI = dyn_cast(C)) { - if (CI->getType()->isIntegerTy(8)) { - SmallVector Elts; - for (unsigned i = 0, e = V.size(); i != e; ++i) - if (ConstantInt *CI = dyn_cast(V[i])) - Elts.push_back(CI->getZExtValue()); - else - break; - if (Elts.size() == V.size()) - return ConstantDataVector::get(C->getContext(), Elts); - } else if (CI->getType()->isIntegerTy(16)) { - SmallVector Elts; - for (unsigned i = 0, e = V.size(); i != e; ++i) - if (ConstantInt *CI = dyn_cast(V[i])) - Elts.push_back(CI->getZExtValue()); - else - break; - if (Elts.size() == V.size()) - return ConstantDataVector::get(C->getContext(), Elts); - } else if (CI->getType()->isIntegerTy(32)) { - SmallVector Elts; - for (unsigned i = 0, e = V.size(); i != e; ++i) - if (ConstantInt *CI = dyn_cast(V[i])) - Elts.push_back(CI->getZExtValue()); - else - break; - if (Elts.size() == V.size()) - return ConstantDataVector::get(C->getContext(), Elts); - } else if (CI->getType()->isIntegerTy(64)) { - SmallVector Elts; - for (unsigned i = 0, e = V.size(); i != e; ++i) - if (ConstantInt *CI = dyn_cast(V[i])) - Elts.push_back(CI->getZExtValue()); - else - break; - if (Elts.size() == V.size()) - return ConstantDataVector::get(C->getContext(), Elts); - } - } - - if (ConstantFP *CFP = dyn_cast(C)) { - if (CFP->getType()->isFloatTy()) { - SmallVector Elts; - for (unsigned i = 0, e = V.size(); i != e; ++i) - if (ConstantFP *CFP = dyn_cast(V[i])) - Elts.push_back( - CFP->getValueAPF().bitcastToAPInt().getLimitedValue()); - else - break; - if (Elts.size() == V.size()) - return ConstantDataVector::getFP(C->getContext(), Elts); - } else if (CFP->getType()->isDoubleTy()) { - SmallVector Elts; - for (unsigned i = 0, e = V.size(); i != e; ++i) - if (ConstantFP *CFP = dyn_cast(V[i])) - Elts.push_back( - CFP->getValueAPF().bitcastToAPInt().getLimitedValue()); - else - break; - if (Elts.size() == V.size()) - return ConstantDataVector::getFP(C->getContext(), Elts); - } - } - } + if (ConstantDataSequential::isElementTypeCompatible(C->getType())) + return getSequenceIfElementsMatch(C, V); // Otherwise, the element type isn't compatible with ConstantDataVector, or // the operand list constants a ConstantExpr or something else strange. diff --git a/lib/IR/Function.cpp b/lib/IR/Function.cpp index 97e2a84a5cef..5e4d2d2054eb 100644 --- a/lib/IR/Function.cpp +++ b/lib/IR/Function.cpp @@ -411,12 +411,14 @@ void Function::clearGC() { } } -/// copyAttributesFrom - copy all additional attributes (those not needed to -/// create a Function) from the Function Src to this one. +/// Copy all additional attributes (those not needed to create a Function) from +/// the Function Src to this one. 
void Function::copyAttributesFrom(const GlobalValue *Src) { - assert(isa(Src) && "Expected a Function!"); GlobalObject::copyAttributesFrom(Src); - const Function *SrcF = cast(Src); + const Function *SrcF = dyn_cast(Src); + if (!SrcF) + return; + setCallingConv(SrcF->getCallingConv()); setAttributes(SrcF->getAttributes()); if (SrcF->hasGC()) diff --git a/lib/IR/Globals.cpp b/lib/IR/Globals.cpp index 8fde4b8e9d77..c538c7baa1fe 100644 --- a/lib/IR/Globals.cpp +++ b/lib/IR/Globals.cpp @@ -97,10 +97,11 @@ void GlobalObject::setGlobalObjectSubClassData(unsigned Val) { } void GlobalObject::copyAttributesFrom(const GlobalValue *Src) { - const auto *GV = cast(Src); - GlobalValue::copyAttributesFrom(GV); - setAlignment(GV->getAlignment()); - setSection(GV->getSection()); + GlobalValue::copyAttributesFrom(Src); + if (const auto *GV = dyn_cast(Src)) { + setAlignment(GV->getAlignment()); + setSection(GV->getSection()); + } } const char *GlobalValue::getSection() const { @@ -216,14 +217,14 @@ void GlobalVariable::setInitializer(Constant *InitVal) { } } -/// copyAttributesFrom - copy all additional attributes (those not needed to -/// create a GlobalVariable) from the GlobalVariable Src to this one. +/// Copy all additional attributes (those not needed to create a GlobalVariable) +/// from the GlobalVariable Src to this one. void GlobalVariable::copyAttributesFrom(const GlobalValue *Src) { - assert(isa(Src) && "Expected a GlobalVariable!"); GlobalObject::copyAttributesFrom(Src); - const GlobalVariable *SrcVar = cast(Src); - setThreadLocalMode(SrcVar->getThreadLocalMode()); - setExternallyInitialized(SrcVar->isExternallyInitialized()); + if (const GlobalVariable *SrcVar = dyn_cast(Src)) { + setThreadLocalMode(SrcVar->getThreadLocalMode()); + setExternallyInitialized(SrcVar->isExternallyInitialized()); + } } diff --git a/lib/IR/Verifier.cpp b/lib/IR/Verifier.cpp index 617d965f4cfc..5cbb597ca269 100644 --- a/lib/IR/Verifier.cpp +++ b/lib/IR/Verifier.cpp @@ -95,6 +95,12 @@ struct VerifierSupport { Write(&*I); } + void Write(const Module *M) { + if (!M) + return; + OS << "; ModuleID = '" << M->getModuleIdentifier() << "'\n"; + } + void Write(const Value *V) { if (!V) return; @@ -1721,7 +1727,8 @@ void Verifier::visitFunction(const Function &F) { auto *Per = dyn_cast(F.getPersonalityFn()->stripPointerCasts()); if (Per) Assert(Per->getParent() == F.getParent(), - "Referencing personality function in another module!", &F, Per); + "Referencing personality function in another module!", + &F, F.getParent(), Per, Per->getParent()); } if (F.isMaterializable()) { @@ -3165,7 +3172,7 @@ void Verifier::visitInstruction(Instruction &I) { " donothing or patchpoint", &I); Assert(F->getParent() == M, "Referencing function in another module!", - &I); + &I, M, F, F->getParent()); } else if (BasicBlock *OpBB = dyn_cast(I.getOperand(i))) { Assert(OpBB->getParent() == BB->getParent(), "Referring to a basic block in another function!", &I); @@ -3173,7 +3180,7 @@ void Verifier::visitInstruction(Instruction &I) { Assert(OpArg->getParent() == BB->getParent(), "Referring to an argument in another function!", &I); } else if (GlobalValue *GV = dyn_cast(I.getOperand(i))) { - Assert(GV->getParent() == M, "Referencing global in another module!", &I); + Assert(GV->getParent() == M, "Referencing global in another module!", &I, M, GV, GV->getParent()); } else if (isa(I.getOperand(i))) { verifyDominatesUse(I, i); } else if (isa(I.getOperand(i))) { diff --git a/lib/LTO/LTOCodeGenerator.cpp b/lib/LTO/LTOCodeGenerator.cpp index 
931bcf0d23fc..468ec24e3a06 100644 --- a/lib/LTO/LTOCodeGenerator.cpp +++ b/lib/LTO/LTOCodeGenerator.cpp @@ -67,14 +67,14 @@ const char* LTOCodeGenerator::getVersionString() { LTOCodeGenerator::LTOCodeGenerator() : Context(getGlobalContext()), MergedModule(new Module("ld-temp.o", Context)), - IRLinker(MergedModule.get()) { + IRLinker(new Linker(*MergedModule)) { initializeLTOPasses(); } LTOCodeGenerator::LTOCodeGenerator(std::unique_ptr Context) : OwnedContext(std::move(Context)), Context(*OwnedContext), MergedModule(new Module("ld-temp.o", *OwnedContext)), - IRLinker(MergedModule.get()) { + IRLinker(new Linker(*MergedModule)) { initializeLTOPasses(); } @@ -114,7 +114,7 @@ bool LTOCodeGenerator::addModule(LTOModule *Mod) { assert(&Mod->getModule().getContext() == &Context && "Expected module in same context"); - bool ret = IRLinker.linkInModule(&Mod->getModule()); + bool ret = IRLinker->linkInModule(Mod->getModule()); const std::vector &undefs = Mod->getAsmUndefinedRefs(); for (int i = 0, e = undefs.size(); i != e; ++i) @@ -130,7 +130,7 @@ void LTOCodeGenerator::setModule(std::unique_ptr Mod) { AsmUndefinedRefs.clear(); MergedModule = Mod->takeModule(); - IRLinker.setModule(MergedModule.get()); + IRLinker = make_unique(*MergedModule); const std::vector &Undefs = Mod->getAsmUndefinedRefs(); for (int I = 0, E = Undefs.size(); I != E; ++I) diff --git a/lib/Linker/LinkModules.cpp b/lib/Linker/LinkModules.cpp index 6b60379803e1..67613967f490 100644 --- a/lib/Linker/LinkModules.cpp +++ b/lib/Linker/LinkModules.cpp @@ -13,28 +13,18 @@ #include "llvm/Linker/Linker.h" #include "llvm-c/Linker.h" -#include "llvm/ADT/Hashing.h" -#include "llvm/ADT/Optional.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallString.h" -#include "llvm/ADT/Statistic.h" #include "llvm/ADT/Triple.h" #include "llvm/IR/Constants.h" -#include "llvm/IR/DebugInfo.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/DiagnosticPrinter.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/IR/TypeFinder.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/Cloning.h" -#include -#include using namespace llvm; - //===----------------------------------------------------------------------===// // TypeMap implementation. //===----------------------------------------------------------------------===// @@ -42,22 +32,22 @@ using namespace llvm; namespace { class TypeMapTy : public ValueMapTypeRemapper { /// This is a mapping from a source type to a destination type to use. - DenseMap MappedTypes; + DenseMap MappedTypes; /// When checking to see if two subgraphs are isomorphic, we speculatively /// add types to MappedTypes, but keep track of them here in case we need to /// roll back. - SmallVector SpeculativeTypes; + SmallVector SpeculativeTypes; - SmallVector SpeculativeDstOpaqueTypes; + SmallVector SpeculativeDstOpaqueTypes; /// This is a list of non-opaque structs in the source module that are mapped /// to an opaque struct in the destination module. - SmallVector SrcDefinitionsToResolve; + SmallVector SrcDefinitionsToResolve; /// This is the set of opaque types in the destination modules who are /// getting a body from the source module. - SmallPtrSet DstResolvedOpaqueTypes; + SmallPtrSet DstResolvedOpaqueTypes; public: TypeMapTy(Linker::IdentifiedStructTypeSet &DstStructTypesSet) @@ -179,7 +169,7 @@ bool TypeMapTy::areTypesIsomorphic(Type *DstTy, Type *SrcTy) { // Fail if any of the extra properties (e.g. 
array size) of the type disagree. if (isa(DstTy)) - return false; // bitwidth disagrees. + return false; // bitwidth disagrees. if (PointerType *PT = dyn_cast(DstTy)) { if (PT->getAddressSpace() != cast(SrcTy)->getAddressSpace()) return false; @@ -215,7 +205,7 @@ bool TypeMapTy::areTypesIsomorphic(Type *DstTy, Type *SrcTy) { } void TypeMapTy::linkDefinedTypeBodies() { - SmallVector Elements; + SmallVector Elements; for (StructType *SrcSTy : SrcDefinitionsToResolve) { StructType *DstSTy = cast(MappedTypes[SrcSTy]); assert(DstSTy->isOpaque()); @@ -390,7 +380,8 @@ void LinkDiagnosticInfo::print(DiagnosticPrinter &DP) const { DP << Msg; } /// This is an implementation class for the LinkModules function, which is the /// entrypoint for this file. class ModuleLinker { - Module *DstM, *SrcM; + Module &DstM; + Module &SrcM; TypeMapTy TypeMap; ValueMaterializerTy ValMaterializer; @@ -401,16 +392,7 @@ class ModuleLinker { /// but this allows us to reuse the ValueMapper code. ValueToValueMapTy ValueMap; - struct AppendingVarInfo { - GlobalVariable *NewGV; // New aggregate global in dest module. - const Constant *DstInit; // Old initializer from dest module. - const Constant *SrcInit; // Old initializer from src module. - }; - - std::vector AppendingVars; - - // Set of items not to link in from source. - SmallPtrSet DoNotLinkFromSource; + SetVector ValuesToLink; DiagnosticHandlerFunction DiagnosticHandler; @@ -423,28 +405,29 @@ class ModuleLinker { /// Function to import from source module, all other functions are /// imported as declarations instead of definitions. - Function *ImportFunction; + DenseSet *ImportFunction; /// Set to true if the given FunctionInfoIndex contains any functions /// from this source module, in which case we must conservatively assume /// that any of its functions may be imported into another module /// as part of a different backend compilation process. - bool HasExportedFunctions; + bool HasExportedFunctions = false; /// Set to true when all global value body linking is complete (including /// lazy linking). Used to prevent metadata linking from creating new /// references. - bool DoneLinkingBodies; + bool DoneLinkingBodies = false; + + bool HasError = false; public: - ModuleLinker(Module *dstM, Linker::IdentifiedStructTypeSet &Set, Module *srcM, + ModuleLinker(Module &DstM, Linker::IdentifiedStructTypeSet &Set, Module &SrcM, DiagnosticHandlerFunction DiagnosticHandler, unsigned Flags, const FunctionInfoIndex *Index = nullptr, - Function *FuncToImport = nullptr) - : DstM(dstM), SrcM(srcM), TypeMap(Set), ValMaterializer(this), + DenseSet *FunctionsToImport = nullptr) + : DstM(DstM), SrcM(SrcM), TypeMap(Set), ValMaterializer(this), DiagnosticHandler(DiagnosticHandler), Flags(Flags), ImportIndex(Index), - ImportFunction(FuncToImport), HasExportedFunctions(false), - DoneLinkingBodies(false) { + ImportFunction(FunctionsToImport) { assert((ImportIndex || !ImportFunction) && "Expect a FunctionInfoIndex when importing"); // If we have a FunctionInfoIndex but no function to import, @@ -469,20 +452,18 @@ class ModuleLinker { /// Handles cloning of a global values from the source module into /// the destination module, including setting the attributes and visibility. GlobalValue *copyGlobalValueProto(TypeMapTy &TypeMap, const GlobalValue *SGV, - const GlobalValue *DGV = nullptr); + const GlobalValue *DGV, bool ForDefinition); /// Check if we should promote the given local value to global scope. 
bool doPromoteLocalToGlobal(const GlobalValue *SGV); - /// Check if all global value body linking is complete. - bool doneLinkingBodies() { return DoneLinkingBodies; } - bool shouldLinkFromSource(bool &LinkFromSrc, const GlobalValue &Dest, const GlobalValue &Src); /// Helper method for setting a message and returning an error code. bool emitError(const Twine &Message) { DiagnosticHandler(LinkDiagnosticInfo(DS_Error, Message)); + HasError = true; return true; } @@ -490,7 +471,7 @@ class ModuleLinker { DiagnosticHandler(LinkDiagnosticInfo(DS_Warning, Message)); } - bool getComdatLeader(Module *M, StringRef ComdatName, + bool getComdatLeader(Module &M, StringRef ComdatName, const GlobalVariable *&GVar); bool computeResultingSelectionKind(StringRef ComdatName, Comdat::SelectionKind Src, @@ -513,7 +494,7 @@ class ModuleLinker { return nullptr; // Otherwise see if we have a match in the destination module's symtab. - GlobalValue *DGV = DstM->getNamedValue(getName(SrcGV)); + GlobalValue *DGV = DstM.getNamedValue(getName(SrcGV)); if (!DGV) return nullptr; @@ -531,18 +512,17 @@ class ModuleLinker { void upgradeMismatchedGlobalArray(StringRef Name); void upgradeMismatchedGlobals(); + bool linkIfNeeded(GlobalValue &GV); bool linkAppendingVarProto(GlobalVariable *DstGV, const GlobalVariable *SrcGV); bool linkGlobalValueProto(GlobalValue *GV); bool linkModuleFlagsMetadata(); - void linkAppendingVarInit(AppendingVarInfo &AVI); - void linkGlobalInit(GlobalVariable &Dst, GlobalVariable &Src); bool linkFunctionBody(Function &Dst, Function &Src); void linkAliasBody(GlobalAlias &Dst, GlobalAlias &Src); - bool linkGlobalValueBody(GlobalValue &Src); + bool linkGlobalValueBody(GlobalValue &Dst, GlobalValue &Src); /// Functions that take care of cloning a specific global value type /// into the destination module. @@ -601,10 +581,10 @@ static void forceRenaming(GlobalValue *GV, StringRef Name) { // If there is a conflict, rename the conflict. if (GlobalValue *ConflictGV = M->getNamedValue(Name)) { GV->takeName(ConflictGV); - ConflictGV->setName(Name); // This will cause ConflictGV to get renamed + ConflictGV->setName(Name); // This will cause ConflictGV to get renamed assert(ConflictGV->getName() != Name && "forceRenaming didn't work"); } else { - GV->setName(Name); // Force the name back + GV->setName(Name); // Force the name back } } @@ -612,18 +592,7 @@ static void forceRenaming(GlobalValue *GV, StringRef Name) { /// from the SrcGV to the DestGV. void ModuleLinker::copyGVAttributes(GlobalValue *NewGV, const GlobalValue *SrcGV) { - auto *GA = dyn_cast(SrcGV); - // Check for the special case of converting an alias (definition) to a - // non-alias (declaration). This can happen when we are importing and - // encounter a weak_any alias (weak_any defs may not be imported, see - // comments in ModuleLinker::getLinkage) or an alias whose base object is - // being imported as a declaration. In that case copy the attributes from the - // base object. - if (GA && !dyn_cast(NewGV)) { - assert(isPerformingImport() && !doImportAsDefinition(GA)); - NewGV->copyAttributesFrom(GA->getBaseObject()); - } else - NewGV->copyAttributesFrom(SrcGV); + NewGV->copyAttributesFrom(SrcGV); forceRenaming(NewGV, getName(SrcGV)); } @@ -651,7 +620,7 @@ bool ModuleLinker::doImportAsDefinition(const GlobalValue *SGV) { return true; // Only import the function requested for importing. auto *SF = dyn_cast(SGV); - if (SF && SF == ImportFunction) + if (SF && ImportFunction->count(SF)) return true; // Otherwise no. 
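A note on the interface change visible here: the importer is now driven by a set of requested globals (ImportFunction is a DenseSet pointer) and doImportAsDefinition simply tests membership. A caller-side sketch, purely illustrative (module and function names are made up; it assumes llvm/Linker/Linker.h and llvm/IR/Module.h, an already-loaded FunctionInfoIndex named Index, and the Linker::Flags spelling used by this revision):

    llvm::DenseSet<const llvm::GlobalValue *> FunctionsToImport;
    if (llvm::Function *F = SrcM.getFunction("foo"))
      FunctionsToImport.insert(F);
    if (llvm::Function *G = SrcM.getFunction("bar"))
      FunctionsToImport.insert(G);
    llvm::Linker L(DstM);
    // Returns true on error; several functions can now be imported in one pass.
    bool Failed = L.linkInModule(SrcM, llvm::Linker::Flags::None, &Index,
                                 &FunctionsToImport);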
return false; @@ -798,11 +767,12 @@ ModuleLinker::copyGlobalVariableProto(TypeMapTy &TypeMap, // No linking to be performed or linking from the source: simply create an // identical version of the symbol over in the dest module... the // initializer will be filled in later by LinkGlobalInits. - GlobalVariable *NewDGV = new GlobalVariable( - *DstM, TypeMap.get(SGVar->getType()->getElementType()), - SGVar->isConstant(), getLinkage(SGVar), /*init*/ nullptr, getName(SGVar), - /*insertbefore*/ nullptr, SGVar->getThreadLocalMode(), - SGVar->getType()->getAddressSpace()); + GlobalVariable *NewDGV = + new GlobalVariable(DstM, TypeMap.get(SGVar->getType()->getElementType()), + SGVar->isConstant(), GlobalValue::ExternalLinkage, + /*init*/ nullptr, getName(SGVar), + /*insertbefore*/ nullptr, SGVar->getThreadLocalMode(), + SGVar->getType()->getAddressSpace()); return NewDGV; } @@ -813,42 +783,18 @@ Function *ModuleLinker::copyFunctionProto(TypeMapTy &TypeMap, const Function *SF) { // If there is no linkage to be performed or we are linking from the source, // bring SF over. - return Function::Create(TypeMap.get(SF->getFunctionType()), getLinkage(SF), - getName(SF), DstM); + return Function::Create(TypeMap.get(SF->getFunctionType()), + GlobalValue::ExternalLinkage, getName(SF), &DstM); } /// Set up prototypes for any aliases that come over from the source module. GlobalValue *ModuleLinker::copyGlobalAliasProto(TypeMapTy &TypeMap, const GlobalAlias *SGA) { - // If we are importing and encounter a weak_any alias, or an alias to - // an object being imported as a declaration, we must import the alias - // as a declaration as well, which involves converting it to a non-alias. - // See comments in ModuleLinker::getLinkage for why we cannot import - // weak_any defintions. - if (isPerformingImport() && !doImportAsDefinition(SGA)) { - // Need to convert to declaration. All aliases must be definitions. - const GlobalValue *GVal = SGA->getBaseObject(); - GlobalValue *NewGV; - if (auto *GVar = dyn_cast(GVal)) - NewGV = copyGlobalVariableProto(TypeMap, GVar); - else { - auto *F = dyn_cast(GVal); - assert(F); - NewGV = copyFunctionProto(TypeMap, F); - } - // Set the linkage to External or ExternalWeak (see comments in - // ModuleLinker::getLinkage for why WeakAny is converted to ExternalWeak). - if (SGA->hasWeakAnyLinkage()) - NewGV->setLinkage(GlobalValue::ExternalWeakLinkage); - else - NewGV->setLinkage(GlobalValue::ExternalLinkage); - return NewGV; - } // If there is no linkage to be performed or we're linking from the source, // bring over SGA. 
auto *Ty = TypeMap.get(SGA->getValueType()); return GlobalAlias::create(Ty, SGA->getType()->getPointerAddressSpace(), - getLinkage(SGA), getName(SGA), DstM); + GlobalValue::ExternalLinkage, getName(SGA), &DstM); } static GlobalValue::VisibilityTypes @@ -876,14 +822,31 @@ void ModuleLinker::setVisibility(GlobalValue *NewGV, const GlobalValue *SGV, GlobalValue *ModuleLinker::copyGlobalValueProto(TypeMapTy &TypeMap, const GlobalValue *SGV, - const GlobalValue *DGV) { + const GlobalValue *DGV, + bool ForDefinition) { GlobalValue *NewGV; - if (auto *SGVar = dyn_cast(SGV)) + if (auto *SGVar = dyn_cast(SGV)) { NewGV = copyGlobalVariableProto(TypeMap, SGVar); - else if (auto *SF = dyn_cast(SGV)) + } else if (auto *SF = dyn_cast(SGV)) { NewGV = copyFunctionProto(TypeMap, SF); - else - NewGV = copyGlobalAliasProto(TypeMap, cast(SGV)); + } else { + if (ForDefinition) + NewGV = copyGlobalAliasProto(TypeMap, cast(SGV)); + else + NewGV = new GlobalVariable( + DstM, TypeMap.get(SGV->getType()->getElementType()), + /*isConstant*/ false, GlobalValue::ExternalLinkage, + /*init*/ nullptr, getName(SGV), + /*insertbefore*/ nullptr, SGV->getThreadLocalMode(), + SGV->getType()->getAddressSpace()); + } + + if (ForDefinition) + NewGV->setLinkage(getLinkage(SGV)); + else if (SGV->hasAvailableExternallyLinkage() || SGV->hasWeakLinkage() || + SGV->hasLinkOnceLinkage()) + NewGV->setLinkage(GlobalValue::ExternalWeakLinkage); + copyGVAttributes(NewGV, SGV); setVisibility(NewGV, SGV, DGV); return NewGV; @@ -898,22 +861,8 @@ Value *ModuleLinker::materializeDeclFor(Value *V) { if (!SGV) return nullptr; - // If we are done linking global value bodies (i.e. we are performing - // metadata linking), don't link in the global value due to this - // reference, simply map it to null. - if (doneLinkingBodies()) - return nullptr; - - GlobalValue *DGV = copyGlobalValueProto(TypeMap, SGV); - - if (Comdat *SC = SGV->getComdat()) { - if (auto *DGO = dyn_cast(DGV)) { - Comdat *DC = DstM->getOrInsertComdat(SC->getName()); - DGO->setComdat(DC); - } - } - - return DGV; + linkGlobalValueProto(SGV); + return ValueMap[SGV]; } void ValueMaterializerTy::materializeInitFor(GlobalValue *New, @@ -921,22 +870,39 @@ void ValueMaterializerTy::materializeInitFor(GlobalValue *New, return ModLinker->materializeInitFor(New, Old); } +static bool shouldLazyLink(const GlobalValue &GV) { + return GV.hasLocalLinkage() || GV.hasLinkOnceLinkage() || + GV.hasAvailableExternallyLinkage(); +} + void ModuleLinker::materializeInitFor(GlobalValue *New, GlobalValue *Old) { + if (auto *F = dyn_cast(New)) { + if (!F->isDeclaration()) + return; + } else if (auto *V = dyn_cast(New)) { + if (V->hasInitializer()) + return; + } else { + auto *A = cast(New); + if (A->getAliasee()) + return; + } + + if (Old->isDeclaration()) + return; + if (isPerformingImport() && !doImportAsDefinition(Old)) return; - // Skip declarations that ValueMaterializer may have created in - // case we link in only some of SrcM. 
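Worth spelling out the linkage rule introduced by the declaration-only path of copyGlobalValueProto above: a source symbol whose definition may legitimately never be materialized (available_externally, weak or linkonce) is declared extern_weak in the destination, so the reference is allowed to stay unresolved; everything else is declared external. A minimal sketch of that mapping (illustrative only, assumes llvm/IR/GlobalValue.h):

    llvm::GlobalValue::LinkageTypes declLinkageFor(const llvm::GlobalValue &SGV) {
      if (SGV.hasAvailableExternallyLinkage() || SGV.hasWeakLinkage() ||
          SGV.hasLinkOnceLinkage())
        return llvm::GlobalValue::ExternalWeakLinkage; // definition may never arrive
      return llvm::GlobalValue::ExternalLinkage;       // plain declaration otherwise
    }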
- if (shouldLinkOnlyNeeded() && Old->isDeclaration()) + if (!ValuesToLink.count(Old) && !shouldLazyLink(*Old)) return; - assert(!Old->isDeclaration() && "users should not pass down decls"); - linkGlobalValueBody(*Old); + linkGlobalValueBody(*New, *Old); } -bool ModuleLinker::getComdatLeader(Module *M, StringRef ComdatName, +bool ModuleLinker::getComdatLeader(Module &M, StringRef ComdatName, const GlobalVariable *&GVar) { - const GlobalValue *GVal = M->getNamedValue(ComdatName); + const GlobalValue *GVal = M.getNamedValue(ComdatName); if (const auto *GA = dyn_cast_or_null(GVal)) { GVal = GA->getBaseObject(); if (!GVal) @@ -995,8 +961,8 @@ bool ModuleLinker::computeResultingSelectionKind(StringRef ComdatName, getComdatLeader(SrcM, ComdatName, SrcGV)) return true; - const DataLayout &DstDL = DstM->getDataLayout(); - const DataLayout &SrcDL = SrcM->getDataLayout(); + const DataLayout &DstDL = DstM.getDataLayout(); + const DataLayout &SrcDL = SrcM.getDataLayout(); uint64_t DstSize = DstDL.getTypeAllocSize(DstGV->getType()->getPointerElementType()); uint64_t SrcSize = @@ -1028,7 +994,7 @@ bool ModuleLinker::getComdatResult(const Comdat *SrcC, bool &LinkFromSrc) { Comdat::SelectionKind SSK = SrcC->getSelectionKind(); StringRef ComdatName = SrcC->getName(); - Module::ComdatSymTabType &ComdatSymTab = DstM->getComdatSymbolTable(); + Module::ComdatSymTabType &ComdatSymTab = DstM.getComdatSymbolTable(); Module::ComdatSymTabType::iterator DstCI = ComdatSymTab.find(ComdatName); if (DstCI == ComdatSymTab.end()) { @@ -1069,7 +1035,7 @@ bool ModuleLinker::shouldLinkFromSource(bool &LinkFromSrc, if (isa(&Src)) { // For functions, LinkFromSrc iff this is the function requested // for importing. For variables, decide below normally. - LinkFromSrc = (&Src == ImportFunction); + LinkFromSrc = ImportFunction->count(&Src); return false; } @@ -1156,7 +1122,7 @@ bool ModuleLinker::shouldLinkFromSource(bool &LinkFromSrc, /// types 'Foo' but one got renamed when the module was loaded into the same /// LLVMContext. void ModuleLinker::computeTypeMapping() { - for (GlobalValue &SGV : SrcM->globals()) { + for (GlobalValue &SGV : SrcM.globals()) { GlobalValue *DGV = getLinkedToGlobal(&SGV); if (!DGV) continue; @@ -1172,12 +1138,12 @@ void ModuleLinker::computeTypeMapping() { TypeMap.addTypeMapping(DAT->getElementType(), SAT->getElementType()); } - for (GlobalValue &SGV : *SrcM) { + for (GlobalValue &SGV : SrcM) { if (GlobalValue *DGV = getLinkedToGlobal(&SGV)) TypeMap.addTypeMapping(DGV->getType(), SGV.getType()); } - for (GlobalValue &SGV : SrcM->aliases()) { + for (GlobalValue &SGV : SrcM.aliases()) { if (GlobalValue *DGV = getLinkedToGlobal(&SGV)) TypeMap.addTypeMapping(DGV->getType(), SGV.getType()); } @@ -1186,7 +1152,7 @@ void ModuleLinker::computeTypeMapping() { // At this point, the destination module may have a type "%foo = { i32 }" for // example. When the source module got loaded into the same LLVMContext, if // it had the same type, it would have been renamed to "%foo.42 = { i32 }". - std::vector Types = SrcM->getIdentifiedStructTypes(); + std::vector Types = SrcM.getIdentifiedStructTypes(); for (StructType *ST : Types) { if (!ST->hasName()) continue; @@ -1199,7 +1165,7 @@ void ModuleLinker::computeTypeMapping() { continue; // Check to see if the destination module has a struct with the prefix name. 
- StructType *DST = DstM->getTypeByName(ST->getName().substr(0, DotPos)); + StructType *DST = DstM.getTypeByName(ST->getName().substr(0, DotPos)); if (!DST) continue; @@ -1273,10 +1239,10 @@ static void upgradeGlobalArray(GlobalVariable *GV) { void ModuleLinker::upgradeMismatchedGlobalArray(StringRef Name) { // Look for the global arrays. - auto *DstGV = dyn_cast_or_null(DstM->getNamedValue(Name)); + auto *DstGV = dyn_cast_or_null(DstM.getNamedValue(Name)); if (!DstGV) return; - auto *SrcGV = dyn_cast_or_null(SrcM->getNamedValue(Name)); + auto *SrcGV = dyn_cast_or_null(SrcM.getNamedValue(Name)); if (!SrcGV) return; @@ -1306,6 +1272,14 @@ void ModuleLinker::upgradeMismatchedGlobals() { upgradeMismatchedGlobalArray("llvm.global_dtors"); } +static void getArrayElements(const Constant *C, + SmallVectorImpl &Dest) { + unsigned NumElements = cast(C->getType())->getNumElements(); + + for (unsigned i = 0; i != NumElements; ++i) + Dest.push_back(C->getAggregateElement(i)); +} + /// If there were any appending global variables, link them together now. /// Return true on error. bool ModuleLinker::linkAppendingVarProto(GlobalVariable *DstGV, @@ -1314,10 +1288,8 @@ bool ModuleLinker::linkAppendingVarProto(GlobalVariable *DstGV, cast(TypeMap.get(SrcGV->getType()->getElementType())); Type *EltTy = SrcTy->getElementType(); - uint64_t NewSize = SrcTy->getNumElements(); if (DstGV) { ArrayType *DstTy = cast(DstGV->getType()->getElementType()); - NewSize += DstTy->getNumElements(); if (!SrcGV->hasAppendingLinkage() || !DstGV->hasAppendingLinkage()) return emitError( @@ -1347,35 +1319,55 @@ bool ModuleLinker::linkAppendingVarProto(GlobalVariable *DstGV, "Appending variables with different section name need to be linked!"); } + SmallVector DstElements; + if (DstGV) + getArrayElements(DstGV->getInitializer(), DstElements); + + SmallVector SrcElements; + getArrayElements(SrcGV->getInitializer(), SrcElements); + + StringRef Name = SrcGV->getName(); + bool IsNewStructor = + (Name == "llvm.global_ctors" || Name == "llvm.global_dtors") && + cast(EltTy)->getNumElements() == 3; + if (IsNewStructor) + SrcElements.erase( + std::remove_if(SrcElements.begin(), SrcElements.end(), + [this](Constant *E) { + auto *Key = dyn_cast( + E->getAggregateElement(2)->stripPointerCasts()); + return Key && !ValuesToLink.count(Key) && + !shouldLazyLink(*Key); + }), + SrcElements.end()); + uint64_t NewSize = DstElements.size() + SrcElements.size(); ArrayType *NewType = ArrayType::get(EltTy, NewSize); // Create the new global variable. GlobalVariable *NG = new GlobalVariable( - *DstM, NewType, SrcGV->isConstant(), SrcGV->getLinkage(), + DstM, NewType, SrcGV->isConstant(), SrcGV->getLinkage(), /*init*/ nullptr, /*name*/ "", DstGV, SrcGV->getThreadLocalMode(), SrcGV->getType()->getAddressSpace()); // Propagate alignment, visibility and section info. copyGVAttributes(NG, SrcGV); - AppendingVarInfo AVI; - AVI.NewGV = NG; - AVI.DstInit = DstGV ? DstGV->getInitializer() : nullptr; - AVI.SrcInit = SrcGV->getInitializer(); - AppendingVars.push_back(AVI); - // Replace any uses of the two global variables with uses of the new // global. 
ValueMap[SrcGV] = ConstantExpr::getBitCast(NG, TypeMap.get(SrcGV->getType())); + for (auto *V : SrcElements) { + DstElements.push_back( + MapValue(V, ValueMap, RF_MoveDistinctMDs, &TypeMap, &ValMaterializer)); + } + + NG->setInitializer(ConstantArray::get(NewType, DstElements)); + if (DstGV) { DstGV->replaceAllUsesWith(ConstantExpr::getBitCast(NG, DstGV->getType())); DstGV->eraseFromParent(); } - // Track the source variable so we don't try to link it. - DoNotLinkFromSource.insert(SrcGV); - return false; } @@ -1384,14 +1376,6 @@ bool ModuleLinker::linkGlobalValueProto(GlobalValue *SGV) { // Handle the ultra special appending linkage case first. assert(!DGV || SGV->hasAppendingLinkage() == DGV->hasAppendingLinkage()); - if (SGV->hasAppendingLinkage() && isPerformingImport()) { - // Don't want to append to global_ctors list, for example, when we - // are importing for ThinLTO, otherwise the global ctors and dtors - // get executed multiple times for local variables (the latter causing - // double frees). - DoNotLinkFromSource.insert(SGV); - return false; - } if (SGV->hasAppendingLinkage()) return linkAppendingVarProto(cast_or_null(DGV), cast(SGV)); @@ -1400,66 +1384,47 @@ bool ModuleLinker::linkGlobalValueProto(GlobalValue *SGV) { Comdat *C = nullptr; bool HasUnnamedAddr = SGV->hasUnnamedAddr(); - if (const Comdat *SC = SGV->getComdat()) { + if (isPerformingImport() && !doImportAsDefinition(SGV)) { + LinkFromSrc = false; + } else if (const Comdat *SC = SGV->getComdat()) { Comdat::SelectionKind SK; std::tie(SK, LinkFromSrc) = ComdatsChosen[SC]; - C = DstM->getOrInsertComdat(SC->getName()); + C = DstM.getOrInsertComdat(SC->getName()); C->setSelectionKind(SK); - ComdatMembers[SC].push_back(SGV); + if (SGV->hasLocalLinkage()) + LinkFromSrc = true; } else if (DGV) { if (shouldLinkFromSource(LinkFromSrc, *DGV, *SGV)) return true; } - if (!LinkFromSrc) { - // Track the source global so that we don't attempt to copy it over when - // processing global initializers. - DoNotLinkFromSource.insert(SGV); - - if (DGV) - // Make sure to remember this mapping. - ValueMap[SGV] = - ConstantExpr::getBitCast(DGV, TypeMap.get(SGV->getType())); + if (!LinkFromSrc && DGV) { + // Make sure to remember this mapping. + ValueMap[SGV] = ConstantExpr::getBitCast(DGV, TypeMap.get(SGV->getType())); } if (DGV) HasUnnamedAddr = HasUnnamedAddr && DGV->hasUnnamedAddr(); - if (!LinkFromSrc && !DGV) - return false; - GlobalValue *NewGV; - if (!LinkFromSrc) { + if (!LinkFromSrc && DGV) { NewGV = DGV; // When linking from source we setVisibility from copyGlobalValueProto. setVisibility(NewGV, SGV, DGV); } else { - // If the GV is to be lazily linked, don't create it just yet. - // The ValueMaterializerTy will deal with creating it if it's used. - if (!DGV && !shouldOverrideFromSrc() && SGV != ImportFunction && - (SGV->hasLocalLinkage() || SGV->hasLinkOnceLinkage() || - SGV->hasAvailableExternallyLinkage())) { - DoNotLinkFromSource.insert(SGV); + // If we are done linking global value bodies (i.e. we are performing + // metadata linking), don't link in the global value due to this + // reference, simply map it to null. + if (DoneLinkingBodies) return false; - } - // When we only want to link in unresolved dependencies, blacklist - // the symbol unless unless DestM has a matching declaration (DGV). 
- if (shouldLinkOnlyNeeded() && !(DGV && DGV->isDeclaration())) { - DoNotLinkFromSource.insert(SGV); - return false; - } - - NewGV = copyGlobalValueProto(TypeMap, SGV, DGV); - - if (isPerformingImport() && !doImportAsDefinition(SGV)) - DoNotLinkFromSource.insert(SGV); + NewGV = copyGlobalValueProto(TypeMap, SGV, DGV, LinkFromSrc); } NewGV->setUnnamedAddr(HasUnnamedAddr); if (auto *NewGO = dyn_cast(NewGV)) { - if (C) + if (C && LinkFromSrc) NewGO->setComdat(C); if (DGV && DGV->hasCommonLinkage() && SGV->hasCommonLinkage()) @@ -1486,56 +1451,6 @@ bool ModuleLinker::linkGlobalValueProto(GlobalValue *SGV) { return false; } -static void getArrayElements(const Constant *C, - SmallVectorImpl &Dest) { - unsigned NumElements = cast(C->getType())->getNumElements(); - - for (unsigned i = 0; i != NumElements; ++i) - Dest.push_back(C->getAggregateElement(i)); -} - -void ModuleLinker::linkAppendingVarInit(AppendingVarInfo &AVI) { - // Merge the initializer. - SmallVector DstElements; - if (AVI.DstInit) - getArrayElements(AVI.DstInit, DstElements); - - SmallVector SrcElements; - getArrayElements(AVI.SrcInit, SrcElements); - - ArrayType *NewType = cast(AVI.NewGV->getType()->getElementType()); - - StringRef Name = AVI.NewGV->getName(); - bool IsNewStructor = - (Name == "llvm.global_ctors" || Name == "llvm.global_dtors") && - cast(NewType->getElementType())->getNumElements() == 3; - - for (auto *V : SrcElements) { - if (IsNewStructor) { - Constant *Key = V->getAggregateElement(2); - if (DoNotLinkFromSource.count(Key)) - continue; - } - DstElements.push_back( - MapValue(V, ValueMap, RF_MoveDistinctMDs, &TypeMap, &ValMaterializer)); - } - if (DstElements.size() != NewType->getNumElements()) { - NewType = ArrayType::get(NewType->getElementType(), DstElements.size()); - GlobalVariable *Old = AVI.NewGV; - GlobalVariable *NG = new GlobalVariable( - *DstM, NewType, Old->isConstant(), Old->getLinkage(), /*init*/ nullptr, - /*name*/ "", Old, Old->getThreadLocalMode(), - Old->getType()->getAddressSpace()); - copyGVAttributes(NG, Old); - AVI.NewGV->replaceAllUsesWith( - ConstantExpr::getBitCast(NG, AVI.NewGV->getType())); - AVI.NewGV->eraseFromParent(); - AVI.NewGV = NG; - } - - AVI.NewGV->setInitializer(ConstantArray::get(NewType, DstElements)); -} - /// Update the initializers in the Dest module now that all globals that may be /// referenced are in Dest. void ModuleLinker::linkGlobalInit(GlobalVariable &Dst, GlobalVariable &Src) { @@ -1574,7 +1489,7 @@ bool ModuleLinker::linkFunctionBody(Function &Dst, Function &Src) { // Go through and convert function arguments over, remembering the mapping. Function::arg_iterator DI = Dst.arg_begin(); for (Argument &Arg : Src.args()) { - DI->setName(Arg.getName()); // Copy the name over. + DI->setName(Arg.getName()); // Copy the name over. // Add a mapping to our mapping. 
ValueMap[&Arg] = &*DI; @@ -1616,9 +1531,7 @@ void ModuleLinker::linkAliasBody(GlobalAlias &Dst, GlobalAlias &Src) { Dst.setAliasee(Val); } -bool ModuleLinker::linkGlobalValueBody(GlobalValue &Src) { - Value *Dst = ValueMap[&Src]; - assert(Dst); +bool ModuleLinker::linkGlobalValueBody(GlobalValue &Dst, GlobalValue &Src) { if (const Comdat *SC = Src.getComdat()) { // To ensure that we don't generate an incomplete comdat group, // we must materialize and map in any other members that are not @@ -1633,26 +1546,26 @@ bool ModuleLinker::linkGlobalValueBody(GlobalValue &Src) { } } if (shouldInternalizeLinkedSymbols()) - if (auto *DGV = dyn_cast(Dst)) + if (auto *DGV = dyn_cast(&Dst)) DGV->setLinkage(GlobalValue::InternalLinkage); if (auto *F = dyn_cast(&Src)) - return linkFunctionBody(cast(*Dst), *F); + return linkFunctionBody(cast(Dst), *F); if (auto *GVar = dyn_cast(&Src)) { - linkGlobalInit(cast(*Dst), *GVar); + linkGlobalInit(cast(Dst), *GVar); return false; } - linkAliasBody(cast(*Dst), cast(Src)); + linkAliasBody(cast(Dst), cast(Src)); return false; } /// Insert all of the named MDNodes in Src into the Dest module. void ModuleLinker::linkNamedMDNodes() { - const NamedMDNode *SrcModFlags = SrcM->getModuleFlagsMetadata(); - for (const NamedMDNode &NMD : SrcM->named_metadata()) { + const NamedMDNode *SrcModFlags = SrcM.getModuleFlagsMetadata(); + for (const NamedMDNode &NMD : SrcM.named_metadata()) { // Don't link module flags here. Do them separately. if (&NMD == SrcModFlags) continue; - NamedMDNode *DestNMD = DstM->getOrInsertNamedMetadata(NMD.getName()); + NamedMDNode *DestNMD = DstM.getOrInsertNamedMetadata(NMD.getName()); // Add Src elements into Dest node. for (const MDNode *op : NMD.operands()) DestNMD->addOperand(MapMetadata( @@ -1664,12 +1577,13 @@ void ModuleLinker::linkNamedMDNodes() { /// Merge the linker flags in Src into the Dest module. bool ModuleLinker::linkModuleFlagsMetadata() { // If the source module has no module flags, we are done. - const NamedMDNode *SrcModFlags = SrcM->getModuleFlagsMetadata(); - if (!SrcModFlags) return false; + const NamedMDNode *SrcModFlags = SrcM.getModuleFlagsMetadata(); + if (!SrcModFlags) + return false; // If the destination module doesn't have module flags yet, then just copy // over the source module's flags. - NamedMDNode *DstModFlags = DstM->getOrInsertModuleFlagsMetadata(); + NamedMDNode *DstModFlags = DstM.getOrInsertModuleFlagsMetadata(); if (DstModFlags->getNumOperands() == 0) { for (unsigned I = 0, E = SrcModFlags->getNumOperands(); I != E; ++I) DstModFlags->addOperand(SrcModFlags->getOperand(I)); @@ -1679,7 +1593,7 @@ bool ModuleLinker::linkModuleFlagsMetadata() { // First build a map of the existing module flags and requirements. DenseMap> Flags; - SmallSetVector Requirements; + SmallSetVector Requirements; for (unsigned I = 0, E = DstModFlags->getNumOperands(); I != E; ++I) { MDNode *Op = DstModFlags->getOperand(I); ConstantInt *Behavior = mdconst::extract(Op->getOperand(0)); @@ -1752,7 +1666,7 @@ bool ModuleLinker::linkModuleFlagsMetadata() { auto replaceDstValue = [&](MDNode *New) { Metadata *FlagOps[] = {DstOp->getOperand(0), ID, New}; - MDNode *Flag = MDNode::get(DstM->getContext(), FlagOps); + MDNode *Flag = MDNode::get(DstM.getContext(), FlagOps); DstModFlags->setOperand(DstIndex, Flag); Flags[ID].first = Flag; }; @@ -1760,7 +1674,8 @@ bool ModuleLinker::linkModuleFlagsMetadata() { // Perform the merge for standard behavior types. 
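The switch that follows merges module flags according to their behavior; for the AppendUnique case a SmallSetVector gives set semantics while preserving insertion order. A tiny illustration (assumes llvm/ADT/SetVector.h and llvm/ADT/StringRef.h, with strings standing in for the Metadata operands):

    const char *DstOps[] = {"a", "b"};   // operands already on the destination flag
    const char *SrcOps[] = {"b", "c"};   // operands coming from the source flag
    llvm::SmallSetVector<llvm::StringRef, 4> Elts;
    for (const char *S : DstOps)
      Elts.insert(S);
    for (const char *S : SrcOps)
      Elts.insert(S);
    // Elts now holds "a", "b", "c": duplicates collapse, order is preserved,
    // matching the AppendUnique result built from DstValue then SrcValue.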
switch (SrcBehaviorValue) { case Module::Require: - case Module::Override: llvm_unreachable("not possible"); + case Module::Override: + llvm_unreachable("not possible"); case Module::Error: { // Emit an error if the values differ. if (SrcOp->getOperand(2) != DstOp->getOperand(2)) { @@ -1785,7 +1700,7 @@ bool ModuleLinker::linkModuleFlagsMetadata() { MDs.append(DstValue->op_begin(), DstValue->op_end()); MDs.append(SrcValue->op_begin(), SrcValue->op_end()); - replaceDstValue(MDNode::get(DstM->getContext(), MDs)); + replaceDstValue(MDNode::get(DstM.getContext(), MDs)); break; } case Module::AppendUnique: { @@ -1795,7 +1710,7 @@ bool ModuleLinker::linkModuleFlagsMetadata() { Elts.insert(DstValue->op_begin(), DstValue->op_end()); Elts.insert(SrcValue->op_begin(), SrcValue->op_end()); - replaceDstValue(MDNode::get(DstM->getContext(), + replaceDstValue(MDNode::get(DstM.getContext(), makeArrayRef(Elts.begin(), Elts.end()))); break; } @@ -1823,16 +1738,15 @@ bool ModuleLinker::linkModuleFlagsMetadata() { static bool triplesMatch(const Triple &T0, const Triple &T1) { // If vendor is apple, ignore the version number. if (T0.getVendor() == Triple::Apple) - return T0.getArch() == T1.getArch() && - T0.getSubArch() == T1.getSubArch() && - T0.getVendor() == T1.getVendor() && - T0.getOS() == T1.getOS(); + return T0.getArch() == T1.getArch() && T0.getSubArch() == T1.getSubArch() && + T0.getVendor() == T1.getVendor() && T0.getOS() == T1.getOS(); return T0 == T1; } // This function returns the merged triple. -static std::string mergeTriples(const Triple &SrcTriple, const Triple &DstTriple) { +static std::string mergeTriples(const Triple &SrcTriple, + const Triple &DstTriple) { // If vendor is apple, pick the triple with the larger version number. if (SrcTriple.getVendor() == Triple::Apple) if (DstTriple.isOSVersionLT(SrcTriple)) @@ -1841,52 +1755,112 @@ static std::string mergeTriples(const Triple &SrcTriple, const Triple &DstTriple return DstTriple.str(); } -bool ModuleLinker::run() { - assert(DstM && "Null destination module"); - assert(SrcM && "Null source module"); +bool ModuleLinker::linkIfNeeded(GlobalValue &GV) { + GlobalValue *DGV = getLinkedToGlobal(&GV); + + if (shouldLinkOnlyNeeded() && !(DGV && DGV->isDeclaration())) + return false; + + if (DGV && !GV.hasLocalLinkage() && !GV.hasAppendingLinkage()) { + auto *DGVar = dyn_cast(DGV); + auto *SGVar = dyn_cast(&GV); + if (DGVar && SGVar) { + if (DGVar->isDeclaration() && SGVar->isDeclaration() && + (!DGVar->isConstant() || !SGVar->isConstant())) { + DGVar->setConstant(false); + SGVar->setConstant(false); + } + if (DGVar->hasCommonLinkage() && SGVar->hasCommonLinkage()) { + unsigned Align = std::max(DGVar->getAlignment(), SGVar->getAlignment()); + SGVar->setAlignment(Align); + DGVar->setAlignment(Align); + } + } + + GlobalValue::VisibilityTypes Visibility = + getMinVisibility(DGV->getVisibility(), GV.getVisibility()); + DGV->setVisibility(Visibility); + GV.setVisibility(Visibility); + + bool HasUnnamedAddr = GV.hasUnnamedAddr() && DGV->hasUnnamedAddr(); + DGV->setUnnamedAddr(HasUnnamedAddr); + GV.setUnnamedAddr(HasUnnamedAddr); + } + + // Don't want to append to global_ctors list, for example, when we + // are importing for ThinLTO, otherwise the global ctors and dtors + // get executed multiple times for local variables (the latter causing + // double frees). 
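Stepping back to the mergeTriples() helper above: for Apple vendors the triple carrying the newer OS version wins regardless of which side it comes from, while for other vendors the destination triple is kept. A constructed example (illustrative, assumes llvm/ADT/Triple.h):

    llvm::Triple A("x86_64-apple-macosx10.9.0");
    llvm::Triple B("x86_64-apple-macosx10.11.0");
    // mergeTriples(A, B) and mergeTriples(B, A) both yield
    // "x86_64-apple-macosx10.11.0" (mergeTriples itself is file-local here).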
+ if (GV.hasAppendingLinkage() && isPerformingImport()) + return false; + + if (isPerformingImport() && !doImportAsDefinition(&GV)) + return false; + + if (!DGV && !shouldOverrideFromSrc() && + (GV.hasLocalLinkage() || GV.hasLinkOnceLinkage() || + GV.hasAvailableExternallyLinkage())) + return false; + + if (const Comdat *SC = GV.getComdat()) { + bool LinkFromSrc; + Comdat::SelectionKind SK; + std::tie(SK, LinkFromSrc) = ComdatsChosen[SC]; + if (LinkFromSrc) + ValuesToLink.insert(&GV); + return false; + } + + bool LinkFromSrc = true; + if (DGV && shouldLinkFromSource(LinkFromSrc, *DGV, GV)) + return true; + if (LinkFromSrc) + ValuesToLink.insert(&GV); + return false; +} +bool ModuleLinker::run() { // Inherit the target data from the source module if the destination module // doesn't have one already. - if (DstM->getDataLayout().isDefault()) - DstM->setDataLayout(SrcM->getDataLayout()); + if (DstM.getDataLayout().isDefault()) + DstM.setDataLayout(SrcM.getDataLayout()); - if (SrcM->getDataLayout() != DstM->getDataLayout()) { + if (SrcM.getDataLayout() != DstM.getDataLayout()) { emitWarning("Linking two modules of different data layouts: '" + - SrcM->getModuleIdentifier() + "' is '" + - SrcM->getDataLayoutStr() + "' whereas '" + - DstM->getModuleIdentifier() + "' is '" + - DstM->getDataLayoutStr() + "'\n"); + SrcM.getModuleIdentifier() + "' is '" + + SrcM.getDataLayoutStr() + "' whereas '" + + DstM.getModuleIdentifier() + "' is '" + + DstM.getDataLayoutStr() + "'\n"); } // Copy the target triple from the source to dest if the dest's is empty. - if (DstM->getTargetTriple().empty() && !SrcM->getTargetTriple().empty()) - DstM->setTargetTriple(SrcM->getTargetTriple()); + if (DstM.getTargetTriple().empty() && !SrcM.getTargetTriple().empty()) + DstM.setTargetTriple(SrcM.getTargetTriple()); - Triple SrcTriple(SrcM->getTargetTriple()), DstTriple(DstM->getTargetTriple()); + Triple SrcTriple(SrcM.getTargetTriple()), DstTriple(DstM.getTargetTriple()); - if (!SrcM->getTargetTriple().empty() && !triplesMatch(SrcTriple, DstTriple)) + if (!SrcM.getTargetTriple().empty() && !triplesMatch(SrcTriple, DstTriple)) emitWarning("Linking two modules of different target triples: " + - SrcM->getModuleIdentifier() + "' is '" + - SrcM->getTargetTriple() + "' whereas '" + - DstM->getModuleIdentifier() + "' is '" + - DstM->getTargetTriple() + "'\n"); + SrcM.getModuleIdentifier() + "' is '" + SrcM.getTargetTriple() + + "' whereas '" + DstM.getModuleIdentifier() + "' is '" + + DstM.getTargetTriple() + "'\n"); - DstM->setTargetTriple(mergeTriples(SrcTriple, DstTriple)); + DstM.setTargetTriple(mergeTriples(SrcTriple, DstTriple)); // Append the module inline asm string. - if (!SrcM->getModuleInlineAsm().empty()) { - if (DstM->getModuleInlineAsm().empty()) - DstM->setModuleInlineAsm(SrcM->getModuleInlineAsm()); + if (!SrcM.getModuleInlineAsm().empty()) { + if (DstM.getModuleInlineAsm().empty()) + DstM.setModuleInlineAsm(SrcM.getModuleInlineAsm()); else - DstM->setModuleInlineAsm(DstM->getModuleInlineAsm()+"\n"+ - SrcM->getModuleInlineAsm()); + DstM.setModuleInlineAsm(DstM.getModuleInlineAsm() + "\n" + + SrcM.getModuleInlineAsm()); } // Loop over all of the linked values to compute type mappings. computeTypeMapping(); ComdatsChosen.clear(); - for (const auto &SMEC : SrcM->getComdatSymbolTable()) { + for (const auto &SMEC : SrcM.getComdatSymbolTable()) { const Comdat &C = SMEC.getValue(); if (ComdatsChosen.count(&C)) continue; @@ -1900,69 +1874,38 @@ bool ModuleLinker::run() { // Upgrade mismatched global arrays. 
upgradeMismatchedGlobals(); + for (GlobalVariable &GV : SrcM.globals()) + if (const Comdat *SC = GV.getComdat()) + ComdatMembers[SC].push_back(&GV); + + for (Function &SF : SrcM) + if (const Comdat *SC = SF.getComdat()) + ComdatMembers[SC].push_back(&SF); + + for (GlobalAlias &GA : SrcM.aliases()) + if (const Comdat *SC = GA.getComdat()) + ComdatMembers[SC].push_back(&GA); + // Insert all of the globals in src into the DstM module... without linking // initializers (which could refer to functions not yet mapped over). - for (GlobalVariable &GV : SrcM->globals()) - if (linkGlobalValueProto(&GV)) + for (GlobalVariable &GV : SrcM.globals()) + if (linkIfNeeded(GV)) return true; - // Link the functions together between the two modules, without doing function - // bodies... this just adds external function prototypes to the DstM - // function... We do this so that when we begin processing function bodies, - // all of the global values that may be referenced are available in our - // ValueMap. - for (Function &F :*SrcM) - if (linkGlobalValueProto(&F)) + for (Function &SF : SrcM) + if (linkIfNeeded(SF)) return true; - // If there were any aliases, link them now. - for (GlobalAlias &GA : SrcM->aliases()) - if (linkGlobalValueProto(&GA)) + for (GlobalAlias &GA : SrcM.aliases()) + if (linkIfNeeded(GA)) return true; - for (AppendingVarInfo &AppendingVar : AppendingVars) - linkAppendingVarInit(AppendingVar); - - for (const auto &Entry : DstM->getComdatSymbolTable()) { - const Comdat &C = Entry.getValue(); - if (C.getSelectionKind() == Comdat::Any) - continue; - const GlobalValue *GV = SrcM->getNamedValue(C.getName()); - if (GV) - MapValue(GV, ValueMap, RF_MoveDistinctMDs, &TypeMap, &ValMaterializer); - } - - // Link in the function bodies that are defined in the source module into - // DstM. - for (Function &SF : *SrcM) { - // Skip if no body (function is external). - if (SF.isDeclaration()) - continue; - - // Skip if not linking from source. - if (DoNotLinkFromSource.count(&SF)) - continue; - - if (linkGlobalValueBody(SF)) + for (GlobalValue *GV : ValuesToLink) { + MapValue(GV, ValueMap, RF_MoveDistinctMDs, &TypeMap, &ValMaterializer); + if (HasError) return true; } - // Resolve all uses of aliases with aliasees. - for (GlobalAlias &Src : SrcM->aliases()) { - if (DoNotLinkFromSource.count(&Src)) - continue; - linkGlobalValueBody(Src); - } - - // Update the initializers in the DstM module now that all globals that may - // be referenced are in DstM. - for (GlobalVariable &Src : SrcM->globals()) { - // Only process initialized GV's or ones not already in dest. - if (!Src.hasInitializer() || DoNotLinkFromSource.count(&Src)) - continue; - linkGlobalValueBody(Src); - } - // Note that we are done linking global value bodies. This prevents // metadata linking from creating new references. 
DoneLinkingBodies = true; @@ -2069,12 +2012,10 @@ bool Linker::IdentifiedStructTypeSet::hasType(StructType *Ty) { return *I == Ty; } -void Linker::init(Module *M, DiagnosticHandlerFunction DiagnosticHandler) { - this->Composite = M; - this->DiagnosticHandler = DiagnosticHandler; - +Linker::Linker(Module &M, DiagnosticHandlerFunction DiagnosticHandler) + : Composite(M), DiagnosticHandler(DiagnosticHandler) { TypeFinder StructTypes; - StructTypes.run(*M, true); + StructTypes.run(M, true); for (StructType *Ty : StructTypes) { if (Ty->isOpaque()) IdentifiedStructTypes.addOpaque(Ty); @@ -2083,35 +2024,21 @@ void Linker::init(Module *M, DiagnosticHandlerFunction DiagnosticHandler) { } } -Linker::Linker(Module *M, DiagnosticHandlerFunction DiagnosticHandler) { - init(M, DiagnosticHandler); -} - -Linker::Linker(Module *M) { - init(M, [this](const DiagnosticInfo &DI) { - Composite->getContext().diagnose(DI); - }); -} - -void Linker::deleteModule() { - delete Composite; - Composite = nullptr; -} +Linker::Linker(Module &M) + : Linker(M, [this](const DiagnosticInfo &DI) { + Composite.getContext().diagnose(DI); + }) {} -bool Linker::linkInModule(Module *Src, unsigned Flags, +bool Linker::linkInModule(Module &Src, unsigned Flags, const FunctionInfoIndex *Index, - Function *FuncToImport) { + DenseSet *FunctionsToImport) { ModuleLinker TheLinker(Composite, IdentifiedStructTypes, Src, - DiagnosticHandler, Flags, Index, FuncToImport); + DiagnosticHandler, Flags, Index, FunctionsToImport); bool RetCode = TheLinker.run(); - Composite->dropTriviallyDeadConstantArrays(); + Composite.dropTriviallyDeadConstantArrays(); return RetCode; } -void Linker::setModule(Module *Dst) { - init(Dst, DiagnosticHandler); -} - //===----------------------------------------------------------------------===// // LinkModules entrypoint. //===----------------------------------------------------------------------===// @@ -2121,14 +2048,14 @@ void Linker::setModule(Module *Dst) { /// true is returned and ErrorMsg (if not null) is set to indicate the problem. /// Upon failure, the Dest module could be in a modified state, and shouldn't be /// relied on to be consistent. 
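Together with the LTOCodeGenerator changes at the top of this patch, the Linker now borrows the composite module by reference and the deleteModule()/setModule() escape hatches are gone. A minimal usage sketch (illustrative; assumes llvm/Linker/Linker.h, llvm/IR/Module.h and llvm/Support/ErrorHandling.h; the trailing linkInModule parameters are defaulted, as in the LTOCodeGenerator call above):

    llvm::LLVMContext Ctx;
    llvm::Module Dst("dst", Ctx);
    llvm::Module Src("src", Ctx);   // normally parsed from bitcode instead
    llvm::Linker L(Dst);            // diagnostics go through Dst's LLVMContext
    if (L.linkInModule(Src))        // returns true on error
      llvm::report_fatal_error("linking Src into Dst failed");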
-bool Linker::LinkModules(Module *Dest, Module *Src, +bool Linker::linkModules(Module &Dest, Module &Src, DiagnosticHandlerFunction DiagnosticHandler, unsigned Flags) { Linker L(Dest, DiagnosticHandler); return L.linkInModule(Src, Flags); } -bool Linker::LinkModules(Module *Dest, Module *Src, unsigned Flags) { +bool Linker::linkModules(Module &Dest, Module &Src, unsigned Flags) { Linker L(Dest); return L.linkInModule(Src, Flags); } @@ -2144,8 +2071,8 @@ LLVMBool LLVMLinkModules(LLVMModuleRef Dest, LLVMModuleRef Src, raw_string_ostream Stream(Message); DiagnosticPrinterRawOStream DP(Stream); - LLVMBool Result = Linker::LinkModules( - D, unwrap(Src), [&](const DiagnosticInfo &DI) { DI.print(DP); }); + LLVMBool Result = Linker::linkModules( + *D, *unwrap(Src), [&](const DiagnosticInfo &DI) { DI.print(DP); }); if (OutMessages && Result) { Stream.flush(); diff --git a/lib/MC/MCMachOStreamer.cpp b/lib/MC/MCMachOStreamer.cpp index 52ecf9fcfbf3..21f7571eec4a 100644 --- a/lib/MC/MCMachOStreamer.cpp +++ b/lib/MC/MCMachOStreamer.cpp @@ -60,6 +60,7 @@ class MCMachOStreamer : public MCObjectStreamer { /// state management void reset() override { + CreatedADWARFSection = false; HasSectionLabel.clear(); MCObjectStreamer::reset(); } diff --git a/lib/MC/MCObjectFileInfo.cpp b/lib/MC/MCObjectFileInfo.cpp index 8b75457a2460..41e28698b1cc 100644 --- a/lib/MC/MCObjectFileInfo.cpp +++ b/lib/MC/MCObjectFileInfo.cpp @@ -259,6 +259,9 @@ void MCObjectFileInfo::initMachOMCObjectFileInfo(Triple T) { DwarfDebugInlineSection = Ctx->getMachOSection("__DWARF", "__debug_inlined", MachO::S_ATTR_DEBUG, SectionKind::getMetadata()); + DwarfCUIndexSection = + Ctx->getMachOSection("__DWARF", "__debug_cu_index", MachO::S_ATTR_DEBUG, + SectionKind::getMetadata()); StackMapSection = Ctx->getMachOSection("__LLVM_STACKMAPS", "__llvm_stackmaps", 0, SectionKind::getMetadata()); @@ -531,6 +534,10 @@ void MCObjectFileInfo::initELFMCObjectFileInfo(Triple T) { DwarfAddrSection = Ctx->getELFSection(".debug_addr", ELF::SHT_PROGBITS, 0, "addr_sec"); + // DWP Sections + DwarfCUIndexSection = + Ctx->getELFSection(".debug_cu_index", ELF::SHT_PROGBITS, 0); + StackMapSection = Ctx->getELFSection(".llvm_stackmaps", ELF::SHT_PROGBITS, ELF::SHF_ALLOC); @@ -713,6 +720,11 @@ void MCObjectFileInfo::initCOFFMCObjectFileInfo(Triple T) { COFF::IMAGE_SCN_MEM_DISCARDABLE | COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | COFF::IMAGE_SCN_MEM_READ, SectionKind::getMetadata(), "addr_sec"); + DwarfCUIndexSection = Ctx->getCOFFSection( + ".debug_cu_index", + COFF::IMAGE_SCN_MEM_DISCARDABLE | COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | + COFF::IMAGE_SCN_MEM_READ, + SectionKind::getMetadata()); DwarfAccelNamesSection = Ctx->getCOFFSection( ".apple_names", COFF::IMAGE_SCN_MEM_DISCARDABLE | COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | diff --git a/lib/Object/FunctionIndexObjectFile.cpp b/lib/Object/FunctionIndexObjectFile.cpp index 717c56bc9018..fe111de1a9c8 100644 --- a/lib/Object/FunctionIndexObjectFile.cpp +++ b/lib/Object/FunctionIndexObjectFile.cpp @@ -86,7 +86,7 @@ bool FunctionIndexObjectFile::hasFunctionSummaryInMemBuffer( ErrorOr> FunctionIndexObjectFile::create(MemoryBufferRef Object, DiagnosticHandlerFunction DiagnosticHandler, - const Module *ExportingModule, bool IsLazy) { + bool IsLazy) { std::unique_ptr Index; ErrorOr BCOrErr = findBitcodeInMemBuffer(Object); @@ -94,7 +94,7 @@ FunctionIndexObjectFile::create(MemoryBufferRef Object, return BCOrErr.getError(); ErrorOr> IOrErr = getFunctionInfoIndex( - BCOrErr.get(), DiagnosticHandler, ExportingModule, IsLazy); + 
BCOrErr.get(), DiagnosticHandler, IsLazy); if (std::error_code EC = IOrErr.getError()) return EC; @@ -125,8 +125,7 @@ std::error_code FunctionIndexObjectFile::findFunctionSummaryInMemBuffer( // index object if found, or nullptr if not. ErrorOr> llvm::getFunctionIndexForFile(StringRef Path, - DiagnosticHandlerFunction DiagnosticHandler, - const Module *ExportingModule) { + DiagnosticHandlerFunction DiagnosticHandler) { ErrorOr> FileOrErr = MemoryBuffer::getFileOrSTDIN(Path); std::error_code EC = FileOrErr.getError(); @@ -134,8 +133,7 @@ llvm::getFunctionIndexForFile(StringRef Path, return EC; MemoryBufferRef BufferRef = (FileOrErr.get())->getMemBufferRef(); ErrorOr> ObjOrErr = - object::FunctionIndexObjectFile::create(BufferRef, DiagnosticHandler, - ExportingModule); + object::FunctionIndexObjectFile::create(BufferRef, DiagnosticHandler); EC = ObjOrErr.getError(); if (EC) return EC; diff --git a/lib/ProfileData/InstrProf.cpp b/lib/ProfileData/InstrProf.cpp index d08ec9d73176..530be8ac044a 100644 --- a/lib/ProfileData/InstrProf.cpp +++ b/lib/ProfileData/InstrProf.cpp @@ -207,7 +207,7 @@ ValueProfData::serializeFrom(const InstrProfRecord &Record) { InstrProfRecordClosure.Record = &Record; std::unique_ptr VPD( - serializeValueProfDataFrom(&InstrProfRecordClosure, 0)); + serializeValueProfDataFrom(&InstrProfRecordClosure, nullptr)); return VPD; } diff --git a/lib/ProfileData/InstrProfReader.cpp b/lib/ProfileData/InstrProfReader.cpp index e4348b19ac02..cfc968739806 100644 --- a/lib/ProfileData/InstrProfReader.cpp +++ b/lib/ProfileData/InstrProfReader.cpp @@ -441,11 +441,11 @@ data_type InstrProfLookupTrait::ReadData(StringRef K, const unsigned char *D, return DataBuffer; } -std::error_code -InstrProfReaderIndex::getRecords(StringRef FuncName, - ArrayRef &Data) { - auto Iter = Index->find(FuncName); - if (Iter == Index->end()) +template +std::error_code InstrProfReaderIndex::getRecords( + StringRef FuncName, ArrayRef &Data) { + auto Iter = HashTable->find(FuncName); + if (Iter == HashTable->end()) return instrprof_error::unknown_function; Data = (*Iter); @@ -455,9 +455,11 @@ InstrProfReaderIndex::getRecords(StringRef FuncName, return instrprof_error::success; } -std::error_code InstrProfReaderIndex::getRecords( +template +std::error_code InstrProfReaderIndex::getRecords( ArrayRef &Data) { - if (atEnd()) return instrprof_error::eof; + if (atEnd()) + return instrprof_error::eof; Data = *RecordIterator; @@ -466,25 +468,26 @@ std::error_code InstrProfReaderIndex::getRecords( return instrprof_error::success; } -void InstrProfReaderIndex::Init(const unsigned char *Buckets, - const unsigned char *const Payload, - const unsigned char *const Base, - IndexedInstrProf::HashT HashType, - uint64_t Version) { +template +InstrProfReaderIndex::InstrProfReaderIndex( + const unsigned char *Buckets, const unsigned char *const Payload, + const unsigned char *const Base, IndexedInstrProf::HashT HashType, + uint64_t Version) { FormatVersion = Version; - Index.reset(IndexType::Create(Buckets, Payload, Base, - InstrProfLookupTrait(HashType, Version))); + HashTable.reset(HashTableImpl::Create( + Buckets, Payload, Base, + typename HashTableImpl::InfoType(HashType, Version))); // Form the map of hash values to const char* keys in profiling data. 
std::vector> HashKeys; - for (auto Key : Index->keys()) { + for (auto Key : HashTable->keys()) { const char *KeyTableRef = StringTable.insertString(Key); HashKeys.push_back(std::make_pair(ComputeHash(HashType, Key), KeyTableRef)); } std::sort(HashKeys.begin(), HashKeys.end(), less_first()); HashKeys.erase(std::unique(HashKeys.begin(), HashKeys.end()), HashKeys.end()); // Set the hash key map for the InstrLookupTrait - Index->getInfoObj().setHashKeys(std::move(HashKeys)); - RecordIterator = Index->data_begin(); + HashTable->getInfoObj().setHashKeys(std::move(HashKeys)); + RecordIterator = HashTable->data_begin(); } bool IndexedInstrProfReader::hasFormat(const MemoryBuffer &DataBuffer) { @@ -532,8 +535,10 @@ std::error_code IndexedInstrProfReader::readHeader() { uint64_t HashOffset = endian::byte_swap(Header->HashOffset); // The rest of the file is an on disk hash table. - Index.Init(Start + HashOffset, Cur, Start, HashType, FormatVersion); - + InstrProfReaderIndexBase *IndexPtr = nullptr; + IndexPtr = new InstrProfReaderIndex( + Start + HashOffset, Cur, Start, HashType, FormatVersion); + Index.reset(IndexPtr); return success(); } @@ -541,7 +546,7 @@ ErrorOr IndexedInstrProfReader::getInstrProfRecord(StringRef FuncName, uint64_t FuncHash) { ArrayRef Data; - std::error_code EC = Index.getRecords(FuncName, Data); + std::error_code EC = Index->getRecords(FuncName, Data); if (EC != instrprof_error::success) return EC; // Found it. Look for counters with the right hash. @@ -571,13 +576,13 @@ std::error_code IndexedInstrProfReader::readNextRecord( ArrayRef Data; - std::error_code EC = Index.getRecords(Data); + std::error_code EC = Index->getRecords(Data); if (EC != instrprof_error::success) return error(EC); Record = Data[RecordIndex++]; if (RecordIndex >= Data.size()) { - Index.advanceToNextKey(); + Index->advanceToNextKey(); RecordIndex = 0; } return success(); diff --git a/lib/ProfileData/InstrProfWriter.cpp b/lib/ProfileData/InstrProfWriter.cpp index f9cc2afe3da0..78bec012eeb2 100644 --- a/lib/ProfileData/InstrProfWriter.cpp +++ b/lib/ProfileData/InstrProfWriter.cpp @@ -107,22 +107,23 @@ std::error_code InstrProfWriter::addRecord(InstrProfRecord &&I) { std::tie(Where, NewFunc) = ProfileDataMap.insert(std::make_pair(I.Hash, InstrProfRecord())); InstrProfRecord &Dest = Where->second; + + instrprof_error Result; if (NewFunc) { // We've never seen a function with this name and hash, add it. Dest = std::move(I); + Result = instrprof_error::success; } else { // We're updating a function we've seen before. - instrprof_error MergeResult = Dest.merge(I); - if (MergeResult != instrprof_error::success) { - return MergeResult; - } + Result = Dest.merge(I); } // We keep track of the max function count as we go for simplicity. + // Update this statistic no matter the result of the merge. if (Dest.Counts[0] > MaxFunctionCount) MaxFunctionCount = Dest.Counts[0]; - return instrprof_error::success; + return Result; } std::pair InstrProfWriter::writeImpl(raw_ostream &OS) { diff --git a/lib/Support/BranchProbability.cpp b/lib/Support/BranchProbability.cpp index 3b0f6e6f06e4..771d02c0aa3c 100644 --- a/lib/Support/BranchProbability.cpp +++ b/lib/Support/BranchProbability.cpp @@ -22,11 +22,14 @@ using namespace llvm; const uint32_t BranchProbability::D; raw_ostream &BranchProbability::print(raw_ostream &OS) const { + if (isUnknown()) + return OS << "?%"; + // Get a percentage rounded to two decimal digits. This avoids // implementation-defined rounding inside printf. 
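Looking back at the InstrProfWriter::addRecord() change in this hunk: the result of merging with an existing record is now handed to the caller instead of being swallowed, and the max-function-count statistic is refreshed either way. A hypothetical caller (record name, hash and counters are made up; assumes llvm/ProfileData/InstrProfWriter.h and llvm/Support/raw_ostream.h):

    llvm::InstrProfWriter Writer;
    llvm::InstrProfRecord R("main", 0x1234, {1, 2, 3});
    if (std::error_code EC = Writer.addRecord(std::move(R)))
      llvm::errs() << "profile merge problem: " << EC.message() << "\n";
    // A counter mismatch or overflow from the merge is reported here, but
    // MaxFunctionCount has still been updated from the surviving record.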
double Percent = rint(((double)N / D) * 100.0 * 100.0) / 100.0; - OS << format("0x%08" PRIx32 " / 0x%08" PRIx32 " = %.2f%%", N, D, Percent); - return OS; + return OS << format("0x%08" PRIx32 " / 0x%08" PRIx32 " = %.2f%%", N, D, + Percent); } void BranchProbability::dump() const { print(dbgs()) << '\n'; } @@ -43,6 +46,19 @@ BranchProbability::BranchProbability(uint32_t Numerator, uint32_t Denominator) { } } +BranchProbability +BranchProbability::getBranchProbability(uint64_t Numerator, + uint64_t Denominator) { + assert(Numerator <= Denominator && "Probability cannot be bigger than 1!"); + // Scale down Denominator to fit in a 32-bit integer. + int Scale = 0; + while (Denominator > UINT32_MAX) { + Denominator >>= 1; + Scale++; + } + return BranchProbability(Numerator >> Scale, Denominator); +} + // If ConstD is not zero, then replace D by ConstD so that division and modulo // operations by D can be optimized, in case this function is not inlined by the // compiler. diff --git a/lib/Support/TargetParser.cpp b/lib/Support/TargetParser.cpp index 3aa55b3c8850..aa3a4235d794 100644 --- a/lib/Support/TargetParser.cpp +++ b/lib/Support/TargetParser.cpp @@ -410,6 +410,7 @@ static StringRef getArchSynonym(StringRef Arch) { .Case("v7em", "v7e-m") .Cases("v8", "v8a", "aarch64", "arm64", "v8-a") .Case("v8.1a", "v8.1-a") + .Case("v8.2a", "v8.2-a") .Default(Arch); } @@ -554,6 +555,7 @@ unsigned llvm::ARM::parseArchProfile(StringRef Arch) { case ARM::AK_ARMV7K: case ARM::AK_ARMV8A: case ARM::AK_ARMV8_1A: + case ARM::AK_ARMV8_2A: return ARM::PK_A; } return ARM::PK_INVALID; @@ -594,6 +596,7 @@ unsigned llvm::ARM::parseArchVersion(StringRef Arch) { return 7; case ARM::AK_ARMV8A: case ARM::AK_ARMV8_1A: + case ARM::AK_ARMV8_2A: return 8; } return 0; diff --git a/lib/Support/Triple.cpp b/lib/Support/Triple.cpp index f1f2d26b4e70..ed91c209d545 100644 --- a/lib/Support/Triple.cpp +++ b/lib/Support/Triple.cpp @@ -519,6 +519,8 @@ static Triple::SubArchType parseSubArch(StringRef SubArchName) { return Triple::ARMSubArch_v8; case ARM::AK_ARMV8_1A: return Triple::ARMSubArch_v8_1a; + case ARM::AK_ARMV8_2A: + return Triple::ARMSubArch_v8_2a; default: return Triple::NoSubArch; } diff --git a/lib/Support/Unix/Signals.inc b/lib/Support/Unix/Signals.inc index de98d4adf996..061cdb3da216 100644 --- a/lib/Support/Unix/Signals.inc +++ b/lib/Support/Unix/Signals.inc @@ -86,12 +86,11 @@ static unsigned NumRegisteredSignals = 0; static struct { struct sigaction SA; int SigNo; -} RegisteredSignalInfo[(sizeof(IntSigs)+sizeof(KillSigs))/sizeof(KillSigs[0])]; +} RegisteredSignalInfo[array_lengthof(IntSigs) + array_lengthof(KillSigs)]; static void RegisterHandler(int Signal) { - assert(NumRegisteredSignals < - sizeof(RegisteredSignalInfo)/sizeof(RegisteredSignalInfo[0]) && + assert(NumRegisteredSignals < array_lengthof(RegisteredSignalInfo) && "Out of space for signal handlers!"); struct sigaction NewHandler; diff --git a/lib/Support/Windows/Signals.inc b/lib/Support/Windows/Signals.inc index f40ca72996a1..d109a66d7035 100644 --- a/lib/Support/Windows/Signals.inc +++ b/lib/Support/Windows/Signals.inc @@ -405,10 +405,7 @@ static void RegisterHandler() { // If we cannot load up the APIs (which would be unexpected as they should // exist on every version of Windows we support), we will bail out since // there would be nothing to report. 
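The new 64-bit BranchProbability::getBranchProbability() overload above halves the denominator (and shifts the numerator by the same amount) until it fits in 32 bits, so the ratio survives up to rounding. A quick worked example (assumes llvm/Support/BranchProbability.h):

    // 3 * 2^36 out of 8 * 2^36 events; both values exceed UINT32_MAX.
    uint64_t Num = 3ULL << 36, Den = 8ULL << 36;
    llvm::BranchProbability P =
        llvm::BranchProbability::getBranchProbability(Num, Den);
    // Den is scaled down to 2^31 and Num to 3 * 2^28; P still models
    // 3/8, and printing it shows 37.50%.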
- if (!load64BitDebugHelp()) { - assert(false && "These APIs should always be available"); - return; - } + assert(load64BitDebugHelp() && "These APIs should always be available"); if (RegisteredUnhandledExceptionFilter) { EnterCriticalSection(&CriticalSection); diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td index e82cdd00ba1e..0bff9b592c15 100644 --- a/lib/Target/AArch64/AArch64.td +++ b/lib/Target/AArch64/AArch64.td @@ -38,6 +38,9 @@ def FeaturePerfMon : SubtargetFeature<"perfmon", "HasPerfMon", "true", def FeatureFullFP16 : SubtargetFeature<"fullfp16", "HasFullFP16", "true", "Full FP16", [FeatureFPARMv8]>; +def FeatureSPE : SubtargetFeature<"spe", "HasSPE", "true", + "Enable Statistical Profiling extension">; + /// Cyclone has register move instructions which are "free". def FeatureZCRegMove : SubtargetFeature<"zcm", "HasZeroCycleRegMove", "true", "Has zero-cycle register moves">; @@ -88,6 +91,14 @@ include "AArch64SchedA53.td" include "AArch64SchedA57.td" include "AArch64SchedCyclone.td" +def ProcA35 : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35", + "Cortex-A35 ARM processors", + [FeatureFPARMv8, + FeatureNEON, + FeatureCrypto, + FeatureCRC, + FeaturePerfMon]>; + def ProcA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53", "Cortex-A53 ARM processors", [FeatureFPARMv8, @@ -118,6 +129,8 @@ def : ProcessorModel<"generic", NoSchedModel, [FeatureFPARMv8, FeatureCRC, FeaturePerfMon]>; +// FIXME: Cortex-A35 is currently modelled as a Cortex-A53 +def : ProcessorModel<"cortex-a35", CortexA53Model, [ProcA35]>; def : ProcessorModel<"cortex-a53", CortexA53Model, [ProcA53]>; def : ProcessorModel<"cortex-a57", CortexA57Model, [ProcA57]>; // FIXME: Cortex-A72 is currently modelled as an Cortex-A57. diff --git a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index 4311198403fa..6c868880bcac 100644 --- a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -1974,7 +1974,8 @@ static bool isBitfieldPositioningOp(SelectionDAG *CurDAG, SDValue Op, // f = Opc Opd0, Opd1, LSB, MSB ; where Opc is a BFM, LSB = imm, and MSB = imm2 static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Dst, SDValue &Src, unsigned &ImmR, - unsigned &ImmS, SelectionDAG *CurDAG) { + unsigned &ImmS, const APInt &UsefulBits, + SelectionDAG *CurDAG) { assert(N->getOpcode() == ISD::OR && "Expect a OR operation"); // Set Opc @@ -1988,8 +1989,6 @@ static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Dst, // Because of simplify-demanded-bits in DAGCombine, involved masks may not // have the expected shape. Try to undo that. - APInt UsefulBits; - getUsefulBits(SDValue(N, 0), UsefulBits); unsigned NumberOfIgnoredLowBits = UsefulBits.countTrailingZeros(); unsigned NumberOfIgnoredHighBits = UsefulBits.countLeadingZeros(); @@ -2083,11 +2082,18 @@ SDNode *AArch64DAGToDAGISel::SelectBitfieldInsertOp(SDNode *N) { unsigned Opc; unsigned LSB, MSB; SDValue Opd0, Opd1; + EVT VT = N->getValueType(0); + APInt NUsefulBits; + getUsefulBits(SDValue(N, 0), NUsefulBits); + + // If all bits are not useful, just return UNDEF. 
+ if (!NUsefulBits) + return CurDAG->SelectNodeTo(N, TargetOpcode::IMPLICIT_DEF, VT); - if (!isBitfieldInsertOpFromOr(N, Opc, Opd0, Opd1, LSB, MSB, CurDAG)) + if (!isBitfieldInsertOpFromOr(N, Opc, Opd0, Opd1, LSB, MSB, NUsefulBits, + CurDAG)) return nullptr; - EVT VT = N->getValueType(0); SDLoc dl(N); SDValue Ops[] = { Opd0, Opd1, diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index 2232e419a619..f0fb03451b2a 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -4380,46 +4380,57 @@ SDValue AArch64TargetLowering::LowerShiftRightParts(SDValue Op, SDValue ShOpLo = Op.getOperand(0); SDValue ShOpHi = Op.getOperand(1); SDValue ShAmt = Op.getOperand(2); - SDValue ARMcc; unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL; assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS); SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, DAG.getConstant(VTBits, dl, MVT::i64), ShAmt); - SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt); + SDValue HiBitsForLo = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt); + + // Unfortunately, if ShAmt == 0, we just calculated "(SHL ShOpHi, 64)" which + // is "undef". We wanted 0, so CSEL it directly. + SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64), + ISD::SETEQ, dl, DAG); + SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32); + HiBitsForLo = + DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64), + HiBitsForLo, CCVal, Cmp); + SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt, DAG.getConstant(VTBits, dl, MVT::i64)); - SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt); - SDValue Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), - ISD::SETGE, dl, DAG); - SDValue CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32); + SDValue LoBitsForLo = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt); + SDValue LoForNormalShift = + DAG.getNode(ISD::OR, dl, VT, LoBitsForLo, HiBitsForLo); - SDValue FalseValLo = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); - SDValue TrueValLo = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt); - SDValue Lo = - DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValLo, FalseValLo, CCVal, Cmp); + Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE, + dl, DAG); + CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32); + SDValue LoForBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt); + SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, LoForBigShift, + LoForNormalShift, CCVal, Cmp); // AArch64 shifts larger than the register width are wrapped rather than // clamped, so we can't just emit "hi >> x". - SDValue FalseValHi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); - SDValue TrueValHi = Opc == ISD::SRA - ? DAG.getNode(Opc, dl, VT, ShOpHi, - DAG.getConstant(VTBits - 1, dl, - MVT::i64)) - : DAG.getConstant(0, dl, VT); - SDValue Hi = - DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValHi, FalseValHi, CCVal, Cmp); + SDValue HiForNormalShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); + SDValue HiForBigShift = + Opc == ISD::SRA + ? 
DAG.getNode(Opc, dl, VT, ShOpHi, + DAG.getConstant(VTBits - 1, dl, MVT::i64)) + : DAG.getConstant(0, dl, VT); + SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, HiForBigShift, + HiForNormalShift, CCVal, Cmp); SDValue Ops[2] = { Lo, Hi }; return DAG.getMergeValues(Ops, dl); } + /// LowerShiftLeftParts - Lower SHL_PARTS, which returns two /// i64 values and take a 2 x i64 value to shift plus a shift amount. SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op, - SelectionDAG &DAG) const { + SelectionDAG &DAG) const { assert(Op.getNumOperands() == 3 && "Not a double-shift!"); EVT VT = Op.getValueType(); unsigned VTBits = VT.getSizeInBits(); @@ -4427,31 +4438,41 @@ SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op, SDValue ShOpLo = Op.getOperand(0); SDValue ShOpHi = Op.getOperand(1); SDValue ShAmt = Op.getOperand(2); - SDValue ARMcc; assert(Op.getOpcode() == ISD::SHL_PARTS); SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, DAG.getConstant(VTBits, dl, MVT::i64), ShAmt); - SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt); + SDValue LoBitsForHi = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt); + + // Unfortunately, if ShAmt == 0, we just calculated "(SRL ShOpLo, 64)" which + // is "undef". We wanted 0, so CSEL it directly. + SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64), + ISD::SETEQ, dl, DAG); + SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32); + LoBitsForHi = + DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64), + LoBitsForHi, CCVal, Cmp); + SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt, DAG.getConstant(VTBits, dl, MVT::i64)); - SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt); - SDValue Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt); + SDValue HiBitsForHi = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt); + SDValue HiForNormalShift = + DAG.getNode(ISD::OR, dl, VT, LoBitsForHi, HiBitsForHi); - SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); + SDValue HiForBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt); - SDValue Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), - ISD::SETGE, dl, DAG); - SDValue CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32); - SDValue Hi = - DAG.getNode(AArch64ISD::CSEL, dl, VT, Tmp3, FalseVal, CCVal, Cmp); + Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE, + dl, DAG); + CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32); + SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, HiForBigShift, + HiForNormalShift, CCVal, Cmp); // AArch64 shifts of larger than register sizes are wrapped rather than // clamped, so we can't just emit "lo << a" if a is too big. 
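+  // For example, with ShAmt = 70 the hardware would compute
+  // "lo << (70 % 64)" = "lo << 6" rather than 0, so whenever ShAmt >= 64 the
+  // CSEL below must select the explicit zero (LoForBigShift) instead.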
- SDValue TrueValLo = DAG.getConstant(0, dl, VT); - SDValue FalseValLo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); - SDValue Lo = - DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValLo, FalseValLo, CCVal, Cmp); + SDValue LoForBigShift = DAG.getConstant(0, dl, VT); + SDValue LoForNormalShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); + SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, LoForBigShift, + LoForNormalShift, CCVal, Cmp); SDValue Ops[2] = { Lo, Hi }; return DAG.getMergeValues(Ops, dl); diff --git a/lib/Target/AArch64/AArch64InstrFormats.td b/lib/Target/AArch64/AArch64InstrFormats.td index 752a153c0574..5eef82153e39 100644 --- a/lib/Target/AArch64/AArch64InstrFormats.td +++ b/lib/Target/AArch64/AArch64InstrFormats.td @@ -911,6 +911,25 @@ def msr_sysreg_op : Operand { let PrintMethod = "printMSRSystemRegister"; } +def PSBHintOperand : AsmOperandClass { + let Name = "PSBHint"; + let ParserMethod = "tryParsePSBHint"; +} +def psbhint_op : Operand { + let ParserMatchClass = PSBHintOperand; + let PrintMethod = "printPSBHintOp"; + let MCOperandPredicate = [{ + // Check, if operand is valid, to fix exhaustive aliasing in disassembly. + // "psb" is an alias to "hint" only for certain values of CRm:Op2 fields. + if (!MCOp.isImm()) + return false; + bool ValidNamed; + (void)AArch64PSBHint::PSBHintMapper().toString(MCOp.getImm(), + STI.getFeatureBits(), ValidNamed); + return ValidNamed; + }]; +} + class MRSI : RtSystemI<1, (outs GPR64:$Rt), (ins mrs_sysreg_op:$systemreg), "mrs", "\t$Rt, $systemreg"> { bits<16> systemreg; diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td index 0c43003975c5..881f55ebeef9 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.td +++ b/lib/Target/AArch64/AArch64InstrInfo.td @@ -29,6 +29,8 @@ def HasCRC : Predicate<"Subtarget->hasCRC()">, def HasPerfMon : Predicate<"Subtarget->hasPerfMon()">; def HasFullFP16 : Predicate<"Subtarget->hasFullFP16()">, AssemblerPredicate<"FeatureFullFP16", "fullfp16">; +def HasSPE : Predicate<"Subtarget->hasSPE()">, + AssemblerPredicate<"FeatureSPE", "spe">; def IsLE : Predicate<"Subtarget->isLittleEndian()">; def IsBE : Predicate<"!Subtarget->isLittleEndian()">; @@ -382,6 +384,9 @@ def : InstAlias<"wfi", (HINT 0b011)>; def : InstAlias<"sev", (HINT 0b100)>; def : InstAlias<"sevl", (HINT 0b101)>; +// v8.2a Statistical Profiling extension +def : InstAlias<"psb $op", (HINT psbhint_op:$op)>, Requires<[HasSPE]>; + // As far as LLVM is concerned this writes to the system's exclusive monitors. let mayLoad = 1, mayStore = 1 in def CLREX : CRmSystemI; diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h index 9aa6ef9ab670..cf94445a885c 100644 --- a/lib/Target/AArch64/AArch64Subtarget.h +++ b/lib/Target/AArch64/AArch64Subtarget.h @@ -33,7 +33,7 @@ class Triple; class AArch64Subtarget : public AArch64GenSubtargetInfo { protected: - enum ARMProcFamilyEnum {Others, CortexA53, CortexA57, Cyclone}; + enum ARMProcFamilyEnum {Others, CortexA35, CortexA53, CortexA57, Cyclone}; /// ARMProcFamily - ARM processor family: Cortex-A53, Cortex-A57, and others. ARMProcFamilyEnum ARMProcFamily; @@ -47,6 +47,7 @@ class AArch64Subtarget : public AArch64GenSubtargetInfo { bool HasCRC; bool HasPerfMon; bool HasFullFP16; + bool HasSPE; // HasZeroCycleRegMove - Has zero-cycle register mov instructions. 
bool HasZeroCycleRegMove; @@ -124,6 +125,7 @@ class AArch64Subtarget : public AArch64GenSubtargetInfo { bool hasPerfMon() const { return HasPerfMon; } bool hasFullFP16() const { return HasFullFP16; } + bool hasSPE() const { return HasSPE; } bool isLittleEndian() const { return IsLittle; } diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index 165843fc84c9..f0ad855ed5e6 100644 --- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -100,6 +100,7 @@ class AArch64AsmParser : public MCTargetAsmParser { OperandMatchResultTy tryParseSysReg(OperandVector &Operands); OperandMatchResultTy tryParseSysCROperand(OperandVector &Operands); OperandMatchResultTy tryParsePrefetch(OperandVector &Operands); + OperandMatchResultTy tryParsePSBHint(OperandVector &Operands); OperandMatchResultTy tryParseAdrpLabel(OperandVector &Operands); OperandMatchResultTy tryParseAdrLabel(OperandVector &Operands); OperandMatchResultTy tryParseFPImm(OperandVector &Operands); @@ -159,7 +160,8 @@ class AArch64Operand : public MCParsedAsmOperand { k_Prefetch, k_ShiftExtend, k_FPImm, - k_Barrier + k_Barrier, + k_PSBHint, } Kind; SMLoc StartLoc, EndLoc; @@ -227,6 +229,12 @@ class AArch64Operand : public MCParsedAsmOperand { unsigned Length; }; + struct PSBHintOp { + unsigned Val; + const char *Data; + unsigned Length; + }; + struct ShiftExtendOp { AArch64_AM::ShiftExtendType Type; unsigned Amount; @@ -250,6 +258,7 @@ class AArch64Operand : public MCParsedAsmOperand { struct SysRegOp SysReg; struct SysCRImmOp SysCRImm; struct PrefetchOp Prefetch; + struct PSBHintOp PSBHint; struct ShiftExtendOp ShiftExtend; }; @@ -301,6 +310,9 @@ class AArch64Operand : public MCParsedAsmOperand { case k_Prefetch: Prefetch = o.Prefetch; break; + case k_PSBHint: + PSBHint = o.PSBHint; + break; case k_ShiftExtend: ShiftExtend = o.ShiftExtend; break; @@ -392,6 +404,16 @@ class AArch64Operand : public MCParsedAsmOperand { return Prefetch.Val; } + unsigned getPSBHint() const { + assert(Kind == k_PSBHint && "Invalid access!"); + return PSBHint.Val; + } + + StringRef getPSBHintName() const { + assert(Kind == k_PSBHint && "Invalid access!"); + return StringRef(PSBHint.Data, PSBHint.Length); + } + StringRef getPrefetchName() const { assert(Kind == k_Prefetch && "Invalid access!"); return StringRef(Prefetch.Data, Prefetch.Length); @@ -961,6 +983,7 @@ class AArch64Operand : public MCParsedAsmOperand { } bool isSysCR() const { return Kind == k_SysCR; } bool isPrefetch() const { return Kind == k_Prefetch; } + bool isPSBHint() const { return Kind == k_PSBHint; } bool isShiftExtend() const { return Kind == k_ShiftExtend; } bool isShifter() const { if (!isShiftExtend()) @@ -1534,6 +1557,11 @@ class AArch64Operand : public MCParsedAsmOperand { Inst.addOperand(MCOperand::createImm(getPrefetch())); } + void addPSBHintOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createImm(getPSBHint())); + } + void addShifterOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); unsigned Imm = @@ -1730,6 +1758,19 @@ class AArch64Operand : public MCParsedAsmOperand { return Op; } + static std::unique_ptr CreatePSBHint(unsigned Val, + StringRef Str, + SMLoc S, + MCContext &Ctx) { + auto Op = make_unique(k_PSBHint, Ctx); + Op->PSBHint.Val = Val; + Op->PSBHint.Data = Str.data(); + Op->PSBHint.Length = Str.size(); + Op->StartLoc = S; + Op->EndLoc = S; + 
return Op; + } + static std::unique_ptr CreateShiftExtend(AArch64_AM::ShiftExtendType ShOp, unsigned Val, bool HasExplicitAmount, SMLoc S, SMLoc E, MCContext &Ctx) { @@ -1803,6 +1844,10 @@ void AArch64Operand::print(raw_ostream &OS) const { OS << ""; break; } + case k_PSBHint: { + OS << getPSBHintName(); + break; + } case k_ShiftExtend: { OS << "<" << AArch64_AM::getShiftExtendName(getShiftExtendType()) << " #" << getShiftExtendAmount(); @@ -2069,6 +2114,32 @@ AArch64AsmParser::tryParsePrefetch(OperandVector &Operands) { return MatchOperand_Success; } +/// tryParsePSBHint - Try to parse a PSB operand, mapped to Hint command +AArch64AsmParser::OperandMatchResultTy +AArch64AsmParser::tryParsePSBHint(OperandVector &Operands) { + MCAsmParser &Parser = getParser(); + SMLoc S = getLoc(); + const AsmToken &Tok = Parser.getTok(); + if (Tok.isNot(AsmToken::Identifier)) { + TokError("invalid operand for instruction"); + return MatchOperand_ParseFail; + } + + bool Valid; + auto Mapper = AArch64PSBHint::PSBHintMapper(); + unsigned psbhint = + Mapper.fromString(Tok.getString(), getSTI().getFeatureBits(), Valid); + if (!Valid) { + TokError("invalid operand for instruction"); + return MatchOperand_ParseFail; + } + + Parser.Lex(); // Eat identifier token. + Operands.push_back(AArch64Operand::CreatePSBHint(psbhint, Tok.getString(), + S, getContext())); + return MatchOperand_Success; +} + /// tryParseAdrpLabel - Parse and validate a source label for the ADRP /// instruction. AArch64AsmParser::OperandMatchResultTy diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp index d8937b57e490..480ed0d263ac 100644 --- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp +++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp @@ -1144,6 +1144,19 @@ void AArch64InstPrinter::printPrefetchOp(const MCInst *MI, unsigned OpNum, O << '#' << prfop; } +void AArch64InstPrinter::printPSBHintOp(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + unsigned psbhintop = MI->getOperand(OpNum).getImm(); + bool Valid; + StringRef Name = + AArch64PSBHint::PSBHintMapper().toString(psbhintop, STI.getFeatureBits(), Valid); + if (Valid) + O << Name; + else + O << '#' << psbhintop; +} + void AArch64InstPrinter::printFPImmOperand(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O) { diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h index a94721816d33..a767aa451c6a 100644 --- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h +++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h @@ -123,6 +123,9 @@ class AArch64InstPrinter : public MCInstPrinter { void printPrefetchOp(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O); + void printPSBHintOp(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printFPImmOperand(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O); diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp index f657eaab8151..78f5289ec26d 100644 --- a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp +++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp @@ -154,6 +154,14 @@ const AArch64NamedImmMapper::Mapping AArch64PState::PStateMapper::PStateMappings AArch64PState::PStateMapper::PStateMapper() : AArch64NamedImmMapper(PStateMappings, 0) {} +const AArch64NamedImmMapper::Mapping 
AArch64PSBHint::PSBHintMapper::PSBHintMappings[] = { + // v8.2a "Statistical Profiling" extension-specific PSB operand + {"csync", CSync, {AArch64::FeatureSPE}}, +}; + +AArch64PSBHint::PSBHintMapper::PSBHintMapper() + : AArch64NamedImmMapper(PSBHintMappings, 0) {} + const AArch64NamedImmMapper::Mapping AArch64SysReg::MRSMapper::MRSMappings[] = { {"mdccsr_el0", MDCCSR_EL0, {}}, {"dbgdtrrx_el0", DBGDTRRX_EL0, {}}, @@ -808,6 +816,21 @@ const AArch64NamedImmMapper::Mapping AArch64SysReg::SysRegMapper::SysRegMappings // v8.2a registers {"uao", UAO, {AArch64::HasV8_2aOps}}, + + // v8.2a "Statistical Profiling extension" registers + {"pmblimitr_el1", PMBLIMITR_EL1, {AArch64::FeatureSPE}}, + {"pmbptr_el1", PMBPTR_EL1, {AArch64::FeatureSPE}}, + {"pmbsr_el1", PMBSR_EL1, {AArch64::FeatureSPE}}, + {"pmbidr_el1", PMBIDR_EL1, {AArch64::FeatureSPE}}, + {"pmscr_el2", PMSCR_EL2, {AArch64::FeatureSPE}}, + {"pmscr_el12", PMSCR_EL12, {AArch64::FeatureSPE}}, + {"pmscr_el1", PMSCR_EL1, {AArch64::FeatureSPE}}, + {"pmsicr_el1", PMSICR_EL1, {AArch64::FeatureSPE}}, + {"pmsirr_el1", PMSIRR_EL1, {AArch64::FeatureSPE}}, + {"pmsfcr_el1", PMSFCR_EL1, {AArch64::FeatureSPE}}, + {"pmsevfr_el1", PMSEVFR_EL1, {AArch64::FeatureSPE}}, + {"pmslatfr_el1", PMSLATFR_EL1, {AArch64::FeatureSPE}}, + {"pmsidr_el1", PMSIDR_EL1, {AArch64::FeatureSPE}}, }; uint32_t diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/lib/Target/AArch64/Utils/AArch64BaseInfo.h index 5a6b54bbee83..f649cb9b8a8d 100644 --- a/lib/Target/AArch64/Utils/AArch64BaseInfo.h +++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.h @@ -478,6 +478,21 @@ namespace AArch64PState { } +namespace AArch64PSBHint { + enum PSBHintValues { + Invalid = -1, + // v8.2a "Statistical Profiling" extension-specific PSB operands + CSync = 0x11, // psb csync = hint #0x11 + }; + + struct PSBHintMapper : AArch64NamedImmMapper { + const static Mapping PSBHintMappings[]; + + PSBHintMapper(); + }; + +} + namespace AArch64SE { enum ShiftExtSpecifiers { Invalid = -1, @@ -1199,6 +1214,21 @@ namespace AArch64SysReg { // v8.2a registers UAO = 0xc214, // 11 000 0100 0010 100 + // v8.2a "Statistical Profiling extension" registers + PMBLIMITR_EL1 = 0xc4d0, // 11 000 1001 1010 000 + PMBPTR_EL1 = 0xc4d1, // 11 000 1001 1010 001 + PMBSR_EL1 = 0xc4d3, // 11 000 1001 1010 011 + PMBIDR_EL1 = 0xc4d7, // 11 000 1001 1010 111 + PMSCR_EL2 = 0xe4c8, // 11 100 1001 1001 000 + PMSCR_EL12 = 0xecc8, // 11 101 1001 1001 000 + PMSCR_EL1 = 0xc4c8, // 11 000 1001 1001 000 + PMSICR_EL1 = 0xc4ca, // 11 000 1001 1001 010 + PMSIRR_EL1 = 0xc4cb, // 11 000 1001 1001 011 + PMSFCR_EL1 = 0xc4cc, // 11 000 1001 1001 100 + PMSEVFR_EL1 = 0xc4cd, // 11 000 1001 1001 101 + PMSLATFR_EL1 = 0xc4ce, // 11 000 1001 1001 110 + PMSIDR_EL1 = 0xc4cf, // 11 000 1001 1001 111 + // Cyclone specific system registers CPM_IOACC_CTL_EL3 = 0xff90, }; diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h index 80766086e15c..a620e85101e6 100644 --- a/lib/Target/AMDGPU/AMDGPU.h +++ b/lib/Target/AMDGPU/AMDGPU.h @@ -48,7 +48,6 @@ FunctionPass *createSIFixSGPRCopiesPass(); FunctionPass *createSIFixSGPRLiveRangesPass(); FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS); FunctionPass *createSIInsertWaits(TargetMachine &tm); -FunctionPass *createSIPrepareScratchRegs(); ModulePass *createAMDGPUAnnotateKernelFeaturesPass(); void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &); diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 314ef721c1fc..b677caa6c2c6 100644 --- 
a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -123,6 +123,48 @@ void AMDGPUAsmPrinter::EmitFunctionEntryLabel() { AsmPrinter::EmitFunctionEntryLabel(); } +static bool isModuleLinkage(const GlobalValue *GV) { + switch (GV->getLinkage()) { + case GlobalValue::InternalLinkage: + case GlobalValue::CommonLinkage: + return true; + case GlobalValue::ExternalLinkage: + return false; + default: llvm_unreachable("unknown linkage type"); + } +} + +void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { + + if (TM.getTargetTriple().getOS() != Triple::AMDHSA || + GV->isDeclaration()) { + AsmPrinter::EmitGlobalVariable(GV); + return; + } + + // Group segment variables aren't emitted in HSA. + if (AMDGPU::isGroupSegment(GV)) + return; + + AMDGPUTargetStreamer *TS = + static_cast(OutStreamer->getTargetStreamer()); + if (isModuleLinkage(GV)) { + TS->EmitAMDGPUHsaModuleScopeGlobal(GV->getName()); + } else { + TS->EmitAMDGPUHsaProgramScopeGlobal(GV->getName()); + } + + const DataLayout &DL = getDataLayout(); + OutStreamer->PushSection(); + OutStreamer->SwitchSection( + getObjFileLowering().SectionForGlobal(GV, *Mang, TM)); + MCSymbol *GVSym = getSymbol(GV); + const Constant *C = GV->getInitializer(); + OutStreamer->EmitLabel(GVSym); + EmitGlobalConstant(DL, C); + OutStreamer->PopSection(); +} + bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { // The starting address of all shader programs must be 256 bytes aligned. @@ -401,6 +443,11 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, ProgInfo.NumSGPR = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG; } + if (MFI->NumUserSGPRs > STM.getMaxNumUserSGPRs()) { + LLVMContext &Ctx = MF.getFunction()->getContext(); + Ctx.emitError("too many user SGPRs used"); + } + ProgInfo.VGPRBlocks = (ProgInfo.NumVGPR - 1) / 4; ProgInfo.SGPRBlocks = (ProgInfo.NumSGPR - 1) / 8; // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode @@ -452,18 +499,27 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, S_00B848_FLOAT_MODE(ProgInfo.FloatMode) | S_00B848_PRIV(ProgInfo.Priv) | S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp) | - S_00B848_IEEE_MODE(ProgInfo.DebugMode) | + S_00B848_DEBUG_MODE(ProgInfo.DebugMode) | S_00B848_IEEE_MODE(ProgInfo.IEEEMode); + // 0 = X, 1 = XY, 2 = XYZ + unsigned TIDIGCompCnt = 0; + if (MFI->hasWorkItemIDZ()) + TIDIGCompCnt = 2; + else if (MFI->hasWorkItemIDY()) + TIDIGCompCnt = 1; + ProgInfo.ComputePGMRSrc2 = S_00B84C_SCRATCH_EN(ProgInfo.ScratchBlocks > 0) | - S_00B84C_USER_SGPR(MFI->NumUserSGPRs) | - S_00B84C_TGID_X_EN(1) | - S_00B84C_TGID_Y_EN(1) | - S_00B84C_TGID_Z_EN(1) | - S_00B84C_TG_SIZE_EN(1) | - S_00B84C_TIDIG_COMP_CNT(2) | - S_00B84C_LDS_SIZE(ProgInfo.LDSBlocks); + S_00B84C_USER_SGPR(MFI->getNumUserSGPRs()) | + S_00B84C_TGID_X_EN(MFI->hasWorkGroupIDX()) | + S_00B84C_TGID_Y_EN(MFI->hasWorkGroupIDY()) | + S_00B84C_TGID_Z_EN(MFI->hasWorkGroupIDZ()) | + S_00B84C_TG_SIZE_EN(MFI->hasWorkGroupInfo()) | + S_00B84C_TIDIG_COMP_CNT(TIDIGCompCnt) | + S_00B84C_EXCP_EN_MSB(0) | + S_00B84C_LDS_SIZE(ProgInfo.LDSBlocks) | + S_00B84C_EXCP_EN(0); } static unsigned getRsrcReg(unsigned ShaderType) { @@ -524,9 +580,44 @@ void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF, header.compute_pgm_resource_registers = KernelInfo.ComputePGMRSrc1 | (KernelInfo.ComputePGMRSrc2 << 32); - header.code_properties = - AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR | - AMD_CODE_PROPERTY_IS_PTR64; + header.code_properties = AMD_CODE_PROPERTY_IS_PTR64; + + 
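+  // Each user SGPR input that argument lowering actually enabled sets its
+  // AMD_CODE_PROPERTY_ENABLE_SGPR_* bit below; inputs that are not used are
+  // no longer advertised unconditionally.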
if (MFI->hasPrivateSegmentBuffer()) { + header.code_properties |= + AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER; + } + + if (MFI->hasDispatchPtr()) + header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; + + if (MFI->hasQueuePtr()) + header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR; + + if (MFI->hasKernargSegmentPtr()) + header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR; + + if (MFI->hasDispatchID()) + header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID; + + if (MFI->hasFlatScratchInit()) + header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT; + + // TODO: Private segment size + + if (MFI->hasGridWorkgroupCountX()) { + header.code_properties |= + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X; + } + + if (MFI->hasGridWorkgroupCountY()) { + header.code_properties |= + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y; + } + + if (MFI->hasGridWorkgroupCountZ()) { + header.code_properties |= + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z; + } if (MFI->hasDispatchPtr()) header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h index 687f239ecab5..1aaef00a4dd0 100644 --- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h +++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h @@ -103,6 +103,8 @@ class AMDGPUAsmPrinter : public AsmPrinter { void EmitFunctionEntryLabel() override; + void EmitGlobalVariable(const GlobalVariable *GV) override; + bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, unsigned AsmVariant, const char *ExtraCode, raw_ostream &O) override; diff --git a/lib/Target/AMDGPU/AMDGPUHSATargetObjectFile.cpp b/lib/Target/AMDGPU/AMDGPUHSATargetObjectFile.cpp index fa54f4a017cb..32f53edeb770 100644 --- a/lib/Target/AMDGPU/AMDGPUHSATargetObjectFile.cpp +++ b/lib/Target/AMDGPU/AMDGPUHSATargetObjectFile.cpp @@ -8,6 +8,7 @@ //===----------------------------------------------------------------------===// #include "AMDGPUHSATargetObjectFile.h" +#include "AMDGPU.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCSectionELF.h" @@ -22,6 +23,30 @@ void AMDGPUHSATargetObjectFile::Initialize(MCContext &Ctx, TextSection = AMDGPU::getHSATextSection(Ctx); + DataGlobalAgentSection = AMDGPU::getHSADataGlobalAgentSection(Ctx); + DataGlobalProgramSection = AMDGPU::getHSADataGlobalProgramSection(Ctx); + + RodataReadonlyAgentSection = AMDGPU::getHSARodataReadonlyAgentSection(Ctx); +} + +bool AMDGPUHSATargetObjectFile::isAgentAllocationSection( + const char *SectionName) const { + return cast(DataGlobalAgentSection) + ->getSectionName() + .equals(SectionName); +} + +bool AMDGPUHSATargetObjectFile::isAgentAllocation(const GlobalValue *GV) const { + // Read-only segments can only have agent allocation. + return AMDGPU::isReadOnlySegment(GV) || + (AMDGPU::isGlobalSegment(GV) && GV->hasSection() && + isAgentAllocationSection(GV->getSection())); +} + +bool AMDGPUHSATargetObjectFile::isProgramAllocation( + const GlobalValue *GV) const { + // The default for global segments is program allocation. 
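+  // (A global-segment variable defaults to program allocation unless
+  // isAgentAllocation() above already claimed it, i.e. it is read-only or was
+  // explicitly placed in the agent data section.)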
+ return AMDGPU::isGlobalSegment(GV) && !isAgentAllocation(GV); } MCSection *AMDGPUHSATargetObjectFile::SelectSectionForGlobal( @@ -31,5 +56,16 @@ MCSection *AMDGPUHSATargetObjectFile::SelectSectionForGlobal( if (Kind.isText() && !GV->hasComdat()) return getTextSection(); + if (AMDGPU::isGlobalSegment(GV)) { + if (isAgentAllocation(GV)) + return DataGlobalAgentSection; + + if (isProgramAllocation(GV)) + return DataGlobalProgramSection; + } + + if (Kind.isReadOnly() && AMDGPU::isReadOnlySegment(GV)) + return RodataReadonlyAgentSection; + return TargetLoweringObjectFileELF::SelectSectionForGlobal(GV, Kind, Mang, TM); } diff --git a/lib/Target/AMDGPU/AMDGPUHSATargetObjectFile.h b/lib/Target/AMDGPU/AMDGPUHSATargetObjectFile.h index e1aca67b97c2..9ea51ec9b29e 100644 --- a/lib/Target/AMDGPU/AMDGPUHSATargetObjectFile.h +++ b/lib/Target/AMDGPU/AMDGPUHSATargetObjectFile.h @@ -22,6 +22,15 @@ namespace llvm { class AMDGPUHSATargetObjectFile final : public TargetLoweringObjectFileELF { +private: + MCSection *DataGlobalAgentSection; + MCSection *DataGlobalProgramSection; + MCSection *RodataReadonlyAgentSection; + + bool isAgentAllocationSection(const char *SectionName) const; + bool isAgentAllocation(const GlobalValue *GV) const; + bool isProgramAllocation(const GlobalValue *GV) const; + public: void Initialize(MCContext &Ctx, const TargetMachine &TM) override; diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 04a0c1d06aff..ea7c6429b7df 100644 --- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -11,6 +11,8 @@ /// \brief Defines an instruction selector for the AMDGPU target. // //===----------------------------------------------------------------------===// + +#include "AMDGPUDiagnosticInfoUnsupported.h" #include "AMDGPUInstrInfo.h" #include "AMDGPUISelLowering.h" // For AMDGPUISD #include "AMDGPURegisterInfo.h" @@ -458,41 +460,6 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { N = glueCopyToM0(N); break; } - case AMDGPUISD::REGISTER_LOAD: { - if (Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) - break; - SDValue Addr, Offset; - - SDLoc DL(N); - SelectADDRIndirect(N->getOperand(1), Addr, Offset); - const SDValue Ops[] = { - Addr, - Offset, - CurDAG->getTargetConstant(0, DL, MVT::i32), - N->getOperand(0), - }; - return CurDAG->getMachineNode(AMDGPU::SI_RegisterLoad, DL, - CurDAG->getVTList(MVT::i32, MVT::i64, - MVT::Other), - Ops); - } - case AMDGPUISD::REGISTER_STORE: { - if (Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) - break; - SDValue Addr, Offset; - SelectADDRIndirect(N->getOperand(2), Addr, Offset); - SDLoc DL(N); - const SDValue Ops[] = { - N->getOperand(1), - Addr, - Offset, - CurDAG->getTargetConstant(0, DL, MVT::i32), - N->getOperand(0), - }; - return CurDAG->getMachineNode(AMDGPU::SI_RegisterStorePseudo, DL, - CurDAG->getVTList(MVT::Other), - Ops); - } case AMDGPUISD::BFE_I32: case AMDGPUISD::BFE_U32: { @@ -1062,36 +1029,10 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc, SDLoc DL(Addr); MachineFunction &MF = CurDAG->getMachineFunction(); - const SIRegisterInfo *TRI = - static_cast(Subtarget->getRegisterInfo()); - MachineRegisterInfo &MRI = MF.getRegInfo(); - const SITargetLowering& Lowering = - *static_cast(getTargetLowering()); - - unsigned ScratchOffsetReg = - TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET); - Lowering.CreateLiveInRegister(*CurDAG, &AMDGPU::SReg_32RegClass, - ScratchOffsetReg, MVT::i32); - 
SDValue Sym0 = CurDAG->getExternalSymbol("SCRATCH_RSRC_DWORD0", MVT::i32); - SDValue ScratchRsrcDword0 = - SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, Sym0), 0); - - SDValue Sym1 = CurDAG->getExternalSymbol("SCRATCH_RSRC_DWORD1", MVT::i32); - SDValue ScratchRsrcDword1 = - SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, Sym1), 0); + const SIMachineFunctionInfo *Info = MF.getInfo(); - const SDValue RsrcOps[] = { - CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32), - ScratchRsrcDword0, - CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32), - ScratchRsrcDword1, - CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32), - }; - SDValue ScratchPtr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL, - MVT::v2i32, RsrcOps), 0); - Rsrc = SDValue(Lowering.buildScratchRSRC(*CurDAG, DL, ScratchPtr), 0); - SOffset = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL, - MRI.getLiveInVirtReg(ScratchOffsetReg), MVT::i32); + Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32); + SOffset = CurDAG->getRegister(Info->getScratchWaveOffsetReg(), MVT::i32); // (add n0, c1) if (CurDAG->isBaseWithConstantOffset(Addr)) { @@ -1269,13 +1210,14 @@ SDNode *AMDGPUDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) { AddrSpaceCastSDNode *ASC = cast(N); SDLoc DL(N); + const MachineFunction &MF = CurDAG->getMachineFunction(); + DiagnosticInfoUnsupported NotImplemented(*MF.getFunction(), + "addrspacecast not implemented"); + CurDAG->getContext()->diagnose(NotImplemented); + assert(Subtarget->hasFlatAddressSpace() && "addrspacecast only supported with flat address space!"); - assert((ASC->getSrcAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS && - ASC->getDestAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) && - "Cannot cast address space to / from constant address!"); - assert((ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS || ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) && "Can only cast to / from flat address space!"); diff --git a/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp index 21c7da663234..7c595d5a83e4 100644 --- a/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp +++ b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp @@ -13,6 +13,7 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) : MachineFunctionInfo(), ShaderType(ShaderType::COMPUTE), LDSSize(0), + ABIArgOffset(0), ScratchSize(0), IsKernel(true) { Attribute A = MF.getFunction()->getFnAttribute(ShaderTypeAttribute); diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h index 9921630326b4..971b5179b13c 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -303,6 +303,9 @@ class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo { return isAmdHsaOS() ? 
0 : 36; } + unsigned getMaxNumUserSGPRs() const { + return 16; + } }; } // End namespace llvm diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 4e31c7ab4d4c..7b0445db4df2 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -327,7 +327,6 @@ void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) { } void GCNPassConfig::addPostRegAlloc() { - addPass(createSIPrepareScratchRegs(), false); addPass(createSIShrinkInstructionsPass(), false); } diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 6dacc742b129..4afcc60984fc 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -80,3 +80,14 @@ unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) { // Semi-arbitrary large amount. return 64; } + +int AMDGPUTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, + unsigned Index) { + switch (Opcode) { + case Instruction::ExtractElement: + // Dynamic indexing isn't free and is best avoided. + return Index == ~0u ? 2 : 0; + default: + return BaseT::getVectorInstrCost(Opcode, ValTy, Index); + } +} diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index dee0a69d1e68..5a94a0ba4706 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -60,6 +60,8 @@ class AMDGPUTTIImpl : public BasicTTIImplBase { unsigned getNumberOfRegisters(bool Vector); unsigned getRegisterBitWidth(bool Vector); unsigned getMaxInterleaveFactor(unsigned VF); + + int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index); }; } // end namespace llvm diff --git a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp index f1b383017901..cdbd12092150 100644 --- a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp +++ b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp @@ -1570,8 +1570,7 @@ void AMDGPUCFGStructurizer::mergeLooplandBlock(MachineBasicBlock *DstBlk, insertInstrBefore(DstBlk, AMDGPU::WHILELOOP, DebugLoc()); insertInstrEnd(DstBlk, AMDGPU::ENDLOOP, DebugLoc()); - DstBlk->addSuccessor(LandMBB); - DstBlk->removeSuccessor(DstBlk); + DstBlk->replaceSuccessor(DstBlk, LandMBB); } @@ -1666,8 +1665,7 @@ AMDGPUCFGStructurizer::cloneBlockForPredecessor(MachineBasicBlock *MBB, replaceInstrUseOfBlockWith(PredMBB, MBB, CloneMBB); //srcBlk, oldBlk, newBlk - PredMBB->removeSuccessor(MBB); - PredMBB->addSuccessor(CloneMBB); + PredMBB->replaceSuccessor(MBB, CloneMBB); // add all successor to cloneBlk cloneSuccessorList(CloneMBB, MBB); diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index db5cebf6e42c..7359cfee7f27 100644 --- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -332,6 +332,14 @@ class AMDGPUAsmParser : public MCTargetAsmParser { unsigned ForcedEncodingSize; + bool isSI() const { + return STI->getFeatureBits()[AMDGPU::FeatureSouthernIslands]; + } + + bool isCI() const { + return STI->getFeatureBits()[AMDGPU::FeatureSeaIslands]; + } + bool isVI() const { return getSTI().getFeatureBits()[AMDGPU::FeatureVolcanicIslands]; } @@ -357,6 +365,11 @@ class AMDGPUAsmParser : public MCTargetAsmParser { bool ParseSectionDirectiveHSAText(); bool subtargetHasRegister(const MCRegisterInfo &MRI, unsigned RegNo) const; bool 
ParseDirectiveAMDGPUHsaKernel(); + bool ParseDirectiveAMDGPUHsaModuleGlobal(); + bool ParseDirectiveAMDGPUHsaProgramGlobal(); + bool ParseSectionDirectiveHSADataGlobalAgent(); + bool ParseSectionDirectiveHSADataGlobalProgram(); + bool ParseSectionDirectiveHSARodataReadonlyAgent(); public: public: @@ -504,12 +517,14 @@ bool AMDGPUAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &End const AsmToken Tok = Parser.getTok(); StartLoc = Tok.getLoc(); EndLoc = Tok.getEndLoc(); + const MCRegisterInfo *TRI = getContext().getRegisterInfo(); + StringRef RegName = Tok.getString(); RegNo = getRegForName(RegName); if (RegNo) { Parser.Lex(); - return false; + return !subtargetHasRegister(*TRI, RegNo); } // Match vgprs and sgprs @@ -562,7 +577,6 @@ bool AMDGPUAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &End } } - const MCRegisterInfo *TRI = getContext().getRegisterInfo(); int RCID = getRegClass(IsVgpr, RegWidth); if (RCID == -1) return true; @@ -957,6 +971,46 @@ bool AMDGPUAsmParser::ParseDirectiveAMDGPUHsaKernel() { return false; } +bool AMDGPUAsmParser::ParseDirectiveAMDGPUHsaModuleGlobal() { + if (getLexer().isNot(AsmToken::Identifier)) + return TokError("expected symbol name"); + + StringRef GlobalName = Parser.getTok().getIdentifier(); + + getTargetStreamer().EmitAMDGPUHsaModuleScopeGlobal(GlobalName); + Lex(); + return false; +} + +bool AMDGPUAsmParser::ParseDirectiveAMDGPUHsaProgramGlobal() { + if (getLexer().isNot(AsmToken::Identifier)) + return TokError("expected symbol name"); + + StringRef GlobalName = Parser.getTok().getIdentifier(); + + getTargetStreamer().EmitAMDGPUHsaProgramScopeGlobal(GlobalName); + Lex(); + return false; +} + +bool AMDGPUAsmParser::ParseSectionDirectiveHSADataGlobalAgent() { + getParser().getStreamer().SwitchSection( + AMDGPU::getHSADataGlobalAgentSection(getContext())); + return false; +} + +bool AMDGPUAsmParser::ParseSectionDirectiveHSADataGlobalProgram() { + getParser().getStreamer().SwitchSection( + AMDGPU::getHSADataGlobalProgramSection(getContext())); + return false; +} + +bool AMDGPUAsmParser::ParseSectionDirectiveHSARodataReadonlyAgent() { + getParser().getStreamer().SwitchSection( + AMDGPU::getHSARodataReadonlyAgentSection(getContext())); + return false; +} + bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) { StringRef IDVal = DirectiveID.getString(); @@ -975,14 +1029,41 @@ bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) { if (IDVal == ".amdgpu_hsa_kernel") return ParseDirectiveAMDGPUHsaKernel(); + if (IDVal == ".amdgpu_hsa_module_global") + return ParseDirectiveAMDGPUHsaModuleGlobal(); + + if (IDVal == ".amdgpu_hsa_program_global") + return ParseDirectiveAMDGPUHsaProgramGlobal(); + + if (IDVal == ".hsadata_global_agent") + return ParseSectionDirectiveHSADataGlobalAgent(); + + if (IDVal == ".hsadata_global_program") + return ParseSectionDirectiveHSADataGlobalProgram(); + + if (IDVal == ".hsarodata_readonly_agent") + return ParseSectionDirectiveHSARodataReadonlyAgent(); + return true; } bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI, unsigned RegNo) const { - if (!isVI()) + if (isCI()) return true; + if (isSI()) { + // No flat_scr + switch (RegNo) { + case AMDGPU::FLAT_SCR: + case AMDGPU::FLAT_SCR_LO: + case AMDGPU::FLAT_SCR_HI: + return false; + default: + return true; + } + } + // VI only has 102 SGPRs, so make sure we aren't trying to use the 2 more that // SI/CI have. 
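+  // (i.e. reject SGPR102/SGPR103, and any register aliasing them, when the
+  // subtarget is VI.)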
for (MCRegAliasIterator R(AMDGPU::SGPR102_SGPR103, &MRI, true); diff --git a/lib/Target/AMDGPU/CMakeLists.txt b/lib/Target/AMDGPU/CMakeLists.txt index 7a4b5bb6d359..64c9e1882e4f 100644 --- a/lib/Target/AMDGPU/CMakeLists.txt +++ b/lib/Target/AMDGPU/CMakeLists.txt @@ -57,7 +57,6 @@ add_llvm_target(AMDGPUCodeGen SILowerControlFlow.cpp SILowerI1Copies.cpp SIMachineFunctionInfo.cpp - SIPrepareScratchRegs.cpp SIRegisterInfo.cpp SIShrinkInstructions.cpp SITypeRewriter.cpp diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp index d79ffdf52a74..68b1d1ae83cc 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp @@ -22,13 +22,6 @@ AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Triple &TT) : MCAsmInfoELF() { InlineAsmEnd = ";#ASMEND"; //===--- Data Emission Directives -------------------------------------===// - ZeroDirective = ".zero"; - AsciiDirective = ".ascii\t"; - AscizDirective = ".asciz\t"; - Data8bitsDirective = ".byte\t"; - Data16bitsDirective = ".short\t"; - Data32bitsDirective = ".long\t"; - Data64bitsDirective = ".quad\t"; SunStyleELFSectionSwitchSyntax = true; UsesELFSectionDirectiveForBSS = true; @@ -43,6 +36,8 @@ AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Triple &TT) : MCAsmInfoELF() { } bool AMDGPUMCAsmInfo::shouldOmitSectionDirective(StringRef SectionName) const { - return SectionName == ".hsatext" || + return SectionName == ".hsatext" || SectionName == ".hsadata_global_agent" || + SectionName == ".hsadata_global_program" || + SectionName == ".hsarodata_readonly_agent" || MCAsmInfo::shouldOmitSectionDirective(SectionName); } diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index c876fa906a9f..b91134d2ee9b 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -231,6 +231,16 @@ void AMDGPUTargetAsmStreamer::EmitAMDGPUSymbolType(StringRef SymbolName, } } +void AMDGPUTargetAsmStreamer::EmitAMDGPUHsaModuleScopeGlobal( + StringRef GlobalName) { + OS << "\t.amdgpu_hsa_module_global " << GlobalName << '\n'; +} + +void AMDGPUTargetAsmStreamer::EmitAMDGPUHsaProgramScopeGlobal( + StringRef GlobalName) { + OS << "\t.amdgpu_hsa_program_global " << GlobalName << '\n'; +} + //===----------------------------------------------------------------------===// // AMDGPUTargetELFStreamer //===----------------------------------------------------------------------===// @@ -316,3 +326,21 @@ void AMDGPUTargetELFStreamer::EmitAMDGPUSymbolType(StringRef SymbolName, getStreamer().getContext().getOrCreateSymbol(SymbolName)); Symbol->setType(ELF::STT_AMDGPU_HSA_KERNEL); } + +void AMDGPUTargetELFStreamer::EmitAMDGPUHsaModuleScopeGlobal( + StringRef GlobalName) { + + MCSymbolELF *Symbol = cast( + getStreamer().getContext().getOrCreateSymbol(GlobalName)); + Symbol->setType(ELF::STT_OBJECT); + Symbol->setBinding(ELF::STB_LOCAL); +} + +void AMDGPUTargetELFStreamer::EmitAMDGPUHsaProgramScopeGlobal( + StringRef GlobalName) { + + MCSymbolELF *Symbol = cast( + getStreamer().getContext().getOrCreateSymbol(GlobalName)); + Symbol->setType(ELF::STT_OBJECT); + Symbol->setBinding(ELF::STB_GLOBAL); +} diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h index 30fb3014f9ee..83bb728f541c 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h +++ 
b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h @@ -32,6 +32,10 @@ class AMDGPUTargetStreamer : public MCTargetStreamer { virtual void EmitAMDKernelCodeT(const amd_kernel_code_t &Header) = 0; virtual void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) = 0; + + virtual void EmitAMDGPUHsaModuleScopeGlobal(StringRef GlobalName) = 0; + + virtual void EmitAMDGPUHsaProgramScopeGlobal(StringRef GlobalName) = 0; }; class AMDGPUTargetAsmStreamer : public AMDGPUTargetStreamer { @@ -48,6 +52,10 @@ class AMDGPUTargetAsmStreamer : public AMDGPUTargetStreamer { void EmitAMDKernelCodeT(const amd_kernel_code_t &Header) override; void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) override; + + void EmitAMDGPUHsaModuleScopeGlobal(StringRef GlobalName) override; + + void EmitAMDGPUHsaProgramScopeGlobal(StringRef GlobalName) override; }; class AMDGPUTargetELFStreamer : public AMDGPUTargetStreamer { @@ -80,6 +88,10 @@ class AMDGPUTargetELFStreamer : public AMDGPUTargetStreamer { void EmitAMDKernelCodeT(const amd_kernel_code_t &Header) override; void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) override; + + void EmitAMDGPUHsaModuleScopeGlobal(StringRef GlobalName) override; + + void EmitAMDGPUHsaProgramScopeGlobal(StringRef GlobalName) override; }; } diff --git a/lib/Target/AMDGPU/SIFrameLowering.cpp b/lib/Target/AMDGPU/SIFrameLowering.cpp index a2d8fa1b0a10..6b3c81c3af74 100644 --- a/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -8,17 +8,227 @@ //==-----------------------------------------------------------------------===// #include "SIFrameLowering.h" +#include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/RegisterScavenging.h" using namespace llvm; + +static bool hasOnlySGPRSpills(const SIMachineFunctionInfo *FuncInfo, + const MachineFrameInfo *FrameInfo) { + if (!FuncInfo->hasSpilledSGPRs()) + return false; + + if (FuncInfo->hasSpilledVGPRs()) + return false; + + for (int I = FrameInfo->getObjectIndexBegin(), + E = FrameInfo->getObjectIndexEnd(); I != E; ++I) { + if (!FrameInfo->isSpillSlotObjectIndex(I)) + return false; + } + + return true; +} + +static ArrayRef getAllSGPR128() { + return makeArrayRef(AMDGPU::SReg_128RegClass.begin(), + AMDGPU::SReg_128RegClass.getNumRegs()); +} + +static ArrayRef getAllSGPRs() { + return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(), + AMDGPU::SGPR_32RegClass.getNumRegs()); +} + +void SIFrameLowering::emitPrologue(MachineFunction &MF, + MachineBasicBlock &MBB) const { + if (!MF.getFrameInfo()->hasStackObjects()) + return; + + assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported"); + + SIMachineFunctionInfo *MFI = MF.getInfo(); + + // If we only have SGPR spills, we won't actually be using scratch memory + // since these spill to VGPRs. + // + // FIXME: We should be cleaning up these unused SGPR spill frame indices + // somewhere. + if (hasOnlySGPRSpills(MFI, MF.getFrameInfo())) + return; + + const SIInstrInfo *TII = + static_cast(MF.getSubtarget().getInstrInfo()); + const SIRegisterInfo *TRI = &TII->getRegisterInfo(); + const AMDGPUSubtarget &ST = MF.getSubtarget(); + + // We need to insert initialization of the scratch resource descriptor. 
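+  // The code below (1) looks up the scratch rsrc and scratch wave offset
+  // registers chosen during argument lowering, (2) if they are still the
+  // tentatively reserved registers, shifts them down onto the first unused
+  // SGPRs, and (3) emits the copies (HSA) or relocation-based moves (non-HSA)
+  // that materialize the resource descriptor.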
+ unsigned ScratchRsrcReg = MFI->getScratchRSrcReg(); + assert(ScratchRsrcReg != AMDGPU::NoRegister); + + unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg(); + assert(ScratchWaveOffsetReg != AMDGPU::NoRegister); + + unsigned PreloadedScratchWaveOffsetReg = TRI->getPreloadedValue( + MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); + + unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister; + if (ST.isAmdHsaOS()) { + PreloadedPrivateBufferReg = TRI->getPreloadedValue( + MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER); + } + + // If we reserved the original input registers, we don't need to copy to the + // reserved registers. + if (ScratchRsrcReg == PreloadedPrivateBufferReg) { + // We should always reserve these 5 registers at the same time. + assert(ScratchWaveOffsetReg == PreloadedScratchWaveOffsetReg && + "scratch wave offset and private segment buffer inconsistent"); + return; + } + + + // We added live-ins during argument lowering, but since they were not used + // they were deleted. We're adding the uses now, so add them back. + MachineRegisterInfo &MRI = MF.getRegInfo(); + MRI.addLiveIn(PreloadedScratchWaveOffsetReg); + MBB.addLiveIn(PreloadedScratchWaveOffsetReg); + + if (ST.isAmdHsaOS()) { + MRI.addLiveIn(PreloadedPrivateBufferReg); + MBB.addLiveIn(PreloadedPrivateBufferReg); + } + + // We reserved the last registers for this. Shift it down to the end of those + // which were actually used. + // + // FIXME: It might be safer to use a pseudoregister before replacement. + + // FIXME: We should be able to eliminate unused input registers. We only + // cannot do this for the resources required for scratch access. For now we + // skip over user SGPRs and may leave unused holes. + + // We find the resource first because it has an alignment requirement. + if (ScratchRsrcReg == TRI->reservedPrivateSegmentBufferReg(MF)) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + + unsigned NumPreloaded = MFI->getNumPreloadedSGPRs() / 4; + // Skip the last 2 elements because the last one is reserved for VCC, and + // this is the 2nd to last element already. + for (MCPhysReg Reg : getAllSGPR128().drop_back(2).slice(NumPreloaded)) { + // Pick the first unallocated one. Make sure we don't clobber the other + // reserved input we needed. + if (!MRI.isPhysRegUsed(Reg)) { + assert(MRI.isAllocatable(Reg)); + MRI.replaceRegWith(ScratchRsrcReg, Reg); + ScratchRsrcReg = Reg; + MFI->setScratchRSrcReg(ScratchRsrcReg); + break; + } + } + } + + if (ScratchWaveOffsetReg == TRI->reservedPrivateSegmentWaveByteOffsetReg(MF)) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + // Skip the last 2 elements because the last one is reserved for VCC, and + // this is the 2nd to last element already. + unsigned NumPreloaded = MFI->getNumPreloadedSGPRs(); + for (MCPhysReg Reg : getAllSGPRs().drop_back(6).slice(NumPreloaded)) { + // Pick the first unallocated SGPR. Be careful not to pick an alias of the + // scratch descriptor, since we haven’t added its uses yet. 
+ if (!MRI.isPhysRegUsed(Reg)) { + assert(MRI.isAllocatable(Reg) && + !TRI->isSubRegisterEq(ScratchRsrcReg, Reg)); + + MRI.replaceRegWith(ScratchWaveOffsetReg, Reg); + ScratchWaveOffsetReg = Reg; + MFI->setScratchWaveOffsetReg(ScratchWaveOffsetReg); + break; + } + } + } + + + assert(!TRI->isSubRegister(ScratchRsrcReg, ScratchWaveOffsetReg)); + + const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32); + MachineBasicBlock::iterator I = MBB.begin(); + DebugLoc DL; + + if (PreloadedScratchWaveOffsetReg != ScratchWaveOffsetReg) { + // Make sure we emit the copy for the offset first. We may have chosen to copy + // the buffer resource into a register that aliases the input offset register. + BuildMI(MBB, I, DL, SMovB32, ScratchWaveOffsetReg) + .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill); + } + + if (ST.isAmdHsaOS()) { + // Insert copies from argument register. + assert( + !TRI->isSubRegisterEq(PreloadedPrivateBufferReg, ScratchRsrcReg) && + !TRI->isSubRegisterEq(PreloadedPrivateBufferReg, ScratchWaveOffsetReg)); + + unsigned Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1); + unsigned Rsrc23 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2_sub3); + + unsigned Lo = TRI->getSubReg(PreloadedPrivateBufferReg, AMDGPU::sub0_sub1); + unsigned Hi = TRI->getSubReg(PreloadedPrivateBufferReg, AMDGPU::sub2_sub3); + + const MCInstrDesc &SMovB64 = TII->get(AMDGPU::S_MOV_B64); + + BuildMI(MBB, I, DL, SMovB64, Rsrc01) + .addReg(Lo, RegState::Kill); + BuildMI(MBB, I, DL, SMovB64, Rsrc23) + .addReg(Hi, RegState::Kill); + } else { + unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0); + unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1); + unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2); + unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3); + + // Use relocations to get the pointer, and setup the other bits manually. + uint64_t Rsrc23 = TII->getScratchRsrcWords23(); + BuildMI(MBB, I, DL, SMovB32, Rsrc0) + .addExternalSymbol("SCRATCH_RSRC_DWORD0") + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + + BuildMI(MBB, I, DL, SMovB32, Rsrc1) + .addExternalSymbol("SCRATCH_RSRC_DWORD1") + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + + BuildMI(MBB, I, DL, SMovB32, Rsrc2) + .addImm(Rsrc23 & 0xffffffff) + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + + BuildMI(MBB, I, DL, SMovB32, Rsrc3) + .addImm(Rsrc23 >> 32) + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + } + + // Make the register selected live throughout the function. 
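+  // The entry block is skipped below: the prologue code just emitted defines
+  // (or copies into) these registers there, so only the other blocks need
+  // them as live-ins.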
+ for (MachineBasicBlock &OtherBB : MF) { + if (&OtherBB == &MBB) + continue; + + OtherBB.addLiveIn(ScratchRsrcReg); + OtherBB.addLiveIn(ScratchWaveOffsetReg); + } +} + void SIFrameLowering::processFunctionBeforeFrameFinalized( MachineFunction &MF, RegScavenger *RS) const { MachineFrameInfo *MFI = MF.getFrameInfo(); + + if (!MFI->hasStackObjects()) + return; + bool MayNeedScavengingEmergencySlot = MFI->hasStackObjects(); assert((RS || !MayNeedScavengingEmergencySlot) && diff --git a/lib/Target/AMDGPU/SIFrameLowering.h b/lib/Target/AMDGPU/SIFrameLowering.h index 677128d6ce0a..a9152fd8b2aa 100644 --- a/lib/Target/AMDGPU/SIFrameLowering.h +++ b/lib/Target/AMDGPU/SIFrameLowering.h @@ -21,6 +21,9 @@ class SIFrameLowering final : public AMDGPUFrameLowering { AMDGPUFrameLowering(D, StackAl, LAO, TransAl) {} ~SIFrameLowering() override {} + void emitPrologue(MachineFunction &MF, + MachineBasicBlock &MBB) const override; + void processFunctionBeforeFrameFinalized( MachineFunction &MF, RegScavenger *RS = nullptr) const override; diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index 5c67bf80c175..2cb801a707e1 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -492,6 +492,17 @@ EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, return MVT::Other; } +static bool isFlatGlobalAddrSpace(unsigned AS) { + return AS == AMDGPUAS::GLOBAL_ADDRESS || + AS == AMDGPUAS::FLAT_ADDRESS || + AS == AMDGPUAS::CONSTANT_ADDRESS; +} + +bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS, + unsigned DestAS) const { + return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS); +} + TargetLoweringBase::LegalizeTypeAction SITargetLowering::getPreferredVectorAction(EVT VT) const { if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16)) @@ -514,7 +525,7 @@ SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, MachineFunction &MF = DAG.getMachineFunction(); const SIRegisterInfo *TRI = static_cast(Subtarget->getRegisterInfo()); - unsigned InputPtrReg = TRI->getPreloadedValue(MF, SIRegisterInfo::INPUT_PTR); + unsigned InputPtrReg = TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR); Type *Ty = VT.getTypeForEVT(*DAG.getContext()); @@ -552,6 +563,7 @@ SDValue SITargetLowering::LowerFormalArguments( MachineFunction &MF = DAG.getMachineFunction(); FunctionType *FType = MF.getFunction()->getFunctionType(); SIMachineFunctionInfo *Info = MF.getInfo(); + const AMDGPUSubtarget &ST = MF.getSubtarget(); if (Subtarget->isAmdHsaOS() && Info->getShaderType() != ShaderType::COMPUTE) { const Function *Fn = MF.getFunction(); @@ -618,53 +630,30 @@ SDValue SITargetLowering::LowerFormalArguments( CCInfo.AllocateReg(AMDGPU::VGPR1); } - // The pointer to the list of arguments is stored in SGPR0, SGPR1 - // The pointer to the scratch buffer is stored in SGPR2, SGPR3 - if (Info->getShaderType() == ShaderType::COMPUTE) { - if (Subtarget->isAmdHsaOS()) - Info->NumUserSGPRs = 2; // FIXME: Need to support scratch buffers. 
- else - Info->NumUserSGPRs = 4; - - unsigned InputPtrReg = - TRI->getPreloadedValue(MF, SIRegisterInfo::INPUT_PTR); - unsigned InputPtrRegLo = - TRI->getPhysRegSubReg(InputPtrReg, &AMDGPU::SReg_32RegClass, 0); - unsigned InputPtrRegHi = - TRI->getPhysRegSubReg(InputPtrReg, &AMDGPU::SReg_32RegClass, 1); - - unsigned ScratchPtrReg = - TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_PTR); - unsigned ScratchPtrRegLo = - TRI->getPhysRegSubReg(ScratchPtrReg, &AMDGPU::SReg_32RegClass, 0); - unsigned ScratchPtrRegHi = - TRI->getPhysRegSubReg(ScratchPtrReg, &AMDGPU::SReg_32RegClass, 1); - - CCInfo.AllocateReg(InputPtrRegLo); - CCInfo.AllocateReg(InputPtrRegHi); - CCInfo.AllocateReg(ScratchPtrRegLo); - CCInfo.AllocateReg(ScratchPtrRegHi); - MF.addLiveIn(InputPtrReg, &AMDGPU::SReg_64RegClass); - MF.addLiveIn(ScratchPtrReg, &AMDGPU::SReg_64RegClass); - SIMachineFunctionInfo *MFI = MF.getInfo(); - if (Subtarget->isAmdHsaOS() && MFI->hasDispatchPtr()) { - unsigned DispatchPtrReg = - TRI->getPreloadedValue(MF, SIRegisterInfo::DISPATCH_PTR); - unsigned DispatchPtrRegLo = - TRI->getPhysRegSubReg(DispatchPtrReg, &AMDGPU::SReg_32RegClass, 0); - unsigned DispatchPtrRegHi = - TRI->getPhysRegSubReg(DispatchPtrReg, &AMDGPU::SReg_32RegClass, 1); - CCInfo.AllocateReg(DispatchPtrRegLo); - CCInfo.AllocateReg(DispatchPtrRegHi); - MF.addLiveIn(DispatchPtrReg, &AMDGPU::SReg_64RegClass); - } - } - if (Info->getShaderType() == ShaderType::COMPUTE) { getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins, Splits); } + // FIXME: How should these inputs interact with inreg / custom SGPR inputs? + if (Info->hasPrivateSegmentBuffer()) { + unsigned PrivateSegmentBufferReg = Info->addPrivateSegmentBuffer(*TRI); + MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SReg_128RegClass); + CCInfo.AllocateReg(PrivateSegmentBufferReg); + } + + if (Info->hasDispatchPtr()) { + unsigned DispatchPtrReg = Info->addDispatchPtr(*TRI); + MF.addLiveIn(DispatchPtrReg, &AMDGPU::SReg_64RegClass); + CCInfo.AllocateReg(DispatchPtrReg); + } + + if (Info->hasKernargSegmentPtr()) { + unsigned InputPtrReg = Info->addKernargSegmentPtr(*TRI); + MF.addLiveIn(InputPtrReg, &AMDGPU::SReg_64RegClass); + CCInfo.AllocateReg(InputPtrReg); + } + AnalyzeFormalArguments(CCInfo, Splits); SmallVector Chains; @@ -752,10 +741,113 @@ SDValue SITargetLowering::LowerFormalArguments( InVals.push_back(Val); } - if (Info->getShaderType() != ShaderType::COMPUTE) { - unsigned ScratchIdx = CCInfo.getFirstUnallocated(makeArrayRef( - AMDGPU::SGPR_32RegClass.begin(), AMDGPU::SGPR_32RegClass.getNumRegs())); - Info->ScratchOffsetReg = AMDGPU::SGPR_32RegClass.getRegister(ScratchIdx); + // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read + // these from the dispatch pointer. + + // Start adding system SGPRs. 
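+  // System SGPRs are appended after the user SGPRs set up above, in the fixed
+  // order the hardware initializes them: work-group IDs X/Y/Z, work-group
+  // info, then the private segment wave byte offset.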
+ if (Info->hasWorkGroupIDX()) { + unsigned Reg = Info->addWorkGroupIDX(); + MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass); + CCInfo.AllocateReg(Reg); + } else + llvm_unreachable("work group id x is always enabled"); + + if (Info->hasWorkGroupIDY()) { + unsigned Reg = Info->addWorkGroupIDY(); + MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass); + CCInfo.AllocateReg(Reg); + } + + if (Info->hasWorkGroupIDZ()) { + unsigned Reg = Info->addWorkGroupIDZ(); + MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass); + CCInfo.AllocateReg(Reg); + } + + if (Info->hasWorkGroupInfo()) { + unsigned Reg = Info->addWorkGroupInfo(); + MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass); + CCInfo.AllocateReg(Reg); + } + + if (Info->hasPrivateSegmentWaveByteOffset()) { + // Scratch wave offset passed in system SGPR. + unsigned PrivateSegmentWaveByteOffsetReg + = Info->addPrivateSegmentWaveByteOffset(); + + MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass); + CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg); + } + + // Now that we've figured out where the scratch register inputs are, see if + // should reserve the arguments and use them directly. + + bool HasStackObjects = MF.getFrameInfo()->hasStackObjects(); + + if (ST.isAmdHsaOS()) { + // TODO: Assume we will spill without optimizations. + if (HasStackObjects) { + // If we have stack objects, we unquestionably need the private buffer + // resource. For the HSA ABI, this will be the first 4 user SGPR + // inputs. We can reserve those and use them directly. + + unsigned PrivateSegmentBufferReg = TRI->getPreloadedValue( + MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER); + Info->setScratchRSrcReg(PrivateSegmentBufferReg); + + unsigned PrivateSegmentWaveByteOffsetReg = TRI->getPreloadedValue( + MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); + Info->setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg); + } else { + unsigned ReservedBufferReg + = TRI->reservedPrivateSegmentBufferReg(MF); + unsigned ReservedOffsetReg + = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF); + + // We tentatively reserve the last registers (skipping the last two + // which may contain VCC). After register allocation, we'll replace + // these with the ones immediately after those which were really + // allocated. In the prologue copies will be inserted from the argument + // to these reserved registers. + Info->setScratchRSrcReg(ReservedBufferReg); + Info->setScratchWaveOffsetReg(ReservedOffsetReg); + } + } else { + unsigned ReservedBufferReg = TRI->reservedPrivateSegmentBufferReg(MF); + + // Without HSA, relocations are used for the scratch pointer and the + // buffer resource setup is always inserted in the prologue. Scratch wave + // offset is still in an input SGPR. 
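+    // (The descriptor dwords themselves are filled in by
+    // SIFrameLowering::emitPrologue from the SCRATCH_RSRC_DWORD0/1 relocations
+    // and getScratchRsrcWords23().)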
+ Info->setScratchRSrcReg(ReservedBufferReg); + + if (HasStackObjects) { + unsigned ScratchWaveOffsetReg = TRI->getPreloadedValue( + MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); + Info->setScratchWaveOffsetReg(ScratchWaveOffsetReg); + } else { + unsigned ReservedOffsetReg + = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF); + Info->setScratchWaveOffsetReg(ReservedOffsetReg); + } + } + + if (Info->hasWorkItemIDX()) { + unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X); + MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); + CCInfo.AllocateReg(Reg); + } else + llvm_unreachable("workitem id x should always be enabled"); + + if (Info->hasWorkItemIDY()) { + unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y); + MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); + CCInfo.AllocateReg(Reg); + } + + if (Info->hasWorkItemIDZ()) { + unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z); + MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); + CCInfo.AllocateReg(Reg); } if (Chains.empty()) @@ -767,27 +859,11 @@ SDValue SITargetLowering::LowerFormalArguments( MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( MachineInstr * MI, MachineBasicBlock * BB) const { - MachineBasicBlock::iterator I = *MI; - const SIInstrInfo *TII = - static_cast(Subtarget->getInstrInfo()); - switch (MI->getOpcode()) { default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); case AMDGPU::BRANCH: return BB; - case AMDGPU::SI_RegisterStorePseudo: { - MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); - unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); - MachineInstrBuilder MIB = - BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::SI_RegisterStore), - Reg); - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) - MIB.addOperand(MI->getOperand(i)); - - MI->eraseFromParent(); - break; - } } return BB; } @@ -1051,6 +1127,18 @@ SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL, // a glue result. } +SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, + SDValue Op, + MVT VT, + unsigned Offset) const { + SDLoc SL(Op); + SDValue Param = LowerParameter(DAG, MVT::i32, MVT::i32, SL, + DAG.getEntryNode(), Offset, false); + // The local size values will have the hi 16-bits as zero. 
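+  // AssertZext records that guarantee in the DAG, so later zero-extends or
+  // masks of this value can be folded away.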
+ return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param, + DAG.getValueType(VT)); +} + SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); @@ -1088,37 +1176,36 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), SI::KernelInputOffsets::GLOBAL_SIZE_Z, false); case Intrinsic::r600_read_local_size_x: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::LOCAL_SIZE_X, false); + return lowerImplicitZextParam(DAG, Op, MVT::i16, + SI::KernelInputOffsets::LOCAL_SIZE_X); case Intrinsic::r600_read_local_size_y: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::LOCAL_SIZE_Y, false); + return lowerImplicitZextParam(DAG, Op, MVT::i16, + SI::KernelInputOffsets::LOCAL_SIZE_Y); case Intrinsic::r600_read_local_size_z: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::LOCAL_SIZE_Z, false); - + return lowerImplicitZextParam(DAG, Op, MVT::i16, + SI::KernelInputOffsets::LOCAL_SIZE_Z); case Intrinsic::AMDGPU_read_workdim: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - getImplicitParameterOffset(MFI, GRID_DIM), false); - + // Really only 2 bits. + return lowerImplicitZextParam(DAG, Op, MVT::i8, + getImplicitParameterOffset(MFI, GRID_DIM)); case Intrinsic::r600_read_tgid_x: return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_X), VT); + TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_X), VT); case Intrinsic::r600_read_tgid_y: return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_Y), VT); + TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Y), VT); case Intrinsic::r600_read_tgid_z: return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_Z), VT); + TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Z), VT); case Intrinsic::r600_read_tidig_x: return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_X), VT); + TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X), VT); case Intrinsic::r600_read_tidig_y: return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_Y), VT); + TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y), VT); case Intrinsic::r600_read_tidig_z: return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_Z), VT); + TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z), VT); case AMDGPUIntrinsic::SI_load_const: { SDValue Ops[] = { Op.getOperand(1), @@ -2332,15 +2419,6 @@ MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops); } -MachineSDNode *SITargetLowering::buildScratchRSRC(SelectionDAG &DAG, - SDLoc DL, - SDValue Ptr) const { - const SIInstrInfo *TII = - static_cast(Subtarget->getInstrInfo()); - - return buildRSRC(DAG, DL, Ptr, 0, TII->getScratchRsrcWords23()); -} - SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, unsigned Reg, EVT VT) const { diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h index a8b8ad34ed9d..b9f75cd11de0 100644 --- a/lib/Target/AMDGPU/SIISelLowering.h +++ b/lib/Target/AMDGPU/SIISelLowering.h @@ 
-28,6 +28,9 @@ class SITargetLowering : public AMDGPUTargetLowering { SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const override; + SDValue lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op, + MVT VT, unsigned Offset) const; + SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const; @@ -77,6 +80,8 @@ class SITargetLowering : public AMDGPUTargetLowering { bool MemcpyStrSrc, MachineFunction &MF) const override; + bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override; + TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(EVT VT) const override; @@ -113,10 +118,6 @@ class SITargetLowering : public AMDGPUTargetLowering { SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const; - MachineSDNode *buildScratchRSRC(SelectionDAG &DAG, - SDLoc DL, - SDValue Ptr) const; - std::pair getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override; diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp index 9a85a1d515fe..a3a2d8c01eb5 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -556,10 +556,8 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, BuildMI(MBB, MI, DL, get(Opcode)) .addReg(SrcReg) // src .addFrameIndex(FrameIndex) // frame_idx - // Place-holder registers, these will be filled in by - // SIPrepareScratchRegs. - .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef) - .addReg(AMDGPU::SGPR0, RegState::Undef) + .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc + .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset .addMemOperand(MMO); } @@ -640,10 +638,8 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, unsigned Opcode = getVGPRSpillRestoreOpcode(RC->getSize()); BuildMI(MBB, MI, DL, get(Opcode), DestReg) .addFrameIndex(FrameIndex) // frame_idx - // Place-holder registers, these will be filled in by - // SIPrepareScratchRegs. - .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef) - .addReg(AMDGPU::SGPR0, RegState::Undef) + .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc + .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset .addMemOperand(MMO); } @@ -676,11 +672,14 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB, if (MFI->getShaderType() == ShaderType::COMPUTE && WorkGroupSize > WavefrontSize) { - unsigned TIDIGXReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_X); - unsigned TIDIGYReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_Y); - unsigned TIDIGZReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_Z); + unsigned TIDIGXReg + = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_X); + unsigned TIDIGYReg + = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Y); + unsigned TIDIGZReg + = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Z); unsigned InputPtrReg = - TRI->getPreloadedValue(*MF, SIRegisterInfo::INPUT_PTR); + TRI->getPreloadedValue(*MF, SIRegisterInfo::KERNARG_SEGMENT_PTR); for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) { if (!Entry.isLiveIn(Reg)) Entry.addLiveIn(Reg); @@ -872,20 +871,26 @@ MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr *MI, MachineOperand &Src1 = MI->getOperand(Src1Idx); - // Make sure it's legal to commute operands for VOP2. 
- if (isVOP2(*MI) && - (!isOperandLegal(MI, Src0Idx, &Src1) || - !isOperandLegal(MI, Src1Idx, &Src0))) { - return nullptr; + + if (isVOP2(*MI)) { + const MCInstrDesc &InstrDesc = MI->getDesc(); + // For VOP2 instructions, any operand type is valid to use for src0. Make + // sure we can use the src1 as src0. + // + // We could be stricter here and only allow commuting if there is a reason + // to do so. i.e. if both operands are VGPRs there is no real benefit, + // although MachineCSE attempts to find matches by commuting. + const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); + if (!isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) + return nullptr; } if (!Src1.isReg()) { // Allow commuting instructions with Imm operands. if (NewMI || !Src1.isImm() || - (!isVOP2(*MI) && !isVOP3(*MI))) { + (!isVOP2(*MI) && !isVOP3(*MI))) { return nullptr; } - // Be sure to copy the source modifiers to the right place. if (MachineOperand *Src0Mods = getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) { @@ -1721,6 +1726,41 @@ void SIInstrInfo::swapOperands(MachineBasicBlock::iterator Inst) const { Inst->addOperand(Op1); } +bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI, + const MCOperandInfo &OpInfo, + const MachineOperand &MO) const { + if (!MO.isReg()) + return false; + + unsigned Reg = MO.getReg(); + const TargetRegisterClass *RC = + TargetRegisterInfo::isVirtualRegister(Reg) ? + MRI.getRegClass(Reg) : + RI.getPhysRegClass(Reg); + + // In order to be legal, the common sub-class must be equal to the + // class of the current operand. For example: + // + // v_mov_b32 s0 ; Operand defined as vsrc_32 + // ; RI.getCommonSubClass(s0,vsrc_32) = sgpr ; LEGAL + // + // s_sendmsg 0, s0 ; Operand defined as m0reg + // ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL + + return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC; +} + +bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI, + const MCOperandInfo &OpInfo, + const MachineOperand &MO) const { + if (MO.isReg()) + return isLegalRegOperand(MRI, OpInfo, MO); + + // Handle non-register types that are treated like immediates. + assert(MO.isImm() || MO.isTargetIndex() || MO.isFI()); + return true; +} + bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx, const MachineOperand *MO) const { const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); @@ -1748,21 +1788,7 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx, if (MO->isReg()) { assert(DefinedRC); - const TargetRegisterClass *RC = - TargetRegisterInfo::isVirtualRegister(MO->getReg()) ? - MRI.getRegClass(MO->getReg()) : - RI.getPhysRegClass(MO->getReg()); - - // In order to be legal, the common sub-class must be equal to the - // class of the current operand. 
For example: - // - // v_mov_b32 s0 ; Operand defined as vsrc_32 - // ; RI.getCommonSubClass(s0,vsrc_32) = sgpr ; LEGAL - // - // s_sendmsg 0, s0 ; Operand defined as m0reg - // ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL - - return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC; + return isLegalRegOperand(MRI, OpInfo, *MO); } @@ -1777,6 +1803,81 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx, return isImmOperandLegal(MI, OpIdx, *MO); } +void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI, + MachineInstr *MI) const { + unsigned Opc = MI->getOpcode(); + const MCInstrDesc &InstrDesc = get(Opc); + + int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); + MachineOperand &Src1 = MI->getOperand(Src1Idx); + + // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32 + // we need to only have one constant bus use. + // + // Note we do not need to worry about literal constants here. They are + // disabled for the operand type for instructions because they will always + // violate the one constant bus use rule. + bool HasImplicitSGPR = findImplicitSGPRRead(*MI) != AMDGPU::NoRegister; + if (HasImplicitSGPR) { + int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); + MachineOperand &Src0 = MI->getOperand(Src0Idx); + + if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) + legalizeOpWithMove(MI, Src0Idx); + } + + // VOP2 src0 instructions support all operand types, so we don't need to check + // their legality. If src1 is already legal, we don't need to do anything. + if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1)) + return; + + // We do not use commuteInstruction here because it is too aggressive and will + // commute if it is possible. We only want to commute here if it improves + // legality. This can be called a fairly large number of times so don't waste + // compile time pointlessly swapping and checking legality again. + if (HasImplicitSGPR || !MI->isCommutable()) { + legalizeOpWithMove(MI, Src1Idx); + return; + } + + int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); + MachineOperand &Src0 = MI->getOperand(Src0Idx); + + // If src0 can be used as src1, commuting will make the operands legal. + // Otherwise we have to give up and insert a move. + // + // TODO: Other immediate-like operand kinds could be commuted if there was a + // MachineOperand::ChangeTo* for them. + if ((!Src1.isImm() && !Src1.isReg()) || + !isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) { + legalizeOpWithMove(MI, Src1Idx); + return; + } + + int CommutedOpc = commuteOpcode(*MI); + if (CommutedOpc == -1) { + legalizeOpWithMove(MI, Src1Idx); + return; + } + + MI->setDesc(get(CommutedOpc)); + + unsigned Src0Reg = Src0.getReg(); + unsigned Src0SubReg = Src0.getSubReg(); + bool Src0Kill = Src0.isKill(); + + if (Src1.isImm()) + Src0.ChangeToImmediate(Src1.getImm()); + else if (Src1.isReg()) { + Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill()); + Src0.setSubReg(Src1.getSubReg()); + } else + llvm_unreachable("Should only have register or immediate operands"); + + Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill); + Src1.setSubReg(Src0SubReg); +} + // Legalize VOP3 operands. 
Because all operand types are supported for any // operand, and since literal constants are not allowed and should never be // seen, we only need to worry about inserting copies if we use multiple SGPR @@ -1822,32 +1923,10 @@ void SIInstrInfo::legalizeOperandsVOP3( void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); - unsigned Opc = MI->getOpcode(); // Legalize VOP2 if (isVOP2(*MI)) { - int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); - int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); - - // Legalize src0 - if (!isOperandLegal(MI, Src0Idx)) - legalizeOpWithMove(MI, Src0Idx); - - // Legalize src1 - if (isOperandLegal(MI, Src1Idx)) - return; - - // Usually src0 of VOP2 instructions allow more types of inputs - // than src1, so try to commute the instruction to decrease our - // chances of having to insert a MOV instruction to legalize src1. - if (MI->isCommutable()) { - if (commuteInstruction(MI)) - // If we are successful in commuting, then we know MI is legal, so - // we are done. - return; - } - - legalizeOpWithMove(MI, Src1Idx); + legalizeOperandsVOP2(MRI, MI); return; } diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h index 2bce87f3bd06..8d18d29196f7 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.h +++ b/lib/Target/AMDGPU/SIInstrInfo.h @@ -381,6 +381,23 @@ class SIInstrInfo : public AMDGPUInstrInfo { bool isOperandLegal(const MachineInstr *MI, unsigned OpIdx, const MachineOperand *MO = nullptr) const; + /// \brief Check if \p MO would be a valid operand for the given operand + /// definition \p OpInfo. Note this does not attempt to validate constant bus + /// restrictions (e.g. literal constant usage). + bool isLegalVSrcOperand(const MachineRegisterInfo &MRI, + const MCOperandInfo &OpInfo, + const MachineOperand &MO) const; + + /// \brief Check if \p MO (a register operand) is a legal register for the + /// given operand description. + bool isLegalRegOperand(const MachineRegisterInfo &MRI, + const MCOperandInfo &OpInfo, + const MachineOperand &MO) const; + + /// \brief Legalize operands in \p MI by either commuting it or inserting a + /// copy of src1. + void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr *MI) const; + /// \brief Fix operands in \p MI to satisfy constant bus requirements. 
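+  /// VOP3 can read at most one SGPR through the constant bus, so any
+  /// additional SGPR operands are copied to VGPRs first.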
void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr *MI) const; diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td index bc2b0c6c07fb..2cee993d751c 100644 --- a/lib/Target/AMDGPU/SIInstructions.td +++ b/lib/Target/AMDGPU/SIInstructions.td @@ -1942,36 +1942,6 @@ def SI_KILL : InstSI < let Uses = [EXEC], Defs = [EXEC,VCC,M0] in { -//defm SI_ : RegisterLoadStore ; - -let UseNamedOperandTable = 1 in { - -def SI_RegisterLoad : InstSI < - (outs VGPR_32:$dst, SReg_64:$temp), - (ins FRAMEri32:$addr, i32imm:$chan), - "", [] -> { - let isRegisterLoad = 1; - let mayLoad = 1; -} - -class SIRegStore : InstSI < - outs, - (ins VGPR_32:$val, FRAMEri32:$addr, i32imm:$chan), - "", [] -> { - let isRegisterStore = 1; - let mayStore = 1; -} - -let usesCustomInserter = 1 in { -def SI_RegisterStorePseudo : SIRegStore<(outs)>; -} // End usesCustomInserter = 1 -def SI_RegisterStore : SIRegStore<(outs SReg_64:$temp)>; - - -} // End UseNamedOperandTable = 1 - class SI_INDIRECT_SRC : InstSI < (outs VGPR_32:$dst, SReg_64:$temp), (ins rc:$src, VSrc_32:$idx, i32imm:$off), diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 6269dce553f6..935aad427198 100644 --- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -30,15 +30,33 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) : AMDGPUMachineFunction(MF), TIDReg(AMDGPU::NoRegister), ScratchRSrcReg(AMDGPU::NoRegister), + ScratchWaveOffsetReg(AMDGPU::NoRegister), + PrivateSegmentBufferUserSGPR(AMDGPU::NoRegister), + DispatchPtrUserSGPR(AMDGPU::NoRegister), + QueuePtrUserSGPR(AMDGPU::NoRegister), + KernargSegmentPtrUserSGPR(AMDGPU::NoRegister), + DispatchIDUserSGPR(AMDGPU::NoRegister), + FlatScratchInitUserSGPR(AMDGPU::NoRegister), + PrivateSegmentSizeUserSGPR(AMDGPU::NoRegister), + GridWorkGroupCountXUserSGPR(AMDGPU::NoRegister), + GridWorkGroupCountYUserSGPR(AMDGPU::NoRegister), + GridWorkGroupCountZUserSGPR(AMDGPU::NoRegister), + WorkGroupIDXSystemSGPR(AMDGPU::NoRegister), + WorkGroupIDYSystemSGPR(AMDGPU::NoRegister), + WorkGroupIDZSystemSGPR(AMDGPU::NoRegister), + WorkGroupInfoSystemSGPR(AMDGPU::NoRegister), + PrivateSegmentWaveByteOffsetSystemSGPR(AMDGPU::NoRegister), LDSWaveSpillSize(0), PSInputAddr(0), NumUserSGPRs(0), + NumSystemSGPRs(0), HasSpilledSGPRs(false), HasSpilledVGPRs(false), + PrivateSegmentBuffer(false), DispatchPtr(false), QueuePtr(false), DispatchID(false), - KernargSegmentPtr(true), + KernargSegmentPtr(false), FlatScratchInit(false), GridWorkgroupCountX(false), GridWorkgroupCountY(false), @@ -47,13 +65,17 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) WorkGroupIDY(false), WorkGroupIDZ(false), WorkGroupInfo(false), + PrivateSegmentWaveByteOffset(false), WorkItemIDX(true), WorkItemIDY(false), WorkItemIDZ(false) { + const AMDGPUSubtarget &ST = MF.getSubtarget(); const Function *F = MF.getFunction(); - if (F->hasFnAttribute("amdgpu-dispatch-ptr")) - DispatchPtr = true; + const MachineFrameInfo *FrameInfo = MF.getFrameInfo(); + + if (getShaderType() == ShaderType::COMPUTE) + KernargSegmentPtr = true; if (F->hasFnAttribute("amdgpu-work-group-id-y")) WorkGroupIDY = true; @@ -66,6 +88,54 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) if (F->hasFnAttribute("amdgpu-work-item-id-z")) WorkItemIDZ = true; + + bool MaySpill = ST.isVGPRSpillingEnabled(this); + bool HasStackObjects = FrameInfo->hasStackObjects(); + + if (HasStackObjects || 
MaySpill) + PrivateSegmentWaveByteOffset = true; + + if (ST.isAmdHsaOS()) { + if (HasStackObjects || MaySpill) + PrivateSegmentBuffer = true; + + if (F->hasFnAttribute("amdgpu-dispatch-ptr")) + DispatchPtr = true; + } + + // X, XY, and XYZ are the only supported combinations, so make sure Y is + // enabled if Z is. + if (WorkItemIDZ) + WorkItemIDY = true; +} + +unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer( + const SIRegisterInfo &TRI) { + PrivateSegmentBufferUserSGPR = TRI.getMatchingSuperReg( + getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_128RegClass); + NumUserSGPRs += 4; + return PrivateSegmentBufferUserSGPR; +} + +unsigned SIMachineFunctionInfo::addDispatchPtr(const SIRegisterInfo &TRI) { + DispatchPtrUserSGPR = TRI.getMatchingSuperReg( + getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass); + NumUserSGPRs += 2; + return DispatchPtrUserSGPR; +} + +unsigned SIMachineFunctionInfo::addQueuePtr(const SIRegisterInfo &TRI) { + QueuePtrUserSGPR = TRI.getMatchingSuperReg( + getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass); + NumUserSGPRs += 2; + return QueuePtrUserSGPR; +} + +unsigned SIMachineFunctionInfo::addKernargSegmentPtr(const SIRegisterInfo &TRI) { + KernargSegmentPtrUserSGPR = TRI.getMatchingSuperReg( + getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass); + NumUserSGPRs += 2; + return KernargSegmentPtrUserSGPR; } SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg( diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 377c5ce94846..9c528d63bd0e 100644 --- a/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -26,10 +26,36 @@ class MachineRegisterInfo; /// This class keeps track of the SPI_SP_INPUT_ADDR config register, which /// tells the hardware which interpolation parameters to load. class SIMachineFunctionInfo : public AMDGPUMachineFunction { + // FIXME: This should be removed and getPreloadedValue moved here. + friend struct SIRegisterInfo; void anchor() override; unsigned TIDReg; + + // Registers that may be reserved for spilling purposes. These may be the same + // as the input registers. unsigned ScratchRSrcReg; + unsigned ScratchWaveOffsetReg; + + // Input registers setup for the HSA ABI. + // User SGPRs in allocation order. + unsigned PrivateSegmentBufferUserSGPR; + unsigned DispatchPtrUserSGPR; + unsigned QueuePtrUserSGPR; + unsigned KernargSegmentPtrUserSGPR; + unsigned DispatchIDUserSGPR; + unsigned FlatScratchInitUserSGPR; + unsigned PrivateSegmentSizeUserSGPR; + unsigned GridWorkGroupCountXUserSGPR; + unsigned GridWorkGroupCountYUserSGPR; + unsigned GridWorkGroupCountZUserSGPR; + + // System SGPRs in allocation order. + unsigned WorkGroupIDXSystemSGPR; + unsigned WorkGroupIDYSystemSGPR; + unsigned WorkGroupIDZSystemSGPR; + unsigned WorkGroupInfoSystemSGPR; + unsigned PrivateSegmentWaveByteOffsetSystemSGPR; public: // FIXME: Make private @@ -38,12 +64,14 @@ class SIMachineFunctionInfo : public AMDGPUMachineFunction { std::map LaneVGPRs; unsigned ScratchOffsetReg; unsigned NumUserSGPRs; + unsigned NumSystemSGPRs; private: bool HasSpilledSGPRs; bool HasSpilledVGPRs; - // Feature bits required for inputs passed in user / system SGPRs. + // Feature bits required for inputs passed in user SGPRs. 
+ bool PrivateSegmentBuffer : 1; bool DispatchPtr : 1; bool QueuePtr : 1; bool DispatchID : 1; @@ -53,15 +81,27 @@ class SIMachineFunctionInfo : public AMDGPUMachineFunction { bool GridWorkgroupCountY : 1; bool GridWorkgroupCountZ : 1; + // Feature bits required for inputs passed in system SGPRs. bool WorkGroupIDX : 1; // Always initialized. bool WorkGroupIDY : 1; bool WorkGroupIDZ : 1; bool WorkGroupInfo : 1; + bool PrivateSegmentWaveByteOffset : 1; bool WorkItemIDX : 1; // Always initialized. bool WorkItemIDY : 1; bool WorkItemIDZ : 1; + + MCPhysReg getNextUserSGPR() const { + assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs"); + return AMDGPU::SGPR0 + NumUserSGPRs; + } + + MCPhysReg getNextSystemSGPR() const { + return AMDGPU::SGPR0 + NumUserSGPRs + NumSystemSGPRs; + } + public: struct SpilledReg { unsigned VGPR; @@ -80,6 +120,47 @@ class SIMachineFunctionInfo : public AMDGPUMachineFunction { unsigned getTIDReg() const { return TIDReg; }; void setTIDReg(unsigned Reg) { TIDReg = Reg; } + // Add user SGPRs. + unsigned addPrivateSegmentBuffer(const SIRegisterInfo &TRI); + unsigned addDispatchPtr(const SIRegisterInfo &TRI); + unsigned addQueuePtr(const SIRegisterInfo &TRI); + unsigned addKernargSegmentPtr(const SIRegisterInfo &TRI); + + // Add system SGPRs. + unsigned addWorkGroupIDX() { + WorkGroupIDXSystemSGPR = getNextSystemSGPR(); + NumSystemSGPRs += 1; + return WorkGroupIDXSystemSGPR; + } + + unsigned addWorkGroupIDY() { + WorkGroupIDYSystemSGPR = getNextSystemSGPR(); + NumSystemSGPRs += 1; + return WorkGroupIDYSystemSGPR; + } + + unsigned addWorkGroupIDZ() { + WorkGroupIDZSystemSGPR = getNextSystemSGPR(); + NumSystemSGPRs += 1; + return WorkGroupIDZSystemSGPR; + } + + unsigned addWorkGroupInfo() { + WorkGroupInfoSystemSGPR = getNextSystemSGPR(); + NumSystemSGPRs += 1; + return WorkGroupInfoSystemSGPR; + } + + unsigned addPrivateSegmentWaveByteOffset() { + PrivateSegmentWaveByteOffsetSystemSGPR = getNextSystemSGPR(); + NumSystemSGPRs += 1; + return PrivateSegmentWaveByteOffsetSystemSGPR; + } + + bool hasPrivateSegmentBuffer() const { + return PrivateSegmentBuffer; + } + bool hasDispatchPtr() const { return DispatchPtr; } @@ -128,6 +209,10 @@ class SIMachineFunctionInfo : public AMDGPUMachineFunction { return WorkGroupInfo; } + bool hasPrivateSegmentWaveByteOffset() const { + return PrivateSegmentWaveByteOffset; + } + bool hasWorkItemIDX() const { return WorkItemIDX; } @@ -140,13 +225,37 @@ class SIMachineFunctionInfo : public AMDGPUMachineFunction { return WorkItemIDZ; } + unsigned getNumUserSGPRs() const { + return NumUserSGPRs; + } + + unsigned getNumPreloadedSGPRs() const { + return NumUserSGPRs + NumSystemSGPRs; + } + + unsigned getPrivateSegmentWaveByteOffsetSystemSGPR() const { + return PrivateSegmentWaveByteOffsetSystemSGPR; + } + /// \brief Returns the physical register reserved for use as the resource /// descriptor for scratch accesses. 
unsigned getScratchRSrcReg() const { return ScratchRSrcReg; } - void setScratchRSrcReg(const SIRegisterInfo *TRI); + void setScratchRSrcReg(unsigned Reg) { + assert(Reg != AMDGPU::NoRegister && "Should never be unset"); + ScratchRSrcReg = Reg; + } + + unsigned getScratchWaveOffsetReg() const { + return ScratchWaveOffsetReg; + } + + void setScratchWaveOffsetReg(unsigned Reg) { + assert(Reg != AMDGPU::NoRegister && "Should never be unset"); + ScratchWaveOffsetReg = Reg; + } bool hasSpilledSGPRs() const { return HasSpilledSGPRs; diff --git a/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp b/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp deleted file mode 100644 index a6c22775e098..000000000000 --- a/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp +++ /dev/null @@ -1,196 +0,0 @@ -//===-- SIPrepareScratchRegs.cpp - Use predicates for control flow --------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// -/// This pass loads scratch pointer and scratch offset into a register or a -/// frame index which can be used anywhere in the program. These values will -/// be used for spilling VGPRs. -/// -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "AMDGPUSubtarget.h" -#include "SIDefines.h" -#include "SIInstrInfo.h" -#include "SIMachineFunctionInfo.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/RegisterScavenging.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/LLVMContext.h" - -using namespace llvm; - -namespace { - -class SIPrepareScratchRegs : public MachineFunctionPass { - -private: - static char ID; - -public: - SIPrepareScratchRegs() : MachineFunctionPass(ID) { } - - bool runOnMachineFunction(MachineFunction &MF) override; - - const char *getPassName() const override { - return "SI prepare scratch registers"; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - MachineFunctionPass::getAnalysisUsage(AU); - } -}; - -} // End anonymous namespace - -char SIPrepareScratchRegs::ID = 0; - -FunctionPass *llvm::createSIPrepareScratchRegs() { - return new SIPrepareScratchRegs(); -} - -bool SIPrepareScratchRegs::runOnMachineFunction(MachineFunction &MF) { - SIMachineFunctionInfo *MFI = MF.getInfo(); - const SIInstrInfo *TII = - static_cast(MF.getSubtarget().getInstrInfo()); - const SIRegisterInfo *TRI = &TII->getRegisterInfo(); - MachineRegisterInfo &MRI = MF.getRegInfo(); - MachineFrameInfo *FrameInfo = MF.getFrameInfo(); - MachineBasicBlock *Entry = &MF.front(); - MachineBasicBlock::iterator I = Entry->begin(); - DebugLoc DL = I->getDebugLoc(); - - // FIXME: If we don't have enough VGPRs for SGPR spilling we will need to - // run this pass. 
- if (!MFI->hasSpilledVGPRs()) - return false; - - unsigned ScratchPtrPreloadReg = - TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_PTR); - unsigned ScratchOffsetPreloadReg = - TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET); - - if (!Entry->isLiveIn(ScratchPtrPreloadReg)) - Entry->addLiveIn(ScratchPtrPreloadReg); - - if (!Entry->isLiveIn(ScratchOffsetPreloadReg)) - Entry->addLiveIn(ScratchOffsetPreloadReg); - - // Load the scratch offset. - unsigned ScratchOffsetReg = - TRI->findUnusedRegister(MRI, &AMDGPU::SGPR_32RegClass); - int ScratchOffsetFI = -1; - - if (ScratchOffsetReg != AMDGPU::NoRegister) { - // Found an SGPR to use - BuildMI(*Entry, I, DL, TII->get(AMDGPU::S_MOV_B32), ScratchOffsetReg) - .addReg(ScratchOffsetPreloadReg); - } else { - // No SGPR is available, we must spill. - ScratchOffsetFI = FrameInfo->CreateSpillStackObject(4,4); - BuildMI(*Entry, I, DL, TII->get(AMDGPU::SI_SPILL_S32_SAVE)) - .addReg(ScratchOffsetPreloadReg) - .addFrameIndex(ScratchOffsetFI) - .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef) - .addReg(AMDGPU::SGPR0, RegState::Undef); - } - - - // Now that we have the scratch pointer and offset values, we need to - // add them to all the SI_SPILL_V* instructions. - - RegScavenger RS; - unsigned ScratchRsrcFI = FrameInfo->CreateSpillStackObject(16, 4); - RS.addScavengingFrameIndex(ScratchRsrcFI); - - for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); - BI != BE; ++BI) { - - MachineBasicBlock &MBB = *BI; - // Add the scratch offset reg as a live-in so that the register scavenger - // doesn't re-use it. - if (!MBB.isLiveIn(ScratchOffsetReg) && - ScratchOffsetReg != AMDGPU::NoRegister) - MBB.addLiveIn(ScratchOffsetReg); - RS.enterBasicBlock(&MBB); - - for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); - I != E; ++I) { - MachineInstr &MI = *I; - RS.forward(I); - DebugLoc DL = MI.getDebugLoc(); - if (!TII->isVGPRSpill(MI)) - continue; - - // Scratch resource - unsigned ScratchRsrcReg = - RS.scavengeRegister(&AMDGPU::SReg_128RegClass, 0); - - uint64_t Rsrc23 = TII->getScratchRsrcWords23(); - - unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0); - unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1); - unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2); - unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3); - - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc0) - .addExternalSymbol("SCRATCH_RSRC_DWORD0") - .addReg(ScratchRsrcReg, RegState::ImplicitDefine); - - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc1) - .addExternalSymbol("SCRATCH_RSRC_DWORD1") - .addReg(ScratchRsrcReg, RegState::ImplicitDefine); - - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc2) - .addImm(Rsrc23 & 0xffffffff) - .addReg(ScratchRsrcReg, RegState::ImplicitDefine); - - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc3) - .addImm(Rsrc23 >> 32) - .addReg(ScratchRsrcReg, RegState::ImplicitDefine); - - // Scratch Offset - if (ScratchOffsetReg == AMDGPU::NoRegister) { - ScratchOffsetReg = RS.scavengeRegister(&AMDGPU::SGPR_32RegClass, 0); - BuildMI(MBB, I, DL, TII->get(AMDGPU::SI_SPILL_S32_RESTORE), - ScratchOffsetReg) - .addFrameIndex(ScratchOffsetFI) - .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef) - .addReg(AMDGPU::SGPR0, RegState::Undef); - } else if (!MBB.isLiveIn(ScratchOffsetReg)) { - MBB.addLiveIn(ScratchOffsetReg); - } - - if (ScratchRsrcReg == AMDGPU::NoRegister || - ScratchOffsetReg == AMDGPU::NoRegister) { - LLVMContext &Ctx = MF.getFunction()->getContext(); - 
Ctx.emitError("ran out of SGPRs for spilling VGPRs"); - ScratchRsrcReg = AMDGPU::SGPR0; - ScratchOffsetReg = AMDGPU::SGPR0; - } - MI.getOperand(2).setReg(ScratchRsrcReg); - MI.getOperand(2).setIsKill(true); - MI.getOperand(2).setIsUndef(false); - MI.getOperand(3).setReg(ScratchOffsetReg); - MI.getOperand(3).setIsUndef(false); - MI.getOperand(3).setIsKill(false); - MI.addOperand(MachineOperand::CreateReg(Rsrc0, false, true, true)); - MI.addOperand(MachineOperand::CreateReg(Rsrc1, false, true, true)); - MI.addOperand(MachineOperand::CreateReg(Rsrc2, false, true, true)); - MI.addOperand(MachineOperand::CreateReg(Rsrc3, false, true, true)); - } - } - return true; -} diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp index ab7539b6fb3a..bf87f0225272 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -32,6 +32,40 @@ void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) co Reserved.set(*R); } +unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg( + const MachineFunction &MF) const { + const AMDGPUSubtarget &ST = MF.getSubtarget(); + if (ST.hasSGPRInitBug()) { + unsigned BaseIdx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 4 - 4; + unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx)); + return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass); + } + + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + // 98/99 need to be reserved for flat_scr, and 100/101 for vcc. This is the + // next sgpr128 down. + return AMDGPU::SGPR92_SGPR93_SGPR94_SGPR95; + } + + return AMDGPU::SGPR96_SGPR97_SGPR98_SGPR99; +} + +unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg( + const MachineFunction &MF) const { + const AMDGPUSubtarget &ST = MF.getSubtarget(); + if (ST.hasSGPRInitBug()) { + unsigned Idx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 4 - 5; + return AMDGPU::SGPR_32RegClass.getRegister(Idx); + } + + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + // Next register before reservations for flat_scr and vcc. + return AMDGPU::SGPR97; + } + + return AMDGPU::SGPR95; +} + BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); Reserved.set(AMDGPU::INDIRECT_BASE_ADDR); @@ -68,6 +102,23 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { } } + const SIMachineFunctionInfo *MFI = MF.getInfo(); + + unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg(); + if (ScratchWaveOffsetReg != AMDGPU::NoRegister) { + // Reserve 1 SGPR for scratch wave offset in case we need to spill. + reserveRegisterTuples(Reserved, ScratchWaveOffsetReg); + } + + unsigned ScratchRSrcReg = MFI->getScratchRSrcReg(); + if (ScratchRSrcReg != AMDGPU::NoRegister) { + // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we need + // to spill. + // TODO: May need to reserve a VGPR if doing LDS spilling. + reserveRegisterTuples(Reserved, ScratchRSrcReg); + assert(!isSubRegister(ScratchRSrcReg, ScratchWaveOffsetReg)); + } + return Reserved; } @@ -188,11 +239,10 @@ void SIRegisterInfo::buildScratchLoadStore(MachineBasicBlock::iterator MI, unsigned SubReg = NumSubRegs > 1 ? 
getPhysRegSubReg(Value, &AMDGPU::VGPR_32RegClass, i) : Value; - bool IsKill = (i == e - 1); BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp)) .addReg(SubReg, getDefRegState(IsLoad)) - .addReg(ScratchRsrcReg, getKillRegState(IsKill)) + .addReg(ScratchRsrcReg) .addReg(SOffset) .addImm(Offset) .addImm(0) // glc @@ -243,6 +293,9 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, .addReg(SubReg) .addImm(Spill.Lane); + // FIXME: Since this spills to another register instead of an actual + // frame index, we should delete the frame index when all references to + // it are fixed. } MI->eraseFromParent(); break; @@ -507,36 +560,47 @@ bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const { return OpType == AMDGPU::OPERAND_REG_INLINE_C; } +// FIXME: Most of these are flexible with HSA and we don't need to reserve them +// as input registers if unused. Whether the dispatch ptr is necessary should be +// easy to detect from used intrinsics. Scratch setup is harder to know. unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF, enum PreloadedValue Value) const { - const AMDGPUSubtarget &ST = MF.getSubtarget(); const SIMachineFunctionInfo *MFI = MF.getInfo(); + const AMDGPUSubtarget &ST = MF.getSubtarget(); + (void)ST; switch (Value) { - case SIRegisterInfo::TGID_X: - return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 0); - case SIRegisterInfo::TGID_Y: - return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 1); - case SIRegisterInfo::TGID_Z: - return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 2); - case SIRegisterInfo::SCRATCH_WAVE_OFFSET: - if (MFI->getShaderType() != ShaderType::COMPUTE) - return MFI->ScratchOffsetReg; - return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 4); - case SIRegisterInfo::SCRATCH_PTR: - return AMDGPU::SGPR2_SGPR3; - case SIRegisterInfo::INPUT_PTR: - if (ST.isAmdHsaOS()) - return MFI->hasDispatchPtr() ? 
AMDGPU::SGPR2_SGPR3 : AMDGPU::SGPR0_SGPR1; - return AMDGPU::SGPR0_SGPR1; + case SIRegisterInfo::WORKGROUP_ID_X: + assert(MFI->hasWorkGroupIDX()); + return MFI->WorkGroupIDXSystemSGPR; + case SIRegisterInfo::WORKGROUP_ID_Y: + assert(MFI->hasWorkGroupIDY()); + return MFI->WorkGroupIDYSystemSGPR; + case SIRegisterInfo::WORKGROUP_ID_Z: + assert(MFI->hasWorkGroupIDZ()); + return MFI->WorkGroupIDZSystemSGPR; + case SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET: + return MFI->PrivateSegmentWaveByteOffsetSystemSGPR; + case SIRegisterInfo::PRIVATE_SEGMENT_BUFFER: + assert(ST.isAmdHsaOS() && "Non-HSA ABI currently uses relocations"); + assert(MFI->hasPrivateSegmentBuffer()); + return MFI->PrivateSegmentBufferUserSGPR; + case SIRegisterInfo::KERNARG_SEGMENT_PTR: + assert(MFI->hasKernargSegmentPtr()); + return MFI->KernargSegmentPtrUserSGPR; case SIRegisterInfo::DISPATCH_PTR: assert(MFI->hasDispatchPtr()); - return AMDGPU::SGPR0_SGPR1; - case SIRegisterInfo::TIDIG_X: + return MFI->DispatchPtrUserSGPR; + case SIRegisterInfo::QUEUE_PTR: + llvm_unreachable("not implemented"); + case SIRegisterInfo::WORKITEM_ID_X: + assert(MFI->hasWorkItemIDX()); return AMDGPU::VGPR0; - case SIRegisterInfo::TIDIG_Y: + case SIRegisterInfo::WORKITEM_ID_Y: + assert(MFI->hasWorkItemIDY()); return AMDGPU::VGPR1; - case SIRegisterInfo::TIDIG_Z: + case SIRegisterInfo::WORKITEM_ID_Z: + assert(MFI->hasWorkItemIDZ()); return AMDGPU::VGPR2; } llvm_unreachable("unexpected preloaded value type"); diff --git a/lib/Target/AMDGPU/SIRegisterInfo.h b/lib/Target/AMDGPU/SIRegisterInfo.h index 36f6d1c7a261..1795237c2140 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/lib/Target/AMDGPU/SIRegisterInfo.h @@ -18,6 +18,7 @@ #include "AMDGPURegisterInfo.h" #include "AMDGPUSubtarget.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/Support/Debug.h" namespace llvm { @@ -29,6 +30,15 @@ struct SIRegisterInfo : public AMDGPURegisterInfo { public: SIRegisterInfo(); + /// Return the end register initially reserved for the scratch buffer in case + /// spilling is needed. + unsigned reservedPrivateSegmentBufferReg(const MachineFunction &MF) const; + + /// Return the end register initially reserved for the scratch wave offset in + /// case spilling is needed. + unsigned reservedPrivateSegmentWaveByteOffsetReg( + const MachineFunction &MF) const; + BitVector getReservedRegs(const MachineFunction &MF) const override; unsigned getRegPressureSetLimit(const MachineFunction &MF, @@ -56,6 +66,12 @@ struct SIRegisterInfo : public AMDGPURegisterInfo { return isSGPRClass(getRegClass(RCID)); } + bool isSGPRReg(const MachineRegisterInfo &MRI, unsigned Reg) const { + if (TargetRegisterInfo::isVirtualRegister(Reg)) + return isSGPRClass(MRI.getRegClass(Reg)); + return getPhysRegClass(Reg); + } + /// \returns true if this class contains VGPR registers. bool hasVGPRs(const TargetRegisterClass *RC) const; @@ -93,23 +109,25 @@ struct SIRegisterInfo : public AMDGPURegisterInfo { /// \returns True if operands defined with this operand type can accept /// an inline constant. i.e. An integer value in the range (-16, 64) or - /// -4.0f, -2.0f, -1.0f, -0.5f, 0.0f, 0.5f, 1.0f, 2.0f, 4.0f. + /// -4.0f, -2.0f, -1.0f, -0.5f, 0.0f, 0.5f, 1.0f, 2.0f, 4.0f. 
bool opCanUseInlineConstant(unsigned OpType) const; enum PreloadedValue { // SGPRS: - SCRATCH_PTR = 0, + PRIVATE_SEGMENT_BUFFER = 0, DISPATCH_PTR = 1, - INPUT_PTR = 3, - TGID_X = 10, - TGID_Y = 11, - TGID_Z = 12, - SCRATCH_WAVE_OFFSET = 14, + QUEUE_PTR = 2, + KERNARG_SEGMENT_PTR = 3, + WORKGROUP_ID_X = 10, + WORKGROUP_ID_Y = 11, + WORKGROUP_ID_Z = 12, + PRIVATE_SEGMENT_WAVE_BYTE_OFFSET = 14, + // VGPRS: FIRST_VGPR_VALUE = 15, - TIDIG_X = FIRST_VGPR_VALUE, - TIDIG_Y = 16, - TIDIG_Z = 17, + WORKITEM_ID_X = FIRST_VGPR_VALUE, + WORKITEM_ID_Y = 16, + WORKITEM_ID_Z = 17 }; /// \brief Returns the physical register that \p Value is stored in. diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index e70f79d5a7be..441baed9b434 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -7,6 +7,8 @@ // //===----------------------------------------------------------------------===// #include "AMDGPUBaseInfo.h" +#include "AMDGPU.h" +#include "llvm/IR/GlobalValue.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/SubtargetFeature.h" @@ -66,5 +68,36 @@ MCSection *getHSATextSection(MCContext &Ctx) { ELF::SHF_AMDGPU_HSA_CODE); } +MCSection *getHSADataGlobalAgentSection(MCContext &Ctx) { + return Ctx.getELFSection(".hsadata_global_agent", ELF::SHT_PROGBITS, + ELF::SHF_ALLOC | ELF::SHF_WRITE | + ELF::SHF_AMDGPU_HSA_GLOBAL | + ELF::SHF_AMDGPU_HSA_AGENT); +} + +MCSection *getHSADataGlobalProgramSection(MCContext &Ctx) { + return Ctx.getELFSection(".hsadata_global_program", ELF::SHT_PROGBITS, + ELF::SHF_ALLOC | ELF::SHF_WRITE | + ELF::SHF_AMDGPU_HSA_GLOBAL); +} + +MCSection *getHSARodataReadonlyAgentSection(MCContext &Ctx) { + return Ctx.getELFSection(".hsarodata_readonly_agent", ELF::SHT_PROGBITS, + ELF::SHF_ALLOC | ELF::SHF_AMDGPU_HSA_READONLY | + ELF::SHF_AMDGPU_HSA_AGENT); +} + +bool isGroupSegment(const GlobalValue *GV) { + return GV->getType()->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; +} + +bool isGlobalSegment(const GlobalValue *GV) { + return GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS; +} + +bool isReadOnlySegment(const GlobalValue *GV) { + return GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS; +} + } // End namespace AMDGPU } // End namespace llvm diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 59a32a6b592d..7b3c858e7c36 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -15,6 +15,7 @@ namespace llvm { class FeatureBitset; +class GlobalValue; class MCContext; class MCSection; @@ -31,6 +32,16 @@ void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header, const FeatureBitset &Features); MCSection *getHSATextSection(MCContext &Ctx); +MCSection *getHSADataGlobalAgentSection(MCContext &Ctx); + +MCSection *getHSADataGlobalProgramSection(MCContext &Ctx); + +MCSection *getHSARodataReadonlyAgentSection(MCContext &Ctx); + +bool isGroupSegment(const GlobalValue *GV); +bool isGlobalSegment(const GlobalValue *GV); +bool isReadOnlySegment(const GlobalValue *GV); + } // end namespace AMDGPU } // end namespace llvm diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td index ceb48d83cd84..dd33c3614b1a 100644 --- a/lib/Target/ARM/ARM.td +++ b/lib/Target/ARM/ARM.td @@ -62,6 +62,9 @@ def FeatureVFP4 : SubtargetFeature<"vfp4", "HasVFPv4", "true", def FeatureFPARMv8 : SubtargetFeature<"fp-armv8", "HasFPARMv8", "true", "Enable ARMv8 FP", 
[FeatureVFP4]>; +def FeatureFullFP16 : SubtargetFeature<"fullfp16", "HasFullFP16", "true", + "Enable full half-precision floating point", + [FeatureFPARMv8]>; def FeatureD16 : SubtargetFeature<"d16", "HasD16", "true", "Restrict FP to 16 double registers">; def FeatureHWDiv : SubtargetFeature<"hwdiv", "HasHardwareDivide", "true", @@ -212,6 +215,9 @@ def HasV8Ops : SubtargetFeature<"v8", "HasV8Ops", "true", def HasV8_1aOps : SubtargetFeature<"v8.1a", "HasV8_1aOps", "true", "Support ARM v8.1a instructions", [HasV8Ops]>; +def HasV8_2aOps : SubtargetFeature<"v8.2a", "HasV8_2aOps", "true", + "Support ARM v8.2a instructions", + [HasV8_1aOps]>; //===----------------------------------------------------------------------===// @@ -232,6 +238,8 @@ def ProcA15 : SubtargetFeature<"a15", "ARMProcFamily", "CortexA15", "Cortex-A15 ARM processors", []>; def ProcA17 : SubtargetFeature<"a17", "ARMProcFamily", "CortexA17", "Cortex-A17 ARM processors", []>; +def ProcA35 : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35", + "Cortex-A35 ARM processors", []>; def ProcA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53", "Cortex-A53 ARM processors", []>; def ProcA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57", @@ -354,6 +362,18 @@ def ARMv81a : Architecture<"armv8.1-a", "ARMv81a", [HasV8_1aOps, FeatureCrypto, FeatureCRC]>; +def ARMv82a : Architecture<"armv8.2-a", "ARMv82a", [HasV8_2aOps, + FeatureAClass, + FeatureDB, + FeatureFPARMv8, + FeatureNEON, + FeatureDSP, + FeatureTrustZone, + FeatureMP, + FeatureVirtualization, + FeatureCrypto, + FeatureCRC]>; + // Aliases def IWMMXT : Architecture<"iwmmxt", "ARMv5te", [ARMv5te]>; def IWMMXT2 : Architecture<"iwmmxt2", "ARMv5te", [ARMv5te]>; @@ -585,6 +605,13 @@ def : ProcNoItin<"cortex-m7", [ARMv7em, FeatureD16]>; +def : ProcNoItin<"cortex-a35", [ARMv8a, ProcA35, + FeatureHWDiv, + FeatureHWDivARM, + FeatureT2XtPk, + FeatureCrypto, + FeatureCRC]>; + def : ProcNoItin<"cortex-a53", [ARMv8a, ProcA53, FeatureHWDiv, FeatureHWDivARM, diff --git a/lib/Target/ARM/ARMConstantIslandPass.cpp b/lib/Target/ARM/ARMConstantIslandPass.cpp index 0bf2d374df6a..e89757c19ecc 100644 --- a/lib/Target/ARM/ARMConstantIslandPass.cpp +++ b/lib/Target/ARM/ARMConstantIslandPass.cpp @@ -2274,8 +2274,7 @@ adjustJTTargetBlockForward(MachineBasicBlock *BB, MachineBasicBlock *JTBB) { // Update the CFG. NewBB->addSuccessor(BB); - JTBB->removeSuccessor(BB); - JTBB->addSuccessor(NewBB); + JTBB->replaceSuccessor(BB, NewBB); ++NumJTInserted; return NewBB; diff --git a/lib/Target/ARM/ARMFrameLowering.cpp b/lib/Target/ARM/ARMFrameLowering.cpp index 5b3229456317..c5990bb7d1fb 100644 --- a/lib/Target/ARM/ARMFrameLowering.cpp +++ b/lib/Target/ARM/ARMFrameLowering.cpp @@ -1889,10 +1889,9 @@ void ARMFrameLowering::adjustForSegmentedStacks( // first in the list. MachineBasicBlock *AddedBlocks[] = {PrevStackMBB, McrMBB, GetMBB, AllocMBB, PostStackMBB}; - const int NbAddedBlocks = sizeof(AddedBlocks) / sizeof(AddedBlocks[0]); - for (int Idx = 0; Idx < NbAddedBlocks; ++Idx) - BeforePrologueRegion.insert(AddedBlocks[Idx]); + for (MachineBasicBlock *B : AddedBlocks) + BeforePrologueRegion.insert(B); for (const auto &LI : PrologueMBB.liveins()) { for (MachineBasicBlock *PredBB : BeforePrologueRegion) @@ -1901,9 +1900,9 @@ void ARMFrameLowering::adjustForSegmentedStacks( // Remove the newly added blocks from the list, since we know // we do not have to do the following updates for them. 
- for (int Idx = 0; Idx < NbAddedBlocks; ++Idx) { - BeforePrologueRegion.erase(AddedBlocks[Idx]); - MF.insert(PrologueMBB.getIterator(), AddedBlocks[Idx]); + for (MachineBasicBlock *B : AddedBlocks) { + BeforePrologueRegion.erase(B); + MF.insert(PrologueMBB.getIterator(), B); } for (MachineBasicBlock *MBB : BeforePrologueRegion) { diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index 0cc41812d71c..33f74a3ba9fd 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -7346,7 +7346,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI, } } - BB->addSuccessor(DispatchBB); + BB->addSuccessor(DispatchBB, BranchProbability::getZero()); // Find the invoke call and mark all of the callee-saved registers as // 'implicit defined' so that they're spilled. This prevents code from @@ -11891,7 +11891,7 @@ bool ARMTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { TargetLowering::AtomicExpansionKind ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { unsigned Size = LI->getType()->getPrimitiveSizeInBits(); - return ((Size == 64) && !Subtarget->isMClass()) ? AtomicExpansionKind::LLSC + return ((Size == 64) && !Subtarget->isMClass()) ? AtomicExpansionKind::LLOnly : AtomicExpansionKind::None; } diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td index 238dc338d141..4c7107aee6a2 100644 --- a/lib/Target/ARM/ARMInstrInfo.td +++ b/lib/Target/ARM/ARMInstrInfo.td @@ -215,6 +215,8 @@ def PreV8 : Predicate<"!Subtarget->hasV8Ops()">, AssemblerPredicate<"!HasV8Ops", "armv7 or earlier">; def HasV8_1a : Predicate<"Subtarget->hasV8_1aOps()">, AssemblerPredicate<"HasV8_1aOps", "armv8.1a">; +def HasV8_2a : Predicate<"Subtarget->hasV8_2aOps()">, + AssemblerPredicate<"HasV8_2aOps", "armv8.2a">; def NoVFP : Predicate<"!Subtarget->hasVFP2()">; def HasVFP2 : Predicate<"Subtarget->hasVFP2()">, AssemblerPredicate<"FeatureVFP2", "VFP2">; @@ -234,7 +236,9 @@ def HasCrypto : Predicate<"Subtarget->hasCrypto()">, def HasCRC : Predicate<"Subtarget->hasCRC()">, AssemblerPredicate<"FeatureCRC", "crc">; def HasFP16 : Predicate<"Subtarget->hasFP16()">, - AssemblerPredicate<"FeatureFP16","half-float">; + AssemblerPredicate<"FeatureFP16","half-float conversions">; +def HasFullFP16 : Predicate<"Subtarget->hasFullFP16()">, + AssemblerPredicate<"FeatureFullFP16","full half-float">; def HasDivide : Predicate<"Subtarget->hasDivide()">, AssemblerPredicate<"FeatureHWDiv", "divide in THUMB">; def HasDivideInARM : Predicate<"Subtarget->hasDivideInARMMode()">, diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp index 9e3cd36d49ef..bb6ae28065bd 100644 --- a/lib/Target/ARM/ARMSubtarget.cpp +++ b/lib/Target/ARM/ARMSubtarget.cpp @@ -112,6 +112,7 @@ void ARMSubtarget::initializeEnvironment() { HasV7Ops = false; HasV8Ops = false; HasV8_1aOps = false; + HasV8_2aOps = false; HasVFPv2 = false; HasVFPv3 = false; HasVFPv4 = false; @@ -130,6 +131,7 @@ void ARMSubtarget::initializeEnvironment() { NoMovt = false; SupportsTailCall = false; HasFP16 = false; + HasFullFP16 = false; HasD16 = false; HasHardwareDivide = false; HasHardwareDivideInARM = false; diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h index c194149e8452..a8b28018f1b2 100644 --- a/lib/Target/ARM/ARMSubtarget.h +++ b/lib/Target/ARM/ARMSubtarget.h @@ -43,8 +43,8 @@ class ARMSubtarget : public ARMGenSubtargetInfo { protected: enum ARMProcFamilyEnum { Others, CortexA5, CortexA7, CortexA8, CortexA9, CortexA12, 
CortexA15, - CortexA17, CortexR4, CortexR4F, CortexR5, CortexR7, CortexA53, CortexA57, - CortexA72, Krait, Swift + CortexA17, CortexR4, CortexR4F, CortexR5, CortexR7, CortexA35, CortexA53, + CortexA57, CortexA72, Krait, Swift }; enum ARMProcClassEnum { None, AClass, RClass, MClass @@ -52,7 +52,7 @@ class ARMSubtarget : public ARMGenSubtargetInfo { enum ARMArchEnum { ARMv2, ARMv2a, ARMv3, ARMv3m, ARMv4, ARMv4t, ARMv5, ARMv5t, ARMv5te, ARMv5tej, ARMv6, ARMv6k, ARMv6kz, ARMv6t2, ARMv6m, ARMv6sm, ARMv7a, ARMv7r, - ARMv7m, ARMv7em, ARMv8a, ARMv81a + ARMv7m, ARMv7em, ARMv8a, ARMv81a, ARMv82a }; /// ARMProcFamily - ARM processor family: Cortex-A8, Cortex-A9, and others. @@ -77,6 +77,7 @@ class ARMSubtarget : public ARMGenSubtargetInfo { bool HasV7Ops; bool HasV8Ops; bool HasV8_1aOps; + bool HasV8_2aOps; /// HasVFPv2, HasVFPv3, HasVFPv4, HasFPARMv8, HasNEON - Specify what /// floating point ISAs are supported. @@ -130,10 +131,12 @@ class ARMSubtarget : public ARMGenSubtargetInfo { /// Thumb. bool SupportsTailCall; - /// HasFP16 - True if subtarget supports half-precision FP (We support VFP+HF - /// only so far) + /// HasFP16 - True if subtarget supports half-precision FP conversions bool HasFP16; + /// HasFullFP16 - True if subtarget supports half-precision FP operations + bool HasFullFP16; + /// HasD16 - True if subtarget is limited to 16 double precision /// FP registers for VFPv3. bool HasD16; @@ -309,6 +312,7 @@ class ARMSubtarget : public ARMGenSubtargetInfo { bool hasV7Ops() const { return HasV7Ops; } bool hasV8Ops() const { return HasV8Ops; } bool hasV8_1aOps() const { return HasV8_1aOps; } + bool hasV8_2aOps() const { return HasV8_2aOps; } bool isCortexA5() const { return ARMProcFamily == CortexA5; } bool isCortexA7() const { return ARMProcFamily == CortexA7; } @@ -362,6 +366,7 @@ class ARMSubtarget : public ARMGenSubtargetInfo { bool hasFP16() const { return HasFP16; } bool hasD16() const { return HasD16; } + bool hasFullFP16() const { return HasFullFP16; } const Triple &getTargetTriple() const { return TargetTriple; } diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index ba144458386d..73f330877566 100644 --- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -9916,7 +9916,7 @@ extern "C" void LLVMInitializeARMAsmParser() { // flags below, that were generated by table-gen. static const struct { const unsigned Kind; - const unsigned ArchCheck; + const uint64_t ArchCheck; const FeatureBitset Features; } Extensions[] = { { ARM::AEK_CRC, Feature_HasV8, {ARM::FeatureCRC} }, @@ -9930,6 +9930,7 @@ static const struct { { ARM::AEK_SEC, Feature_HasV6K, {ARM::FeatureTrustZone} }, // FIXME: Only available in A-class, isel not predicated { ARM::AEK_VIRT, Feature_HasV7, {ARM::FeatureVirtualization} }, + { ARM::AEK_FP16, Feature_HasV8_2a, {ARM::FeatureFPARMv8, ARM::FeatureFullFP16} }, // FIXME: Unsupported extensions. 
{ ARM::AEK_OS, Feature_None, {} }, { ARM::AEK_IWMMXT, Feature_None, {} }, diff --git a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp index 7a177f87231d..3e7da07b4aad 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp @@ -750,6 +750,7 @@ void ARMTargetELFStreamer::emitArchDefaultAttributes() { case ARM::AK_ARMV8A: case ARM::AK_ARMV8_1A: + case ARM::AK_ARMV8_2A: setAttributeItem(CPU_arch_profile, ApplicationProfile, false); setAttributeItem(ARM_ISA_use, Allowed, false); setAttributeItem(THUMB_ISA_use, AllowThumb32, false); diff --git a/lib/Target/ARM/Thumb1FrameLowering.cpp b/lib/Target/ARM/Thumb1FrameLowering.cpp index f5d4cb8a3ca1..fd96af6cb6e0 100644 --- a/lib/Target/ARM/Thumb1FrameLowering.cpp +++ b/lib/Target/ARM/Thumb1FrameLowering.cpp @@ -406,11 +406,15 @@ bool Thumb1FrameLowering::needPopSpecialFixUp(const MachineFunction &MF) const { if (AFI->getArgRegsSaveSize()) return true; - bool IsV4PopReturn = false; + // FIXME: this doesn't make sense, and the following patch will remove it. + if (!STI.hasV4TOps()) return false; + + // LR cannot be encoded with Thumb1, i.e., it requires a special fix-up. for (const CalleeSavedInfo &CSI : MF.getFrameInfo()->getCalleeSavedInfo()) if (CSI.getReg() == ARM::LR) - IsV4PopReturn = true; - return IsV4PopReturn && STI.hasV4TOps() && !STI.hasV5TOps(); + return true; + + return false; } bool Thumb1FrameLowering::emitPopSpecialFixUp(MachineBasicBlock &MBB, @@ -422,22 +426,42 @@ bool Thumb1FrameLowering::emitPopSpecialFixUp(MachineBasicBlock &MBB, const ThumbRegisterInfo *RegInfo = static_cast(STI.getRegisterInfo()); - // If MBBI is a return instruction, we may be able to directly restore + // If MBBI is a return instruction, or is a tPOP followed by a return + // instruction in the successor BB, we may be able to directly restore // LR in the PC. - // This is possible if we do not need to emit any SP update. + // This is only possible with v5T ops (v4T can't change the Thumb bit via + // a POP PC instruction), and only if we do not need to emit any SP update. // Otherwise, we need a temporary register to pop the value // and copy that value into LR. auto MBBI = MBB.getFirstTerminator(); - if (!ArgRegsSaveSize && MBBI != MBB.end() && - MBBI->getOpcode() == ARM::tBX_RET) { - if (!DoIt) + bool CanRestoreDirectly = STI.hasV5TOps() && !ArgRegsSaveSize; + if (CanRestoreDirectly) { + if (MBBI != MBB.end()) + CanRestoreDirectly = (MBBI->getOpcode() == ARM::tBX_RET || + MBBI->getOpcode() == ARM::tPOP_RET); + else { + assert(MBB.back().getOpcode() == ARM::tPOP); + assert(MBB.succ_size() == 1); + if ((*MBB.succ_begin())->begin()->getOpcode() == ARM::tBX_RET) + MBBI--; // Replace the final tPOP with a tPOP_RET. + else + CanRestoreDirectly = false; + } + } + + if (CanRestoreDirectly) { + if (!DoIt || MBBI->getOpcode() == ARM::tPOP_RET) return true; MachineInstrBuilder MIB = AddDefaultPred( - BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII.get(ARM::tPOP_RET))) - .addReg(ARM::PC, RegState::Define); - MIB.copyImplicitOps(&*MBBI); - // erase the old tBX_RET instruction + BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII.get(ARM::tPOP_RET))); + // Copy implicit ops and popped registers, if any. + for (auto MO: MBBI->operands()) + if (MO.isReg() && (MO.isImplicit() || MO.isDef()) && + MO.getReg() != ARM::LR) + MIB.addOperand(MO); + MIB.addReg(ARM::PC, RegState::Define); + // Erase the old instruction (tBX_RET or tPOP). 
MBB.erase(MBBI); return true; } @@ -459,10 +483,10 @@ bool Thumb1FrameLowering::emitPopSpecialFixUp(MachineBasicBlock &MBB, if (MBBI != MBB.end()) { dl = MBBI->getDebugLoc(); auto InstUpToMBBI = MBB.end(); - // The post-decrement is on purpose here. - // We want to have the liveness right before MBBI. - while (InstUpToMBBI-- != MBBI) - UsedRegs.stepBackward(*InstUpToMBBI); + while (InstUpToMBBI != MBBI) + // The pre-decrement is on purpose here. + // We want to have the liveness right before MBBI. + UsedRegs.stepBackward(*--InstUpToMBBI); } // Look for a register that can be directly use in the POP. @@ -508,6 +532,12 @@ bool Thumb1FrameLowering::emitPopSpecialFixUp(MachineBasicBlock &MBB, .addReg(PopReg, RegState::Kill)); } + if (MBBI == MBB.end()) { + MachineInstr& Pop = MBB.back(); + assert(Pop.getOpcode() == ARM::tPOP); + Pop.RemoveOperand(Pop.findRegisterDefOperandIdx(ARM::LR)); + } + assert(PopReg && "Do not know how to get LR"); AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tPOP))) .addReg(PopReg, RegState::Define); diff --git a/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp b/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp index b73ec0bfb191..5e78762b994a 100644 --- a/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp +++ b/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp @@ -38,6 +38,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ELF.h" +#include "llvm/Support/Format.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/TargetRegistry.h" @@ -1511,14 +1512,14 @@ unsigned HexagonAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp, } void HexagonAsmParser::OutOfRange(SMLoc IDLoc, long long Val, long long Max) { - std::stringstream errStr; - errStr << "value " << Val << "(0x" << std::hex << Val << std::dec - << ") out of range: "; + std::string errStr; + raw_string_ostream ES(errStr); + ES << "value " << Val << "(" << format_hex(Val, 0) << ") out of range: "; if (Max >= 0) - errStr << "0-" << Max; + ES << "0-" << Max; else - errStr << Max << "-" << (-Max - 1); - Error(IDLoc, errStr.str().c_str()); + ES << Max << "-" << (-Max - 1); + Error(IDLoc, ES.str().c_str()); } int HexagonAsmParser::processInstruction(MCInst &Inst, diff --git a/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp b/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp index 4036650bf74b..1db59e1dd99d 100644 --- a/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp +++ b/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp @@ -471,12 +471,13 @@ extern const MCInstrDesc HexagonInsts[]; } static DecodeStatus DecodeRegisterClass(MCInst &Inst, unsigned RegNo, - const uint16_t Table[], size_t Size) { - if (RegNo < Size) { + ArrayRef Table) { + if (RegNo < Table.size()) { Inst.addOperand(MCOperand::createReg(Table[RegNo])); return MCDisassembler::Success; - } else - return MCDisassembler::Fail; + } + + return MCDisassembler::Fail; } static DecodeStatus DecodeIntRegsLow8RegisterClass(MCInst &Inst, unsigned RegNo, @@ -497,8 +498,7 @@ static DecodeStatus DecodeIntRegsRegisterClass(MCInst &Inst, unsigned RegNo, Hexagon::R25, Hexagon::R26, Hexagon::R27, Hexagon::R28, Hexagon::R29, Hexagon::R30, Hexagon::R31}; - return (DecodeRegisterClass(Inst, RegNo, IntRegDecoderTable, - sizeof(IntRegDecoderTable))); + return (DecodeRegisterClass(Inst, RegNo, IntRegDecoderTable)); } static DecodeStatus DecodeVectorRegsRegisterClass(MCInst &Inst, unsigned RegNo, @@ -513,8 +513,7 @@ static DecodeStatus 
DecodeVectorRegsRegisterClass(MCInst &Inst, unsigned RegNo, Hexagon::V25, Hexagon::V26, Hexagon::V27, Hexagon::V28, Hexagon::V29, Hexagon::V30, Hexagon::V31}; - return (DecodeRegisterClass(Inst, RegNo, VecRegDecoderTable, - sizeof(VecRegDecoderTable))); + return (DecodeRegisterClass(Inst, RegNo, VecRegDecoderTable)); } static DecodeStatus DecodeDoubleRegsRegisterClass(MCInst &Inst, unsigned RegNo, @@ -526,8 +525,7 @@ static DecodeStatus DecodeDoubleRegsRegisterClass(MCInst &Inst, unsigned RegNo, Hexagon::D8, Hexagon::D9, Hexagon::D10, Hexagon::D11, Hexagon::D12, Hexagon::D13, Hexagon::D14, Hexagon::D15}; - return (DecodeRegisterClass(Inst, RegNo >> 1, DoubleRegDecoderTable, - sizeof(DoubleRegDecoderTable))); + return (DecodeRegisterClass(Inst, RegNo >> 1, DoubleRegDecoderTable)); } static DecodeStatus DecodeVecDblRegsRegisterClass(MCInst &Inst, unsigned RegNo, @@ -539,8 +537,7 @@ static DecodeStatus DecodeVecDblRegsRegisterClass(MCInst &Inst, unsigned RegNo, Hexagon::W8, Hexagon::W9, Hexagon::W10, Hexagon::W11, Hexagon::W12, Hexagon::W13, Hexagon::W14, Hexagon::W15}; - return (DecodeRegisterClass(Inst, RegNo >> 1, VecDblRegDecoderTable, - sizeof(VecDblRegDecoderTable))); + return (DecodeRegisterClass(Inst, RegNo >> 1, VecDblRegDecoderTable)); } static DecodeStatus DecodePredRegsRegisterClass(MCInst &Inst, unsigned RegNo, @@ -549,8 +546,7 @@ static DecodeStatus DecodePredRegsRegisterClass(MCInst &Inst, unsigned RegNo, static const uint16_t PredRegDecoderTable[] = {Hexagon::P0, Hexagon::P1, Hexagon::P2, Hexagon::P3}; - return (DecodeRegisterClass(Inst, RegNo, PredRegDecoderTable, - sizeof(PredRegDecoderTable))); + return (DecodeRegisterClass(Inst, RegNo, PredRegDecoderTable)); } static DecodeStatus DecodeVecPredRegsRegisterClass(MCInst &Inst, unsigned RegNo, @@ -559,8 +555,7 @@ static DecodeStatus DecodeVecPredRegsRegisterClass(MCInst &Inst, unsigned RegNo, static const uint16_t VecPredRegDecoderTable[] = {Hexagon::Q0, Hexagon::Q1, Hexagon::Q2, Hexagon::Q3}; - return (DecodeRegisterClass(Inst, RegNo, VecPredRegDecoderTable, - sizeof(VecPredRegDecoderTable))); + return (DecodeRegisterClass(Inst, RegNo, VecPredRegDecoderTable)); } static DecodeStatus DecodeCtrRegsRegisterClass(MCInst &Inst, unsigned RegNo, @@ -573,7 +568,7 @@ static DecodeStatus DecodeCtrRegsRegisterClass(MCInst &Inst, unsigned RegNo, Hexagon::CS0, Hexagon::CS1, Hexagon::UPCL, Hexagon::UPC }; - if (RegNo >= sizeof(CtrlRegDecoderTable) / sizeof(CtrlRegDecoderTable[0])) + if (RegNo >= array_lengthof(CtrlRegDecoderTable)) return MCDisassembler::Fail; if (CtrlRegDecoderTable[RegNo] == Hexagon::NoRegister) @@ -597,7 +592,7 @@ static DecodeStatus DecodeCtrRegs64RegisterClass(MCInst &Inst, unsigned RegNo, Hexagon::UPC, Hexagon::NoRegister }; - if (RegNo >= sizeof(CtrlReg64DecoderTable) / sizeof(CtrlReg64DecoderTable[0])) + if (RegNo >= array_lengthof(CtrlReg64DecoderTable)) return MCDisassembler::Fail; if (CtrlReg64DecoderTable[RegNo] == Hexagon::NoRegister) @@ -784,7 +779,7 @@ static DecodeStatus brtargetDecoder(MCInst &MI, unsigned tmp, uint64_t Address, // Please note that the instructions must be ordered in the descending order // of their opcode. 
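The Hexagon disassembler hunks above replace the (table, size) parameter pair of DecodeRegisterClass with a single ArrayRef. Besides shortening every call site, this tightens the bound check: the old callers passed sizeof(Table), a byte count, so the RegNo < Size test accepted indices up to twice the number of uint16_t entries (presumably harmless only because the bit fields feeding RegNo are narrow). The same cleanup swaps the hand-rolled sizeof(A)/sizeof(A[0]) idiom for array_lengthof further down. A minimal sketch of the new pattern, with made-up names and a stand-in table rather than the real Hexagon code:

    #include "llvm/ADT/ArrayRef.h"
    #include <cstdint>

    // Bound-checked table lookup; the real decoder adds the register to an
    // MCInst and returns MCDisassembler::Success / Fail instead of a bool.
    static bool decodeFromTable(unsigned RegNo, llvm::ArrayRef<uint16_t> Table,
                                uint16_t &Reg) {
      if (RegNo >= Table.size())   // element count, not a sizeof() byte count
        return false;
      Reg = Table[RegNo];
      return true;
    }

    static const uint16_t SomeRegTable[] = {1, 2, 3, 4}; // stand-in values
    // A C array converts to ArrayRef implicitly, so a caller writes:
    //   uint16_t Reg;
    //   bool OK = decodeFromTable(RegNo, SomeRegTable, Reg);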
// HexagonII::INST_ICLASS_ST -static unsigned int StoreConditionalOpcodeData[][2] = { +static const unsigned int StoreConditionalOpcodeData[][2] = { {S4_pstorerdfnew_abs, 0xafc02084}, {S4_pstorerdtnew_abs, 0xafc02080}, {S4_pstorerdf_abs, 0xafc00084}, @@ -830,18 +825,16 @@ static unsigned int LoadStoreOpcodeData[][2] = {{L4_loadrd_abs, 0x49c00000}, {S2_storerfabs, 0x48600000}, {S2_storerhabs, 0x48400000}, {S2_storerbabs, 0x48000000}}; -static int NumCondS = - sizeof(StoreConditionalOpcodeData) / sizeof(StoreConditionalOpcodeData[0]); -static int NumLS = sizeof(LoadStoreOpcodeData) / sizeof(LoadStoreOpcodeData[0]); +static const size_t NumCondS = array_lengthof(StoreConditionalOpcodeData); +static const size_t NumLS = array_lengthof(LoadStoreOpcodeData); static DecodeStatus decodeSpecial(MCInst &MI, uint32_t insn) { unsigned MachineOpcode = 0; unsigned LLVMOpcode = 0; - int i; if ((insn & HexagonII::INST_ICLASS_MASK) == HexagonII::INST_ICLASS_ST) { - for (i = 0; i < NumCondS; ++i) { + for (size_t i = 0; i < NumCondS; ++i) { if ((insn & StoreConditionalOpcodeData[i][1]) == StoreConditionalOpcodeData[i][1]) { MachineOpcode = StoreConditionalOpcodeData[i][1]; @@ -851,7 +844,7 @@ static DecodeStatus decodeSpecial(MCInst &MI, uint32_t insn) { } } if ((insn & HexagonII::INST_ICLASS_MASK) == HexagonII::INST_ICLASS_LD_ST_2) { - for (i = 0; i < NumLS; ++i) { + for (size_t i = 0; i < NumLS; ++i) { if ((insn & LoadStoreOpcodeData[i][1]) == LoadStoreOpcodeData[i][1]) { MachineOpcode = LoadStoreOpcodeData[i][1]; LLVMOpcode = LoadStoreOpcodeData[i][0]; diff --git a/lib/Target/Hexagon/Hexagon.h b/lib/Target/Hexagon/Hexagon.h index d360be2aa5b2..ed7d9578902e 100644 --- a/lib/Target/Hexagon/Hexagon.h +++ b/lib/Target/Hexagon/Hexagon.h @@ -47,15 +47,8 @@ #include "llvm/Target/TargetMachine.h" namespace llvm { - class MachineInstr; - class MCInst; - class MCInstrInfo; - class HexagonAsmPrinter; class HexagonTargetMachine; - void HexagonLowerToMC(const MachineInstr *MI, MCInst &MCI, - HexagonAsmPrinter &AP); - /// \brief Creates a Hexagon-specific Target Transformation Info pass. ImmutablePass *createHexagonTargetTransformInfoPass(const HexagonTargetMachine *TM); } // end namespace llvm; diff --git a/lib/Target/Hexagon/HexagonAsmPrinter.cpp b/lib/Target/Hexagon/HexagonAsmPrinter.cpp index 530dab859d5f..19769258ee89 100644 --- a/lib/Target/Hexagon/HexagonAsmPrinter.cpp +++ b/lib/Target/Hexagon/HexagonAsmPrinter.cpp @@ -56,6 +56,11 @@ using namespace llvm; +namespace llvm { + void HexagonLowerToMC(const MCInstrInfo &MCII, const MachineInstr *MI, + MCInst &MCB, HexagonAsmPrinter &AP); +} + #define DEBUG_TYPE "asm-printer" static cl::opt AlignCalls( @@ -179,6 +184,7 @@ bool HexagonAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, /// void HexagonAsmPrinter::EmitInstruction(const MachineInstr *MI) { MCInst MCB = HexagonMCInstrInfo::createBundle(); + const MCInstrInfo &MCII = *Subtarget->getInstrInfo(); if (MI->isBundle()) { const MachineBasicBlock* MBB = MI->getParent(); @@ -190,25 +196,23 @@ void HexagonAsmPrinter::EmitInstruction(const MachineInstr *MI) { MII->getOpcode() == TargetOpcode::IMPLICIT_DEF) ++IgnoreCount; else { - HexagonLowerToMC(&*MII, MCB, *this); + HexagonLowerToMC(MCII, &*MII, MCB, *this); } } } else { - HexagonLowerToMC(MI, MCB, *this); + HexagonLowerToMC(MCII, MI, MCB, *this); HexagonMCInstrInfo::padEndloop(OutStreamer->getContext(), MCB); } // Examine the packet and try to find instructions that can be converted // to compounds. 
- HexagonMCInstrInfo::tryCompound(*Subtarget->getInstrInfo(), - OutStreamer->getContext(), MCB); + HexagonMCInstrInfo::tryCompound(MCII, OutStreamer->getContext(), MCB); // Examine the packet and convert pairs of instructions to duplex // instructions when possible. SmallVector possibleDuplexes; - possibleDuplexes = HexagonMCInstrInfo::getDuplexPossibilties( - *Subtarget->getInstrInfo(), MCB); - HexagonMCShuffle(*Subtarget->getInstrInfo(), *Subtarget, - OutStreamer->getContext(), MCB, possibleDuplexes); + possibleDuplexes = HexagonMCInstrInfo::getDuplexPossibilties(MCII, MCB); + HexagonMCShuffle(MCII, *Subtarget, OutStreamer->getContext(), MCB, + possibleDuplexes); EmitToStreamer(*OutStreamer, MCB); } diff --git a/lib/Target/Hexagon/HexagonCFGOptimizer.cpp b/lib/Target/Hexagon/HexagonCFGOptimizer.cpp index 96bb61750805..efafdd007289 100644 --- a/lib/Target/Hexagon/HexagonCFGOptimizer.cpp +++ b/lib/Target/Hexagon/HexagonCFGOptimizer.cpp @@ -186,13 +186,11 @@ bool HexagonCFGOptimizer::runOnMachineFunction(MachineFunction &Fn) { if (case1 || case2) { InvertAndChangeJumpTarget(MI, UncondTarget); - MBB->removeSuccessor(JumpAroundTarget); - MBB->addSuccessor(UncondTarget); + MBB->replaceSuccessor(JumpAroundTarget, UncondTarget); // Remove the unconditional branch in LayoutSucc. LayoutSucc->erase(LayoutSucc->begin()); - LayoutSucc->removeSuccessor(UncondTarget); - LayoutSucc->addSuccessor(JumpAroundTarget); + LayoutSucc->replaceSuccessor(UncondTarget, JumpAroundTarget); // This code performs the conversion for case 2, which moves // the block to the fall-thru case (BB3 in the code above). diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp index 969edf6d5572..04f5b6649293 100644 --- a/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -2863,7 +2863,7 @@ TargetLowering::AtomicExpansionKind HexagonTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { // Do not expand loads and stores that don't exceed 64 bits. return LI->getType()->getPrimitiveSizeInBits() > 64 - ? AtomicExpansionKind::LLSC + ? AtomicExpansionKind::LLOnly : AtomicExpansionKind::None; } diff --git a/lib/Target/Hexagon/HexagonInstrInfoV4.td b/lib/Target/Hexagon/HexagonInstrInfoV4.td index 65612c590bfe..7389a40f4a4c 100644 --- a/lib/Target/Hexagon/HexagonInstrInfoV4.td +++ b/lib/Target/Hexagon/HexagonInstrInfoV4.td @@ -3864,26 +3864,6 @@ let AddedComplexity = 100 in { def: Stoream_pat; } -// Map from Pd = load(globaladdress) -> Rd = memb(globaladdress), Pd = Rd -let AddedComplexity = 100 in -def : Pat <(i1 (load (HexagonCONST32_GP tglobaladdr:$global))), - (i1 (C2_tfrrp (i32 (L2_loadrbgp tglobaladdr:$global))))>; - -// Transfer global address into a register -let isExtended = 1, opExtendable = 1, AddedComplexity=50, isMoveImm = 1, -isAsCheapAsAMove = 1, isReMaterializable = 1, isCodeGenOnly = 1 in -def TFRI_V4 : ALU32_ri<(outs IntRegs:$dst), (ins s16Ext:$src1), - "$dst = #$src1", - [(set IntRegs:$dst, (HexagonCONST32 tglobaladdr:$src1))]>; - -// Transfer a block address into a register -def : Pat<(HexagonCONST32_GP tblockaddress:$src1), - (TFRI_V4 tblockaddress:$src1)>; - -let AddedComplexity = 50 in -def : Pat<(HexagonCONST32_GP tglobaladdr:$src1), - (TFRI_V4 tglobaladdr:$src1)>; - // i8/i16/i32 -> i64 loads // We need a complexity of 120 here to override preceding handling of // zextload. 
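The HexagonCFGOptimizer hunk above folds the removeSuccessor/addSuccessor pair into a single MachineBasicBlock::replaceSuccessor call; the MipsLongBranch hunk further down makes the same substitution. Besides being shorter, replaceSuccessor keeps the successor's slot in the list and, as far as I can tell, carries the edge weight over to the new target, which the two-call sequence would not. A hedged sketch of the idiom (the helper name is mine):

    #include "llvm/CodeGen/MachineBasicBlock.h"
    using namespace llvm;

    // Redirect the CFG edge MBB -> Old so that it points at New instead.
    static void redirectEdge(MachineBasicBlock *MBB, MachineBasicBlock *Old,
                             MachineBasicBlock *New) {
      // Old idiom, loses the edge weight:
      //   MBB->removeSuccessor(Old);
      //   MBB->addSuccessor(New);
      // New idiom, one call:
      MBB->replaceSuccessor(Old, New);
    }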
diff --git a/lib/Target/Hexagon/HexagonMCInstLower.cpp b/lib/Target/Hexagon/HexagonMCInstLower.cpp index 82a9b23149c4..86d9e19c0547 100644 --- a/lib/Target/Hexagon/HexagonMCInstLower.cpp +++ b/lib/Target/Hexagon/HexagonMCInstLower.cpp @@ -26,39 +26,71 @@ using namespace llvm; -static MCOperand GetSymbolRef(const MachineOperand& MO, const MCSymbol* Symbol, - HexagonAsmPrinter& Printer) { +namespace llvm { + void HexagonLowerToMC(const MCInstrInfo &MCII, const MachineInstr *MI, + MCInst &MCB, HexagonAsmPrinter &AP); +} + +static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol, + HexagonAsmPrinter &Printer) { MCContext &MC = Printer.OutContext; const MCExpr *ME; - ME = MCSymbolRefExpr::create(Symbol, MCSymbolRefExpr::VK_None, MC); + // Populate the relocation type based on Hexagon target flags + // set on an operand + MCSymbolRefExpr::VariantKind RelocationType; + switch (MO.getTargetFlags()) { + default: + RelocationType = MCSymbolRefExpr::VK_None; + break; + case HexagonII::MO_PCREL: + RelocationType = MCSymbolRefExpr::VK_Hexagon_PCREL; + break; + case HexagonII::MO_GOT: + RelocationType = MCSymbolRefExpr::VK_GOT; + break; + case HexagonII::MO_LO16: + RelocationType = MCSymbolRefExpr::VK_Hexagon_LO16; + break; + case HexagonII::MO_HI16: + RelocationType = MCSymbolRefExpr::VK_Hexagon_HI16; + break; + case HexagonII::MO_GPREL: + RelocationType = MCSymbolRefExpr::VK_Hexagon_GPREL; + break; + } + + ME = MCSymbolRefExpr::create(Symbol, RelocationType, MC); if (!MO.isJTI() && MO.getOffset()) ME = MCBinaryExpr::createAdd(ME, MCConstantExpr::create(MO.getOffset(), MC), MC); - return (MCOperand::createExpr(ME)); + return MCOperand::createExpr(ME); } // Create an MCInst from a MachineInstr -void llvm::HexagonLowerToMC(MachineInstr const* MI, MCInst& MCB, - HexagonAsmPrinter& AP) { - if(MI->getOpcode() == Hexagon::ENDLOOP0){ +void llvm::HexagonLowerToMC(const MCInstrInfo &MCII, const MachineInstr *MI, + MCInst &MCB, HexagonAsmPrinter &AP) { + if (MI->getOpcode() == Hexagon::ENDLOOP0) { HexagonMCInstrInfo::setInnerLoop(MCB); return; } - if(MI->getOpcode() == Hexagon::ENDLOOP1){ + if (MI->getOpcode() == Hexagon::ENDLOOP1) { HexagonMCInstrInfo::setOuterLoop(MCB); return; } - MCInst* MCI = new (AP.OutContext) MCInst; + MCInst *MCI = new (AP.OutContext) MCInst; MCI->setOpcode(MI->getOpcode()); assert(MCI->getOpcode() == static_cast(MI->getOpcode()) && "MCI opcode should have been set on construction"); + bool MustExtend = false; for (unsigned i = 0, e = MI->getNumOperands(); i < e; i++) { const MachineOperand &MO = MI->getOperand(i); MCOperand MCO; + if (MO.getTargetFlags() & HexagonII::HMOTF_ConstExtended) + MustExtend = true; switch (MO.getType()) { default: @@ -107,5 +139,7 @@ void llvm::HexagonLowerToMC(MachineInstr const* MI, MCInst& MCB, MCI->addOperand(MCO); } + HexagonMCInstrInfo::extendIfNeeded(AP.OutContext, MCII, MCB, *MCI, + MustExtend); MCB.addOperand(MCOperand::createInst(MCI)); } diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp index 2f3521bfd717..b73af8249cb5 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp @@ -13,12 +13,12 @@ #include "MCTargetDesc/HexagonBaseInfo.h" #include "MCTargetDesc/HexagonMCInstrInfo.h" #include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCAsmLayout.h" #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCELFObjectWriter.h" #include "llvm/MC/MCFixupKindInfo.h" 
#include "llvm/MC/MCInstrInfo.h" -#include "llvm/MC/MCAsmLayout.h" #include "llvm/Support/Debug.h" #include "llvm/Support/TargetRegistry.h" diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp index 34817cd98f2f..e6194f61a6ba 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp @@ -81,8 +81,7 @@ static const std::pair opcodeData[] = { std::make_pair((unsigned)V4_SS2_storewi1, 4352)}; static std::map - subinstOpcodeMap(opcodeData, - opcodeData + sizeof(opcodeData) / sizeof(opcodeData[0])); + subinstOpcodeMap(std::begin(opcodeData), std::end(opcodeData)); bool HexagonMCInstrInfo::isDuplexPairMatch(unsigned Ga, unsigned Gb) { switch (Ga) { diff --git a/lib/Target/Mips/Disassembler/MipsDisassembler.cpp b/lib/Target/Mips/Disassembler/MipsDisassembler.cpp index f9601839b44c..716a96e2c46b 100644 --- a/lib/Target/Mips/Disassembler/MipsDisassembler.cpp +++ b/lib/Target/Mips/Disassembler/MipsDisassembler.cpp @@ -229,6 +229,13 @@ static DecodeStatus DecodeBranchTargetMM(MCInst &Inst, uint64_t Address, const void *Decoder); +// DecodeBranchTarget26MM - Decode microMIPS branch offset, which is +// shifted left by 1 bit. +static DecodeStatus DecodeBranchTarget26MM(MCInst &Inst, + unsigned Offset, + uint64_t Address, + const void *Decoder); + // DecodeJumpTargetMM - Decode microMIPS jump target, which is // shifted left by 1 bit. static DecodeStatus DecodeJumpTargetMM(MCInst &Inst, @@ -1863,6 +1870,16 @@ static DecodeStatus DecodeBranchTargetMM(MCInst &Inst, return MCDisassembler::Success; } +static DecodeStatus DecodeBranchTarget26MM(MCInst &Inst, + unsigned Offset, + uint64_t Address, + const void *Decoder) { + int32_t BranchOffset = SignExtend32<26>(Offset) << 1; + + Inst.addOperand(MCOperand::createImm(BranchOffset)); + return MCDisassembler::Success; +} + static DecodeStatus DecodeJumpTargetMM(MCInst &Inst, unsigned Insn, uint64_t Address, diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp index 86a5d5882184..ed917a4daba3 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp @@ -350,6 +350,23 @@ getBranchTarget26OpValue(const MCInst &MI, unsigned OpNo, return 0; } +/// getBranchTarget26OpValueMM - Return binary encoding of the branch +/// target operand. If the machine operand requires relocation, +/// record the relocation and return zero. +unsigned MipsMCCodeEmitter::getBranchTarget26OpValueMM( + const MCInst &MI, unsigned OpNo, SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + + const MCOperand &MO = MI.getOperand(OpNo); + + // If the destination is an immediate, divide by 2. + if (MO.isImm()) + return MO.getImm() >> 1; + + // TODO: Push 26 PC fixup. + return 0; +} + /// getJumpOffset16OpValue - Return binary encoding of the jump /// target operand. If the machine operand requires relocation, /// record the relocation and return zero. 
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h index c2f4b6a72bbf..eb48914b0649 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h +++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h @@ -137,6 +137,13 @@ class MipsMCCodeEmitter : public MCCodeEmitter { SmallVectorImpl &Fixups, const MCSubtargetInfo &STI) const; + // getBranchTarget26OpValueMM - Return binary encoding of the branch + // offset operand. If the machine operand requires relocation, + // record the relocation and return zero. + unsigned getBranchTarget26OpValueMM(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + // getJumpOffset16OpValue - Return binary encoding of the jump // offset operand. If the machine operand requires relocation, // record the relocation and return zero. diff --git a/lib/Target/Mips/MicroMips32r6InstrFormats.td b/lib/Target/Mips/MicroMips32r6InstrFormats.td index c4cdb0c2fadd..349b3b88a07a 100644 --- a/lib/Target/Mips/MicroMips32r6InstrFormats.td +++ b/lib/Target/Mips/MicroMips32r6InstrFormats.td @@ -796,3 +796,65 @@ class POOL32A_WRPGPR_WSBH_FM_MMR6 funct> : MipsR6Inst { let Inst{15-6} = funct; let Inst{5-0} = 0x3c; } + +class POOL32F_RECIP_ROUND_FM_MMR6 fmt, bits<8> funct> + : MMR6Arch, MipsR6Inst { + bits<5> ft; + bits<5> fs; + + bits<32> Inst; + + let Inst{31-26} = 0b010101; + let Inst{25-21} = ft; + let Inst{20-16} = fs; + let Inst{15} = 0; + let Inst{14} = fmt; + let Inst{13-6} = funct; + let Inst{5-0} = 0b111011; +} + +class POOL32F_RINT_FM_MMR6 fmt> + : MMR6Arch, MipsR6Inst { + bits<5> fs; + bits<5> fd; + + bits<32> Inst; + + let Inst{31-26} = 0b010101; + let Inst{25-21} = fs; + let Inst{20-16} = fd; + let Inst{15-11} = 0; + let Inst{10-9} = fmt; + let Inst{8-0} = 0b000100000; +} + +class POOL32F_SEL_FM_MMR6 fmt, bits<9> funct> + : MMR6Arch, MipsR6Inst { + bits<5> ft; + bits<5> fs; + bits<5> fd; + + bits<32> Inst; + + let Inst{31-26} = 0b010101; + let Inst{25-21} = ft; + let Inst{20-16} = fs; + let Inst{15-11} = fd; + let Inst{10-9} = fmt; + let Inst{8-0} = funct; +} + +class POOL32F_CLASS_FM_MMR6 fmt, bits<9> funct> + : MMR6Arch, MipsR6Inst { + bits<5> fs; + bits<5> fd; + + bits<32> Inst; + + let Inst{31-26} = 0b010101; + let Inst{25-21} = fs; + let Inst{20-16} = fd; + let Inst{15-11} = 0b00000; + let Inst{10-9} = fmt; + let Inst{8-0} = funct; +} diff --git a/lib/Target/Mips/MicroMips32r6InstrInfo.td b/lib/Target/Mips/MicroMips32r6InstrInfo.td index cabaa53b2b1b..8c744d8924bb 100644 --- a/lib/Target/Mips/MicroMips32r6InstrInfo.td +++ b/lib/Target/Mips/MicroMips32r6InstrInfo.td @@ -11,6 +11,13 @@ // //===----------------------------------------------------------------------===// +def brtarget26_mm : Operand { + let EncoderMethod = "getBranchTarget26OpValueMM"; + let OperandType = "OPERAND_PCREL"; + let DecoderMethod = "DecodeBranchTarget26MM"; + let ParserMatchClass = MipsJumpTargetAsmOperand; +} + //===----------------------------------------------------------------------===// // // Instruction Encodings @@ -125,6 +132,26 @@ class LLE_MMR6_ENC : LOAD_WORD_EVA_FM_MMR6<0b110>; class LWE_MMR6_ENC : LOAD_WORD_EVA_FM_MMR6<0b111>; class LW_MMR6_ENC : LOAD_WORD_FM_MMR6; class LUI_MMR6_ENC : LOAD_UPPER_IMM_FM_MMR6; +class RECIP_S_MMR6_ENC : POOL32F_RECIP_ROUND_FM_MMR6<"recip.s", 0, 0b01001000>; +class RECIP_D_MMR6_ENC : POOL32F_RECIP_ROUND_FM_MMR6<"recip.d", 1, 0b01001000>; +class RINT_S_MMR6_ENC : POOL32F_RINT_FM_MMR6<"rint.s", 0>; +class RINT_D_MMR6_ENC : 
POOL32F_RINT_FM_MMR6<"rint.d", 1>; +class ROUND_L_S_MMR6_ENC : POOL32F_RECIP_ROUND_FM_MMR6<"round.l.s", 0, + 0b11001100>; +class ROUND_L_D_MMR6_ENC : POOL32F_RECIP_ROUND_FM_MMR6<"round.l.d", 1, + 0b11001100>; +class ROUND_W_S_MMR6_ENC : POOL32F_RECIP_ROUND_FM_MMR6<"round.w.s", 0, + 0b11101100>; +class ROUND_W_D_MMR6_ENC : POOL32F_RECIP_ROUND_FM_MMR6<"round.w.d", 1, + 0b11101100>; +class SEL_S_MMR6_ENC : POOL32F_SEL_FM_MMR6<"sel.s", 0, 0b010111000>; +class SEL_D_MMR6_ENC : POOL32F_SEL_FM_MMR6<"sel.d", 1, 0b010111000>; +class SELEQZ_S_MMR6_ENC : POOL32F_SEL_FM_MMR6<"seleqz.s", 0, 0b000111000>; +class SELEQZ_D_MMR6_ENC : POOL32F_SEL_FM_MMR6<"seleqz.d", 1, 0b000111000>; +class SELENZ_S_MMR6_ENC : POOL32F_SEL_FM_MMR6<"selenz.s", 0, 0b001111000>; +class SELENZ_D_MMR6_ENC : POOL32F_SEL_FM_MMR6<"selenz.d", 1, 0b001111000>; +class CLASS_S_MMR6_ENC : POOL32F_CLASS_FM_MMR6<"class.s", 0, 0b001100000>; +class CLASS_D_MMR6_ENC : POOL32F_CLASS_FM_MMR6<"class.d", 1, 0b001100000>; class ADDU16_MMR6_ENC : POOL16A_ADDU16_FM_MMR6; class AND16_MMR6_ENC : POOL16C_AND16_FM_MMR6; @@ -238,11 +265,11 @@ class BC_MMR6_DESC_BASE bit isBarrier = 1; } -class BALC_MMR6_DESC : BC_MMR6_DESC_BASE<"balc", brtarget26> { +class BALC_MMR6_DESC : BC_MMR6_DESC_BASE<"balc", brtarget26_mm> { bit isCall = 1; list Defs = [RA]; } -class BC_MMR6_DESC : BC_MMR6_DESC_BASE<"bc", brtarget26>; +class BC_MMR6_DESC : BC_MMR6_DESC_BASE<"bc", brtarget26_mm>; class BC16_MMR6_DESC : MicroMipsInst16<(outs), (ins brtarget10_mm:$offset), !strconcat("bc16", "\t$offset"), [], @@ -717,6 +744,33 @@ class RSQRT_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"rsqrt.s", FGR32Opnd, FGR32Opnd, II_TRUNC>; class RSQRT_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"rsqrt.d", FGR32Opnd, AFGR64Opnd, II_TRUNC>; +class RECIP_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"recip.s", FGR32Opnd, + FGR32Opnd, II_ROUND>; +class RECIP_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"recip.d", FGR32Opnd, FGR32Opnd, + II_ROUND>; +class ROUND_L_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"round.l.s", FGR64Opnd, + FGR32Opnd, II_ROUND>; +class ROUND_L_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"round.l.d", FGR64Opnd, + FGR64Opnd, II_ROUND>; +class ROUND_W_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"round.w.s", FGR32Opnd, + FGR32Opnd, II_ROUND>; +class ROUND_W_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"round.w.d", FGR64Opnd, + FGR64Opnd, II_ROUND>; + +class SEL_S_MMR6_DESC : COP1_SEL_DESC_BASE<"sel.s", FGR32Opnd>; +class SEL_D_MMR6_DESC : COP1_SEL_DESC_BASE<"sel.d", FGR64Opnd> { + // We must insert a SUBREG_TO_REG around $fd_in + bit usesCustomInserter = 1; +} + +class SELEQZ_S_MMR6_DESC : SELEQNEZ_DESC_BASE<"seleqz.s", FGR32Opnd>; +class SELEQZ_D_MMR6_DESC : SELEQNEZ_DESC_BASE<"seleqz.d", FGR64Opnd>; +class SELENZ_S_MMR6_DESC : SELEQNEZ_DESC_BASE<"selnez.s", FGR32Opnd>; +class SELENZ_D_MMR6_DESC : SELEQNEZ_DESC_BASE<"selnez.d", FGR64Opnd>; +class RINT_S_MMR6_DESC : CLASS_RINT_DESC_BASE<"rint.s", FGR32Opnd>; +class RINT_D_MMR6_DESC : CLASS_RINT_DESC_BASE<"rint.d", FGR64Opnd>; +class CLASS_S_MMR6_DESC : CLASS_RINT_DESC_BASE<"class.s", FGR32Opnd>; +class CLASS_D_MMR6_DESC : CLASS_RINT_DESC_BASE<"class.d", FGR64Opnd>; class STORE_MMR6_DESC_BASE : Store, MMR6Arch { @@ -1114,6 +1168,34 @@ def SUBU16_MMR6 : StdMMR6Rel, SUBU16_MMR6_DESC, SUBU16_MMR6_ENC, ISA_MICROMIPS32R6; def XOR16_MMR6 : StdMMR6Rel, XOR16_MMR6_DESC, XOR16_MMR6_ENC, ISA_MICROMIPS32R6; +def RECIP_S_MMR6 : StdMMR6Rel, RECIP_S_MMR6_ENC, RECIP_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def RECIP_D_MMR6 : StdMMR6Rel, RECIP_D_MMR6_ENC, RECIP_D_MMR6_DESC, ISA_MICROMIPS32R6; +def RINT_S_MMR6 : 
StdMMR6Rel, RINT_S_MMR6_ENC, RINT_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def RINT_D_MMR6 : StdMMR6Rel, RINT_D_MMR6_ENC, RINT_D_MMR6_DESC, ISA_MICROMIPS32R6; +def ROUND_L_S_MMR6 : StdMMR6Rel, ROUND_L_S_MMR6_ENC, ROUND_L_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def ROUND_L_D_MMR6 : StdMMR6Rel, ROUND_L_D_MMR6_ENC, ROUND_L_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def ROUND_W_S_MMR6 : StdMMR6Rel, ROUND_W_S_MMR6_ENC, ROUND_W_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def ROUND_W_D_MMR6 : StdMMR6Rel, ROUND_W_D_MMR6_ENC, ROUND_W_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def SEL_S_MMR6 : StdMMR6Rel, SEL_S_MMR6_ENC, SEL_S_MMR6_DESC, ISA_MICROMIPS32R6; +def SEL_D_MMR6 : StdMMR6Rel, SEL_D_MMR6_ENC, SEL_D_MMR6_DESC, ISA_MICROMIPS32R6; +def SELEQZ_S_MMR6 : StdMMR6Rel, SELEQZ_S_MMR6_ENC, SELEQZ_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def SELEQZ_D_MMR6 : StdMMR6Rel, SELEQZ_D_MMR6_ENC, SELEQZ_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def SELENZ_S_MMR6 : StdMMR6Rel, SELENZ_S_MMR6_ENC, SELENZ_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def SELENZ_D_MMR6 : StdMMR6Rel, SELENZ_D_MMR6_ENC, SELENZ_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def CLASS_S_MMR6 : StdMMR6Rel, CLASS_S_MMR6_ENC, CLASS_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def CLASS_D_MMR6 : StdMMR6Rel, CLASS_D_MMR6_ENC, CLASS_D_MMR6_DESC, + ISA_MICROMIPS32R6; } //===----------------------------------------------------------------------===// diff --git a/lib/Target/Mips/MicroMipsDSPInstrFormats.td b/lib/Target/Mips/MicroMipsDSPInstrFormats.td index 65c8303f25fe..f24f80282b5e 100644 --- a/lib/Target/Mips/MicroMipsDSPInstrFormats.td +++ b/lib/Target/Mips/MicroMipsDSPInstrFormats.td @@ -141,3 +141,74 @@ class POOL32A_1RIMM5AC_FMT funct> : MMDSPInst { let Inst{13-6} = funct; let Inst{5-0} = 0b111100; } + +class POOL32A_2RSA5_FMT op> : MMDSPInst { + bits<5> rt; + bits<5> rs; + bits<5> sa; + + let Inst{31-26} = 0b000000; + let Inst{25-21} = rt; + let Inst{20-16} = rs; + let Inst{15-11} = sa; + let Inst{10-0} = op; +} + +class POOL32A_1RMEMB0_FMT funct> : MMDSPInst { + bits<5> index; + bits<5> base; + bits<5> rd; + + let Inst{31-26} = 0; + let Inst{25-21} = index; + let Inst{20-16} = base; + let Inst{15-11} = rd; + let Inst{10} = 0b0; + let Inst{9-0} = funct; +} + +class POOL32A_1RAC_FMT funct> : MMDSPInst { + bits<5> rs; + bits<2> ac; + + let Inst{31-26} = 0; + let Inst{25-21} = 0; + let Inst{20-16} = rs; + let Inst{15-14} = ac; + let Inst{13-6} = funct; + let Inst{5-0} = 0b111100; +} + +class POOL32A_1RMASK7_FMT op> : MMDSPInst { + bits<5> rt; + bits<7> mask; + + let Inst{31-26} = 0b000000; + let Inst{25-21} = rt; + let Inst{20-14} = mask; + let Inst{13-6} = op; + let Inst{5-0} = 0b111100; +} + +class POOL32A_1RIMM10_FMT op> : MMDSPInst { + bits<5> rd; + bits<10> imm; + + let Inst{31-26} = 0; + let Inst{25-16} = imm; + let Inst{15-11} = rd; + let Inst{10} = 0; + let Inst{9-0} = op; +} + +class POOL32A_1RIMM8_FMT op> : MMDSPInst { + bits<5> rt; + bits<8> imm; + + let Inst{31-26} = 0; + let Inst{25-21} = rt; + let Inst{20-13} = imm; + let Inst{12} = 0; + let Inst{11-6} = op; + let Inst{5-0} = 0b111100; +} diff --git a/lib/Target/Mips/MicroMipsDSPInstrInfo.td b/lib/Target/Mips/MicroMipsDSPInstrInfo.td index b2e5ec61c8b4..9b4fb6853180 100644 --- a/lib/Target/Mips/MicroMipsDSPInstrInfo.td +++ b/lib/Target/Mips/MicroMipsDSPInstrInfo.td @@ -120,6 +120,35 @@ class MULQ_RS_PH_MM_ENC : POOL32A_3RB0_FMT<"mulq_rs.ph", 0b0100010101>; class MULQ_RS_W_MMR2_ENC : POOL32A_3RB0_FMT<"mulq_rs.w", 0b0110010101>; class MULQ_S_PH_MMR2_ENC : POOL32A_3RB0_FMT<"mulq_s.ph", 0b0101010101>; class MULQ_S_W_MMR2_ENC : POOL32A_3RB0_FMT<"mulq_s.w", 
0b0111010101>; +class PRECR_QB_PH_MMR2_ENC : POOL32A_3RB0_FMT<"precr.qb.ph", 0b0001101101>; +class PRECR_SRA_PH_W_MMR2_ENC + : POOL32A_2RSA5_FMT<"precr_sra.ph.w", 0b01111001101>; +class PRECR_SRA_R_PH_W_MMR2_ENC + : POOL32A_2RSA5_FMT<"precr_sra_r.ph.w", 0b11111001101>; +class PRECRQ_PH_W_MM_ENC : POOL32A_3RB0_FMT<"precrq.ph.w", 0b0011101101>; +class PRECRQ_QB_PH_MM_ENC : POOL32A_3RB0_FMT<"precrq.qb.ph", 0b0010101101>; +class PRECRQU_S_QB_PH_MM_ENC + : POOL32A_3RB0_FMT<"precrqu_s.qb.ph", 0b0101101101>; +class PRECRQ_RS_PH_W_MM_ENC : POOL32A_3RB0_FMT<"precrq_rs.ph.w", 0b0100101101>; +class LBUX_MM_ENC : POOL32A_1RMEMB0_FMT<"lbux", 0b1000100101>; +class LHX_MM_ENC : POOL32A_1RMEMB0_FMT<"lhx", 0b0101100101>; +class LWX_MM_ENC : POOL32A_1RMEMB0_FMT<"lwx", 0b0110100101>; +class MAQ_S_W_PHL_MM_ENC : POOL32A_2RAC_FMT<"maq_s.w.phl", 0b01101001>; +class MAQ_SA_W_PHL_MM_ENC : POOL32A_2RAC_FMT<"maq_sa.w.phl", 0b11101001>; +class MAQ_S_W_PHR_MM_ENC : POOL32A_2RAC_FMT<"maq_s.w.phr", 0b00101001>; +class MAQ_SA_W_PHR_MM_ENC : POOL32A_2RAC_FMT<"maq_sa.w.phr", 0b10101001>; +class MFHI_MM_ENC : POOL32A_1RAC_FMT<"mfhi", 0b00000001>; +class MFLO_MM_ENC : POOL32A_1RAC_FMT<"mflo", 0b01000001>; +class MTHI_MM_ENC : POOL32A_1RAC_FMT<"mthi", 0b10000001>; +class MTLO_MM_ENC : POOL32A_1RAC_FMT<"mthi", 0b11000001>; +class PREPEND_MMR2_ENC : POOL32A_2RSA5B0_FMT<"prepend", 0b1001010101>; +class RADDU_W_QB_MM_ENC : POOL32A_2R_FMT<"raddu.w.qb", 0b1111000100>; +class RDDSP_MM_ENC : POOL32A_1RMASK7_FMT<"rddsp", 0b00011001>; +class REPL_PH_MM_ENC : POOL32A_1RIMM10_FMT<"repl.ph", 0b0000111101>; +class REPL_QB_MM_ENC : POOL32A_1RIMM8_FMT<"repl.qb", 0b010111>; +class REPLV_PH_MM_ENC : POOL32A_2R_FMT<"replv.ph", 0b0000001100>; +class REPLV_QB_MM_ENC : POOL32A_2R_FMT<"replv.qb", 0b0001001100>; +class MTHLIP_MM_ENC : POOL32A_1RAC_FMT<"mthlip", 0b00001001>; // Instruction desc. class ABSQ_S_PH_MM_R2_DESC_BASE, Uses<[DSPPos]>, Defs<[DSPEFI]>; @@ -278,7 +308,52 @@ class EXTRV_S_H_MM_DESC : EXT_MM_2R_DESC_BASE<"extrv_s.h", MipsEXTR_S_H, NoItinerary>, Defs<[DSPOutFlag23]>; -// Instruction defs. 
+class MFHI_MM_DESC_BASE { + dag OutOperandList = (outs GPR32Opnd:$rs); + dag InOperandList = (ins RO:$ac); + string AsmString = !strconcat(instr_asm, "\t$rs, $ac"); + list Pattern = [(set GPR32Opnd:$rs, (OpNode RO:$ac))]; + InstrItinClass Itinerary = itin; +} + +class MFHI_MM_DESC : MFHI_MM_DESC_BASE<"mfhi", ACC64DSPOpnd, MipsMFHI, + NoItinerary>; +class MFLO_MM_DESC : MFHI_MM_DESC_BASE<"mflo", ACC64DSPOpnd, MipsMFLO, + NoItinerary>; + +class RADDU_W_QB_MM_DESC { + dag OutOperandList = (outs GPR32Opnd:$rt); + dag InOperandList = (ins DSPROpnd:$rs); + string AsmString = !strconcat("raddu.w.qb", "\t$rt, $rs"); + list Pattern = [(set GPR32Opnd:$rt, (int_mips_raddu_w_qb DSPROpnd:$rs))]; + InstrItinClass Itinerary = NoItinerary; + string BaseOpcode = "raddu.w.qb"; +} + +class RDDSP_MM_DESC { + dag OutOperandList = (outs GPR32Opnd:$rt); + dag InOperandList = (ins uimm16:$mask); + string AsmString = !strconcat("rddsp", "\t$rt, $mask"); + list Pattern = [(set GPR32Opnd:$rt, (int_mips_rddsp immZExt10:$mask))]; + InstrItinClass Itinerary = NoItinerary; +} + +class REPL_QB_MM_DESC { + dag OutOperandList = (outs DSPROpnd:$rt); + dag InOperandList = (ins uimm16:$imm); + string AsmString = !strconcat("repl.qb", "\t$rt, $imm"); + list Pattern = [(set DSPROpnd:$rt, (int_mips_repl_qb immZExt8:$imm))]; + InstrItinClass Itinerary = NoItinerary; +} + +class REPLV_PH_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE<"replv.ph", int_mips_repl_ph, + NoItinerary, DSPROpnd, + GPR32Opnd>; +class REPLV_QB_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE<"replv.qb", int_mips_repl_qb, + NoItinerary, DSPROpnd, + GPR32Opnd>; + // microMIPS DSP Rev 1 def ADDQ_PH_MM : DspMMRel, ADDQ_PH_MM_ENC, ADDQ_PH_DESC; def ADDQ_S_PH_MM : DspMMRel, ADDQ_S_PH_MM_ENC, ADDQ_S_PH_DESC; @@ -354,6 +429,28 @@ def MULEQ_S_W_PHR_MM : DspMMRel, MULEQ_S_W_PHR_MM_ENC, MULEQ_S_W_PHR_DESC; def MULEU_S_PH_QBL_MM : DspMMRel, MULEU_S_PH_QBL_MM_ENC, MULEU_S_PH_QBL_DESC; def MULEU_S_PH_QBR_MM : DspMMRel, MULEU_S_PH_QBR_MM_ENC, MULEU_S_PH_QBR_DESC; def MULQ_RS_PH_MM : DspMMRel, MULQ_RS_PH_MM_ENC, MULQ_RS_PH_DESC; +def PRECRQ_PH_W_MM : DspMMRel, PRECRQ_PH_W_MM_ENC, PRECRQ_PH_W_DESC; +def PRECRQ_QB_PH_MM : DspMMRel, PRECRQ_QB_PH_MM_ENC, PRECRQ_QB_PH_DESC; +def PRECRQU_S_QB_PH_MM : DspMMRel, PRECRQU_S_QB_PH_MM_ENC, PRECRQU_S_QB_PH_DESC; +def PRECRQ_RS_PH_W_MM : DspMMRel, PRECRQ_RS_PH_W_MM_ENC, PRECRQ_RS_PH_W_DESC; +def LBUX_MM : DspMMRel, LBUX_MM_ENC, LBUX_DESC; +def LHX_MM : DspMMRel, LHX_MM_ENC, LHX_DESC; +def LWX_MM : DspMMRel, LWX_MM_ENC, LWX_DESC; +def MAQ_S_W_PHL_MM : DspMMRel, MAQ_S_W_PHL_MM_ENC, MAQ_S_W_PHL_DESC; +def MAQ_SA_W_PHL_MM : DspMMRel, MAQ_SA_W_PHL_MM_ENC, MAQ_SA_W_PHL_DESC; +def MAQ_S_W_PHR_MM : DspMMRel, MAQ_S_W_PHR_MM_ENC, MAQ_S_W_PHR_DESC; +def MAQ_SA_W_PHR_MM : DspMMRel, MAQ_SA_W_PHR_MM_ENC, MAQ_SA_W_PHR_DESC; +def MFHI_DSP_MM : DspMMRel, MFHI_MM_ENC, MFHI_MM_DESC; +def MFLO_DSP_MM : DspMMRel, MFLO_MM_ENC, MFLO_MM_DESC; +def MTHI_DSP_MM : DspMMRel, MTHI_MM_ENC, MTHI_DESC; +def MTLO_DSP_MM : DspMMRel, MTLO_MM_ENC, MTLO_DESC; +def RADDU_W_QB_MM : DspMMRel, RADDU_W_QB_MM_ENC, RADDU_W_QB_MM_DESC; +def RDDSP_MM : DspMMRel, RDDSP_MM_ENC, RDDSP_MM_DESC; +def REPL_PH_MM : DspMMRel, REPL_PH_MM_ENC, REPL_PH_DESC; +def REPL_QB_MM : DspMMRel, REPL_QB_MM_ENC, REPL_QB_MM_DESC; +def REPLV_PH_MM : DspMMRel, REPLV_PH_MM_ENC, REPLV_PH_MM_DESC; +def REPLV_QB_MM : DspMMRel, REPLV_QB_MM_ENC, REPLV_QB_MM_DESC; +def MTHLIP_MM : DspMMRel, MTHLIP_MM_ENC, MTHLIP_DESC; // microMIPS DSP Rev 2 def ABSQ_S_QB_MMR2 : DspMMRel, ABSQ_S_QB_MMR2_ENC, ABSQ_S_QB_MMR2_DESC, ISA_DSPR2; @@ -398,3 
+495,10 @@ def MUL_S_PH_MMR2 : DspMMRel, MUL_S_PH_MMR2_ENC, MUL_S_PH_DESC, ISA_DSPR2; def MULQ_RS_W_MMR2 : DspMMRel, MULQ_RS_W_MMR2_ENC, MULQ_RS_W_DESC, ISA_DSPR2; def MULQ_S_PH_MMR2 : DspMMRel, MULQ_S_PH_MMR2_ENC, MULQ_S_PH_DESC, ISA_DSPR2; def MULQ_S_W_MMR2 : DspMMRel, MULQ_S_W_MMR2_ENC, MULQ_S_W_DESC, ISA_DSPR2; +def PRECR_QB_PH_MMR2 : DspMMRel, PRECR_QB_PH_MMR2_ENC, PRECR_QB_PH_DESC, + ISA_DSPR2; +def PRECR_SRA_PH_W_MMR2 : DspMMRel, PRECR_SRA_PH_W_MMR2_ENC, + PRECR_SRA_PH_W_DESC, ISA_DSPR2; +def PRECR_SRA_R_PH_W_MMR2 : DspMMRel, PRECR_SRA_R_PH_W_MMR2_ENC, + PRECR_SRA_R_PH_W_DESC, ISA_DSPR2; +def PREPEND_MMR2 : DspMMRel, PREPEND_MMR2_ENC, PREPEND_DESC, ISA_DSPR2; diff --git a/lib/Target/Mips/MicroMipsInstrFPU.td b/lib/Target/Mips/MicroMipsInstrFPU.td index 120a841c3d9d..756e6c92c1d1 100644 --- a/lib/Target/Mips/MicroMipsInstrFPU.td +++ b/lib/Target/Mips/MicroMipsInstrFPU.td @@ -43,7 +43,7 @@ def BC1T_MM : MMRel, BC1F_FT<"bc1t", brtarget_mm, II_BC1T, MIPS_BRANCH_T>, BC1F_FM_MM<0x1d>, ISA_MIPS1_NOT_32R6_64R6; def CVT_W_S_MM : MMRel, ABSS_FT<"cvt.w.s", FGR32Opnd, FGR32Opnd, II_CVT>, ROUND_W_FM_MM<0, 0x24>; -def ROUND_W_S_MM : MMRel, ABSS_FT<"round.w.s", FGR32Opnd, FGR32Opnd, II_ROUND>, +def ROUND_W_S_MM : MMRel, StdMMR6Rel, ABSS_FT<"round.w.s", FGR32Opnd, FGR32Opnd, II_ROUND>, ROUND_W_FM_MM<0, 0xec>; def CEIL_W_MM : MMRel, ABSS_FT<"ceil.w.d", FGR32Opnd, AFGR64Opnd, II_CEIL>, @@ -52,7 +52,7 @@ def CVT_W_MM : MMRel, ABSS_FT<"cvt.w.d", FGR32Opnd, AFGR64Opnd, II_CVT>, ROUND_W_FM_MM<1, 0x24>; def FLOOR_W_MM : MMRel, ABSS_FT<"floor.w.d", FGR32Opnd, AFGR64Opnd, II_FLOOR>, ROUND_W_FM_MM<1, 0x2c>; -def ROUND_W_MM : MMRel, ABSS_FT<"round.w.d", FGR32Opnd, AFGR64Opnd, II_ROUND>, +def ROUND_W_MM : MMRel, StdMMR6Rel, ABSS_FT<"round.w.d", FGR32Opnd, AFGR64Opnd, II_ROUND>, ROUND_W_FM_MM<1, 0xec>; def TRUNC_W_MM : MMRel, ABSS_FT<"trunc.w.d", FGR32Opnd, AFGR64Opnd, II_TRUNC>, ROUND_W_FM_MM<1, 0xac>; diff --git a/lib/Target/Mips/Mips32r6InstrInfo.td b/lib/Target/Mips/Mips32r6InstrInfo.td index 9dd4d1e034e9..c36a45acbf79 100644 --- a/lib/Target/Mips/Mips32r6InstrInfo.td +++ b/lib/Target/Mips/Mips32r6InstrInfo.td @@ -687,8 +687,10 @@ def BNEZC : BNEZC_ENC, BNEZC_DESC, ISA_MIPS32R6; def BNVC : BNVC_ENC, BNVC_DESC, ISA_MIPS32R6; def BOVC : BOVC_ENC, BOVC_DESC, ISA_MIPS32R6; def CACHE_R6 : R6MMR6Rel, CACHE_ENC, CACHE_DESC, ISA_MIPS32R6; -def CLASS_D : CLASS_D_ENC, CLASS_D_DESC, ISA_MIPS32R6, HARDFLOAT; -def CLASS_S : CLASS_S_ENC, CLASS_S_DESC, ISA_MIPS32R6, HARDFLOAT; +let AdditionalPredicates = [NotInMicroMips] in { + def CLASS_D : CLASS_D_ENC, CLASS_D_DESC, ISA_MIPS32R6, HARDFLOAT; + def CLASS_S : CLASS_S_ENC, CLASS_S_DESC, ISA_MIPS32R6, HARDFLOAT; +} def CLO_R6 : R6MMR6Rel, CLO_R6_ENC, CLO_R6_DESC, ISA_MIPS32R6; def CLZ_R6 : R6MMR6Rel, CLZ_R6_ENC, CLZ_R6_DESC, ISA_MIPS32R6; defm S : CMP_CC_M; @@ -707,14 +709,14 @@ def LWUPC : LWUPC_ENC, LWUPC_DESC, ISA_MIPS32R6; let AdditionalPredicates = [NotInMicroMips] in { def MADDF_S : MADDF_S_ENC, MADDF_S_DESC, ISA_MIPS32R6, HARDFLOAT; def MADDF_D : MADDF_D_ENC, MADDF_D_DESC, ISA_MIPS32R6, HARDFLOAT; - def MAXA_D : MAXA_D_ENC, MAXA_D_DESC, ISA_MIPS32R6, HARDFLOAT; - def MAXA_S : MAXA_S_ENC, MAXA_S_DESC, ISA_MIPS32R6, HARDFLOAT; - def MAX_D : MAX_D_ENC, MAX_D_DESC, ISA_MIPS32R6, HARDFLOAT; - def MAX_S : MAX_S_ENC, MAX_S_DESC, ISA_MIPS32R6, HARDFLOAT; - def MINA_D : MINA_D_ENC, MINA_D_DESC, ISA_MIPS32R6, HARDFLOAT; - def MINA_S : MINA_S_ENC, MINA_S_DESC, ISA_MIPS32R6, HARDFLOAT; - def MIN_D : MIN_D_ENC, MIN_D_DESC, ISA_MIPS32R6, HARDFLOAT; - def MIN_S : MIN_S_ENC, 
MIN_S_DESC, ISA_MIPS32R6, HARDFLOAT; + def MAXA_D : MAXA_D_ENC, MAXA_D_DESC, ISA_MIPS32R6, HARDFLOAT; + def MAXA_S : MAXA_S_ENC, MAXA_S_DESC, ISA_MIPS32R6, HARDFLOAT; + def MAX_D : MAX_D_ENC, MAX_D_DESC, ISA_MIPS32R6, HARDFLOAT; + def MAX_S : MAX_S_ENC, MAX_S_DESC, ISA_MIPS32R6, HARDFLOAT; + def MINA_D : MINA_D_ENC, MINA_D_DESC, ISA_MIPS32R6, HARDFLOAT; + def MINA_S : MINA_S_ENC, MINA_S_DESC, ISA_MIPS32R6, HARDFLOAT; + def MIN_D : MIN_D_ENC, MIN_D_DESC, ISA_MIPS32R6, HARDFLOAT; + def MIN_S : MIN_S_ENC, MIN_S_DESC, ISA_MIPS32R6, HARDFLOAT; } def MOD : R6MMR6Rel, MOD_ENC, MOD_DESC, ISA_MIPS32R6; def MODU : R6MMR6Rel, MODU_ENC, MODU_DESC, ISA_MIPS32R6; @@ -728,21 +730,27 @@ def MUL_R6 : R6MMR6Rel, MUL_R6_ENC, MUL_R6_DESC, ISA_MIPS32R6; def MULU : R6MMR6Rel, MULU_ENC, MULU_DESC, ISA_MIPS32R6; def NAL; // BAL with rd=0 def PREF_R6 : R6MMR6Rel, PREF_ENC, PREF_DESC, ISA_MIPS32R6; -def RINT_D : RINT_D_ENC, RINT_D_DESC, ISA_MIPS32R6, HARDFLOAT; -def RINT_S : RINT_S_ENC, RINT_S_DESC, ISA_MIPS32R6, HARDFLOAT; +let AdditionalPredicates = [NotInMicroMips] in { + def RINT_D : RINT_D_ENC, RINT_D_DESC, ISA_MIPS32R6, HARDFLOAT; + def RINT_S : RINT_S_ENC, RINT_S_DESC, ISA_MIPS32R6, HARDFLOAT; +} def SC_R6 : SC_R6_ENC, SC_R6_DESC, ISA_MIPS32R6; let AdditionalPredicates = [NotInMicroMips] in { def SDBBP_R6 : SDBBP_R6_ENC, SDBBP_R6_DESC, ISA_MIPS32R6; } def SDC2_R6 : SDC2_R6_ENC, SDC2_R6_DESC, ISA_MIPS32R6; def SELEQZ : R6MMR6Rel, SELEQZ_ENC, SELEQZ_DESC, ISA_MIPS32R6, GPR_32; -def SELEQZ_D : SELEQZ_D_ENC, SELEQZ_D_DESC, ISA_MIPS32R6, HARDFLOAT; -def SELEQZ_S : SELEQZ_S_ENC, SELEQZ_S_DESC, ISA_MIPS32R6, HARDFLOAT; +let AdditionalPredicates = [NotInMicroMips] in { + def SELEQZ_D : SELEQZ_D_ENC, SELEQZ_D_DESC, ISA_MIPS32R6, HARDFLOAT; + def SELEQZ_S : SELEQZ_S_ENC, SELEQZ_S_DESC, ISA_MIPS32R6, HARDFLOAT; +} def SELNEZ : R6MMR6Rel, SELNEZ_ENC, SELNEZ_DESC, ISA_MIPS32R6, GPR_32; -def SELNEZ_D : SELNEZ_D_ENC, SELNEZ_D_DESC, ISA_MIPS32R6, HARDFLOAT; -def SELNEZ_S : SELNEZ_S_ENC, SELNEZ_S_DESC, ISA_MIPS32R6, HARDFLOAT; -def SEL_D : SEL_D_ENC, SEL_D_DESC, ISA_MIPS32R6, HARDFLOAT; -def SEL_S : SEL_S_ENC, SEL_S_DESC, ISA_MIPS32R6, HARDFLOAT; +let AdditionalPredicates = [NotInMicroMips] in { + def SELNEZ_D : SELNEZ_D_ENC, SELNEZ_D_DESC, ISA_MIPS32R6, HARDFLOAT; + def SELNEZ_S : SELNEZ_S_ENC, SELNEZ_S_DESC, ISA_MIPS32R6, HARDFLOAT; + def SEL_D : SEL_D_ENC, SEL_D_DESC, ISA_MIPS32R6, HARDFLOAT; + def SEL_S : SEL_S_ENC, SEL_S_DESC, ISA_MIPS32R6, HARDFLOAT; +} def SWC2_R6 : SWC2_R6_ENC, SWC2_R6_DESC, ISA_MIPS32R6; //===----------------------------------------------------------------------===// diff --git a/lib/Target/Mips/MipsDSPInstrInfo.td b/lib/Target/Mips/MipsDSPInstrInfo.td index 9b4b9d178183..f696a38ac0f0 100644 --- a/lib/Target/Mips/MipsDSPInstrInfo.td +++ b/lib/Target/Mips/MipsDSPInstrInfo.td @@ -275,6 +275,7 @@ class RADDU_W_QB_DESC_BASE Pattern = [(set ROD:$rd, (OpNode ROS:$rs))]; InstrItinClass Itinerary = itin; + string BaseOpcode = instr_asm; } class CMP_EQ_QB_R2_DESC_BASE Pattern = [(set ROD:$rd, (OpNode ROS:$rs, ROT:$rt))]; InstrItinClass Itinerary = itin; + string BaseOpcode = instr_asm; } class PRECR_SRA_PH_W_DESC_BASE Pattern = [(set ROT:$rt, (OpNode ROS:$src, ROS:$rs, immZExt5:$sa))]; InstrItinClass Itinerary = itin; string Constraints = "$src = $rt"; + string BaseOpcode = instr_asm; } class ABSQ_S_PH_R2_DESC_BASE Pattern = [(set RO:$rd, (OpNode immPat:$imm))]; InstrItinClass Itinerary = itin; + string BaseOpcode = instr_asm; } class SHLL_QB_R3_DESC_BASE Pattern = [(set GPR32Opnd:$rd, (OpNode iPTR:$base, 
iPTR:$index))]; InstrItinClass Itinerary = itin; bit mayLoad = 1; + string BaseOpcode = instr_asm; } class ADDUH_QB_DESC_BASE { list Pattern = [(set ACC64DSPOpnd:$ac, (OpNode GPR32Opnd:$rs, ACC64DSPOpnd:$acin))]; string Constraints = "$acin = $ac"; + string BaseOpcode = instr_asm; } class RDDSP_DESC_BASE Pattern = [(set GPR32Opnd:$rd, (OpNode immZExt10:$mask))]; InstrItinClass Itinerary = itin; + string BaseOpcode = instr_asm; } class WRDSP_DESC_BASE Pattern = [(set GPR32Opnd:$rd, (OpNode RO:$ac))]; InstrItinClass Itinerary = itin; + string BaseOpcode = instr_asm; } class MTHI_DESC_BASE { @@ -492,6 +501,7 @@ class MTHI_DESC_BASE dag InOperandList = (ins GPR32Opnd:$rs); string AsmString = !strconcat(instr_asm, "\t$rs, $ac"); InstrItinClass Itinerary = itin; + string BaseOpcode = instr_asm; } class BPOSGE32_PSEUDO_DESC_BASE : @@ -1102,13 +1112,13 @@ def SUBQ_S_W : DspMMRel, SUBQ_S_W_ENC, SUBQ_S_W_DESC; def ADDSC : DspMMRel, ADDSC_ENC, ADDSC_DESC; def ADDWC : DspMMRel, ADDWC_ENC, ADDWC_DESC; def MODSUB : MODSUB_ENC, MODSUB_DESC; -def RADDU_W_QB : RADDU_W_QB_ENC, RADDU_W_QB_DESC; +def RADDU_W_QB : DspMMRel, RADDU_W_QB_ENC, RADDU_W_QB_DESC; def ABSQ_S_PH : DspMMRel, ABSQ_S_PH_ENC, ABSQ_S_PH_DESC; def ABSQ_S_W : DspMMRel, ABSQ_S_W_ENC, ABSQ_S_W_DESC; -def PRECRQ_QB_PH : PRECRQ_QB_PH_ENC, PRECRQ_QB_PH_DESC; -def PRECRQ_PH_W : PRECRQ_PH_W_ENC, PRECRQ_PH_W_DESC; -def PRECRQ_RS_PH_W : PRECRQ_RS_PH_W_ENC, PRECRQ_RS_PH_W_DESC; -def PRECRQU_S_QB_PH : PRECRQU_S_QB_PH_ENC, PRECRQU_S_QB_PH_DESC; +def PRECRQ_QB_PH : DspMMRel, PRECRQ_QB_PH_ENC, PRECRQ_QB_PH_DESC; +def PRECRQ_PH_W : DspMMRel, PRECRQ_PH_W_ENC, PRECRQ_PH_W_DESC; +def PRECRQ_RS_PH_W : DspMMRel, PRECRQ_RS_PH_W_ENC, PRECRQ_RS_PH_W_DESC; +def PRECRQU_S_QB_PH : DspMMRel, PRECRQU_S_QB_PH_ENC, PRECRQU_S_QB_PH_DESC; def PRECEQ_W_PHL : DspMMRel, PRECEQ_W_PHL_ENC, PRECEQ_W_PHL_DESC; def PRECEQ_W_PHR : DspMMRel, PRECEQ_W_PHR_ENC, PRECEQ_W_PHR_DESC; def PRECEQU_PH_QBL : DspMMRel, PRECEQU_PH_QBL_ENC, PRECEQU_PH_QBL_DESC; @@ -1141,14 +1151,14 @@ def MULEQ_S_W_PHL : DspMMRel, MULEQ_S_W_PHL_ENC, MULEQ_S_W_PHL_DESC; def MULEQ_S_W_PHR : DspMMRel, MULEQ_S_W_PHR_ENC, MULEQ_S_W_PHR_DESC; def MULQ_RS_PH : DspMMRel, MULQ_RS_PH_ENC, MULQ_RS_PH_DESC; def MULSAQ_S_W_PH : MULSAQ_S_W_PH_ENC, MULSAQ_S_W_PH_DESC; -def MAQ_S_W_PHL : MAQ_S_W_PHL_ENC, MAQ_S_W_PHL_DESC; -def MAQ_S_W_PHR : MAQ_S_W_PHR_ENC, MAQ_S_W_PHR_DESC; -def MAQ_SA_W_PHL : MAQ_SA_W_PHL_ENC, MAQ_SA_W_PHL_DESC; -def MAQ_SA_W_PHR : MAQ_SA_W_PHR_ENC, MAQ_SA_W_PHR_DESC; -def MFHI_DSP : MFHI_ENC, MFHI_DESC; -def MFLO_DSP : MFLO_ENC, MFLO_DESC; -def MTHI_DSP : MTHI_ENC, MTHI_DESC; -def MTLO_DSP : MTLO_ENC, MTLO_DESC; +def MAQ_S_W_PHL : DspMMRel, MAQ_S_W_PHL_ENC, MAQ_S_W_PHL_DESC; +def MAQ_S_W_PHR : DspMMRel, MAQ_S_W_PHR_ENC, MAQ_S_W_PHR_DESC; +def MAQ_SA_W_PHL : DspMMRel, MAQ_SA_W_PHL_ENC, MAQ_SA_W_PHL_DESC; +def MAQ_SA_W_PHR : DspMMRel, MAQ_SA_W_PHR_ENC, MAQ_SA_W_PHR_DESC; +def MFHI_DSP : DspMMRel, MFHI_ENC, MFHI_DESC; +def MFLO_DSP : DspMMRel, MFLO_ENC, MFLO_DESC; +def MTHI_DSP : DspMMRel, MTHI_ENC, MTHI_DESC; +def MTLO_DSP : DspMMRel, MTLO_ENC, MTLO_DESC; def DPAU_H_QBL : DspMMRel, DPAU_H_QBL_ENC, DPAU_H_QBL_DESC; def DPAU_H_QBR : DspMMRel, DPAU_H_QBR_ENC, DPAU_H_QBR_DESC; def DPSU_H_QBL : DspMMRel, DPSU_H_QBL_ENC, DPSU_H_QBL_DESC; @@ -1174,15 +1184,15 @@ def CMP_LT_PH : CMP_LT_PH_ENC, CMP_LT_PH_DESC; def CMP_LE_PH : CMP_LE_PH_ENC, CMP_LE_PH_DESC; def BITREV : BITREV_ENC, BITREV_DESC; def PACKRL_PH : PACKRL_PH_ENC, PACKRL_PH_DESC; -def REPL_QB : REPL_QB_ENC, REPL_QB_DESC; -def REPL_PH : REPL_PH_ENC, 
REPL_PH_DESC; -def REPLV_QB : REPLV_QB_ENC, REPLV_QB_DESC; -def REPLV_PH : REPLV_PH_ENC, REPLV_PH_DESC; +def REPL_QB : DspMMRel, REPL_QB_ENC, REPL_QB_DESC; +def REPL_PH : DspMMRel, REPL_PH_ENC, REPL_PH_DESC; +def REPLV_QB : DspMMRel, REPLV_QB_ENC, REPLV_QB_DESC; +def REPLV_PH : DspMMRel, REPLV_PH_ENC, REPLV_PH_DESC; def PICK_QB : PICK_QB_ENC, PICK_QB_DESC; def PICK_PH : PICK_PH_ENC, PICK_PH_DESC; -def LWX : LWX_ENC, LWX_DESC; -def LHX : LHX_ENC, LHX_DESC; -def LBUX : LBUX_ENC, LBUX_DESC; +def LWX : DspMMRel, LWX_ENC, LWX_DESC; +def LHX : DspMMRel, LHX_ENC, LHX_DESC; +def LBUX : DspMMRel, LBUX_ENC, LBUX_DESC; def BPOSGE32 : BPOSGE32_ENC, BPOSGE32_DESC; def INSV : DspMMRel, INSV_ENC, INSV_DESC; def EXTP : DspMMRel, EXTP_ENC, EXTP_DESC; @@ -1199,8 +1209,8 @@ def EXTR_S_H : DspMMRel, EXTR_S_H_ENC, EXTR_S_H_DESC; def EXTRV_S_H : DspMMRel, EXTRV_S_H_ENC, EXTRV_S_H_DESC; def SHILO : SHILO_ENC, SHILO_DESC; def SHILOV : SHILOV_ENC, SHILOV_DESC; -def MTHLIP : MTHLIP_ENC, MTHLIP_DESC; -def RDDSP : RDDSP_ENC, RDDSP_DESC; +def MTHLIP : DspMMRel, MTHLIP_ENC, MTHLIP_DESC; +def RDDSP : DspMMRel, RDDSP_ENC, RDDSP_DESC; def WRDSP : WRDSP_ENC, WRDSP_DESC; // MIPS DSP Rev 2 @@ -1240,9 +1250,9 @@ def DPSX_W_PH : DspMMRel, DPSX_W_PH_ENC, DPSX_W_PH_DESC; def DPSQX_S_W_PH : DspMMRel, DPSQX_S_W_PH_ENC, DPSQX_S_W_PH_DESC; def DPSQX_SA_W_PH : DspMMRel, DPSQX_SA_W_PH_ENC, DPSQX_SA_W_PH_DESC; def MULSA_W_PH : MULSA_W_PH_ENC, MULSA_W_PH_DESC; -def PRECR_QB_PH : PRECR_QB_PH_ENC, PRECR_QB_PH_DESC; -def PRECR_SRA_PH_W : PRECR_SRA_PH_W_ENC, PRECR_SRA_PH_W_DESC; -def PRECR_SRA_R_PH_W : PRECR_SRA_R_PH_W_ENC, PRECR_SRA_R_PH_W_DESC; +def PRECR_QB_PH : DspMMRel, PRECR_QB_PH_ENC, PRECR_QB_PH_DESC; +def PRECR_SRA_PH_W : DspMMRel, PRECR_SRA_PH_W_ENC, PRECR_SRA_PH_W_DESC; +def PRECR_SRA_R_PH_W : DspMMRel, PRECR_SRA_R_PH_W_ENC, PRECR_SRA_R_PH_W_DESC; def SHRA_QB : DspMMRel, SHRA_QB_ENC, SHRA_QB_DESC; def SHRAV_QB : DspMMRel, SHRAV_QB_ENC, SHRAV_QB_DESC; def SHRA_R_QB : DspMMRel, SHRA_R_QB_ENC, SHRA_R_QB_DESC; @@ -1251,7 +1261,7 @@ def SHRL_PH : DspMMRel, SHRL_PH_ENC, SHRL_PH_DESC; def SHRLV_PH : DspMMRel, SHRLV_PH_ENC, SHRLV_PH_DESC; def APPEND : APPEND_ENC, APPEND_DESC; def BALIGN : BALIGN_ENC, BALIGN_DESC; -def PREPEND : PREPEND_ENC, PREPEND_DESC; +def PREPEND : DspMMRel, PREPEND_ENC, PREPEND_DESC; } diff --git a/lib/Target/Mips/MipsInstrFPU.td b/lib/Target/Mips/MipsInstrFPU.td index d1a724944335..377260f89d10 100644 --- a/lib/Target/Mips/MipsInstrFPU.td +++ b/lib/Target/Mips/MipsInstrFPU.td @@ -136,7 +136,7 @@ multiclass ABSS_M { def _D32 : MMRel, ABSS_FT, FGR_32; - def _D64 : ABSS_FT, FGR_64 { + def _D64 : StdMMR6Rel, ABSS_FT, FGR_64 { let DecoderNamespace = "Mips64"; } } @@ -267,31 +267,29 @@ defm D64 : C_COND_M<"d", FGR64Opnd, 17, II_C_CC_D>, ISA_MIPS1_NOT_32R6_64R6, //===----------------------------------------------------------------------===// // Floating Point Instructions //===----------------------------------------------------------------------===// -def ROUND_W_S : MMRel, ABSS_FT<"round.w.s", FGR32Opnd, FGR32Opnd, II_ROUND>, +def ROUND_W_S : MMRel, StdMMR6Rel, ABSS_FT<"round.w.s", FGR32Opnd, FGR32Opnd, II_ROUND>, ABSS_FM<0xc, 16>, ISA_MIPS2; -let AdditionalPredicates = [NotInMicroMips] in { +defm ROUND_W : ROUND_M<"round.w.d", II_ROUND>, ABSS_FM<0xc, 17>, ISA_MIPS2; def TRUNC_W_S : MMRel, StdMMR6Rel, ABSS_FT<"trunc.w.s", FGR32Opnd, FGR32Opnd, II_TRUNC>, ABSS_FM<0xd, 16>, ISA_MIPS2; def CEIL_W_S : MMRel, StdMMR6Rel, ABSS_FT<"ceil.w.s", FGR32Opnd, FGR32Opnd, II_CEIL>, ABSS_FM<0xe, 16>, ISA_MIPS2; def FLOOR_W_S : MMRel, 
StdMMR6Rel, ABSS_FT<"floor.w.s", FGR32Opnd, FGR32Opnd, II_FLOOR>, ABSS_FM<0xf, 16>, ISA_MIPS2; -} def CVT_W_S : MMRel, ABSS_FT<"cvt.w.s", FGR32Opnd, FGR32Opnd, II_CVT>, ABSS_FM<0x24, 16>; -defm ROUND_W : ROUND_M<"round.w.d", II_ROUND>, ABSS_FM<0xc, 17>, ISA_MIPS2; defm TRUNC_W : ROUND_M<"trunc.w.d", II_TRUNC>, ABSS_FM<0xd, 17>, ISA_MIPS2; defm CEIL_W : ROUND_M<"ceil.w.d", II_CEIL>, ABSS_FM<0xe, 17>, ISA_MIPS2; defm FLOOR_W : ROUND_M<"floor.w.d", II_FLOOR>, ABSS_FM<0xf, 17>, ISA_MIPS2; defm CVT_W : ROUND_M<"cvt.w.d", II_CVT>, ABSS_FM<0x24, 17>; let DecoderNamespace = "Mips64" in { + let AdditionalPredicates = [NotInMicroMips] in { def ROUND_L_S : ABSS_FT<"round.l.s", FGR64Opnd, FGR32Opnd, II_ROUND>, ABSS_FM<0x8, 16>, FGR_64; def ROUND_L_D64 : ABSS_FT<"round.l.d", FGR64Opnd, FGR64Opnd, II_ROUND>, ABSS_FM<0x8, 17>, FGR_64; - let AdditionalPredicates = [NotInMicroMips] in { def TRUNC_L_S : ABSS_FT<"trunc.l.s", FGR64Opnd, FGR32Opnd, II_TRUNC>, ABSS_FM<0x9, 16>, FGR_64; def TRUNC_L_D64 : ABSS_FT<"trunc.l.d", FGR64Opnd, FGR64Opnd, II_TRUNC>, diff --git a/lib/Target/Mips/MipsLongBranch.cpp b/lib/Target/Mips/MipsLongBranch.cpp index d09843ed0e53..e75858a181e5 100644 --- a/lib/Target/Mips/MipsLongBranch.cpp +++ b/lib/Target/Mips/MipsLongBranch.cpp @@ -262,8 +262,7 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) { static_cast(Subtarget.getInstrInfo()); MF->insert(FallThroughMBB, LongBrMBB); - MBB->removeSuccessor(TgtMBB); - MBB->addSuccessor(LongBrMBB); + MBB->replaceSuccessor(TgtMBB, LongBrMBB); if (IsPIC) { MachineBasicBlock *BalTgtMBB = MF->CreateMachineBasicBlock(BB); diff --git a/lib/Target/PowerPC/PPCFrameLowering.cpp b/lib/Target/PowerPC/PPCFrameLowering.cpp index c7a3bbd3762a..174deb88bc5c 100644 --- a/lib/Target/PowerPC/PPCFrameLowering.cpp +++ b/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -569,14 +569,21 @@ bool PPCFrameLowering::findScratchRegister(MachineBasicBlock *MBB, if ((UseAtEnd && MBB->isReturnBlock()) || (!UseAtEnd && (&MBB->getParent()->front() == MBB))) return true; - - RS.initRegState(); + RS.enterBasicBlock(MBB); - // The scratch register will be used at the end of the block, so must consider - // all registers used within the block - if (UseAtEnd && MBB->begin() != MBB->getFirstTerminator()) - RS.forward(MBB->getFirstTerminator()); + if (UseAtEnd && !MBB->empty()) { + // The scratch register will be used at the end of the block, so must consider + // all registers used within the block + + MachineBasicBlock::iterator MBBI = MBB->getFirstTerminator(); + // If no terminator, back iterator up to previous instruction. + if (MBBI == MBB->end()) + MBBI = std::prev(MBBI); + + if (MBBI != MBB->begin()) + RS.forward(MBBI); + } if (!RS.isRegUsed(R0)) return true; @@ -1768,6 +1775,6 @@ PPCFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, } bool PPCFrameLowering::enableShrinkWrapping(const MachineFunction &MF) const { - // FIXME: Enable this for non-Darwin PPC64 once it is confirmed working. - return false; + return (MF.getSubtarget().isSVR4ABI() && + MF.getSubtarget().isPPC64()); } diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 438a2980f2ce..2261b71c5aa9 100644 --- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -2780,7 +2780,7 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { if (PPCSubTarget->hasVSX() && (N->getValueType(0) == MVT::v2f64 || N->getValueType(0) == MVT::v2i64)) { ShuffleVectorSDNode *SVN = cast(N); - + SDValue Op1 = N->getOperand(SVN->getMaskElt(0) < 2 ? 
0 : 1), Op2 = N->getOperand(SVN->getMaskElt(1) < 2 ? 0 : 1); unsigned DM[2]; @@ -3106,7 +3106,7 @@ SDValue PPCDAGToDAGISel::combineToCMPB(SDNode *N) { if (!CurDAG->MaskedValueIsZero(Op0, APInt::getHighBitsSet(Bits, Bits - (b+1)*8))) return false; - + LHS = Op0.getOperand(0); RHS = Op0.getOperand(1); return true; diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index 1b1e0cf57865..176a8b3ea59b 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -329,6 +329,8 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::STACKRESTORE , MVT::Other, Custom); setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32 , Custom); setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64 , Custom); + setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, MVT::i32, Custom); + setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, MVT::i64, Custom); // We want to custom lower some of our intrinsics. setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); @@ -477,6 +479,8 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand); setOperationAction(ISD::VSELECT, VT, Expand); setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand); + setOperationAction(ISD::ROTL, VT, Expand); + setOperationAction(ISD::ROTR, VT, Expand); for (MVT InnerVT : MVT::vector_valuetypes()) { setTruncStoreAction(VT, InnerVT, Expand); @@ -998,6 +1002,7 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { case PPCISD::Lo: return "PPCISD::Lo"; case PPCISD::TOC_ENTRY: return "PPCISD::TOC_ENTRY"; case PPCISD::DYNALLOC: return "PPCISD::DYNALLOC"; + case PPCISD::DYNAREAOFFSET: return "PPCISD::DYNAREAOFFSET"; case PPCISD::GlobalBaseReg: return "PPCISD::GlobalBaseReg"; case PPCISD::SRL: return "PPCISD::SRL"; case PPCISD::SRA: return "PPCISD::SRA"; @@ -5808,6 +5813,22 @@ PPCTargetLowering::LowerReturn(SDValue Chain, return DAG.getNode(PPCISD::RET_FLAG, dl, MVT::Other, RetOps); } +SDValue PPCTargetLowering::LowerGET_DYNAMIC_AREA_OFFSET( + SDValue Op, SelectionDAG &DAG, const PPCSubtarget &Subtarget) const { + SDLoc dl(Op); + + // Get the corect type for integers. + EVT IntVT = Op.getValueType(); + + // Get the inputs. + SDValue Chain = Op.getOperand(0); + SDValue FPSIdx = getFramePointerFrameIndex(DAG); + // Build a DYNAREAOFFSET node. + SDValue Ops[2] = {Chain, FPSIdx}; + SDVTList VTs = DAG.getVTList(IntVT); + return DAG.getNode(PPCISD::DYNAREAOFFSET, dl, VTs, Ops); +} + SDValue PPCTargetLowering::LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG, const PPCSubtarget &Subtarget) const { // When we pop the dynamic allocation we need to restore the SP link. @@ -7938,6 +7959,7 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::STACKRESTORE: return LowerSTACKRESTORE(Op, DAG, Subtarget); case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG, Subtarget); + case ISD::GET_DYNAMIC_AREA_OFFSET: return LowerGET_DYNAMIC_AREA_OFFSET(Op, DAG, Subtarget); case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG); case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG); diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h index 415c47c286e3..c0aafbac1aa0 100644 --- a/lib/Target/PowerPC/PPCISelLowering.h +++ b/lib/Target/PowerPC/PPCISelLowering.h @@ -79,6 +79,11 @@ namespace llvm { /// compute an allocation on the stack. 
DYNALLOC, + /// This instruction is lowered in PPCRegisterInfo::eliminateFrameIndex to + /// compute an offset from native SP to the address of the most recent + /// dynamic alloca. + DYNAREAOFFSET, + /// GlobalBaseReg - On Darwin, this node represents the result of the mflr /// at function entry, used for PIC code. GlobalBaseReg, @@ -728,6 +733,8 @@ namespace llvm { const PPCSubtarget &Subtarget) const; SDValue LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG, const PPCSubtarget &Subtarget) const; + SDValue LowerGET_DYNAMIC_AREA_OFFSET(SDValue Op, SelectionDAG &DAG, + const PPCSubtarget &Subtarget) const; SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG, const PPCSubtarget &Subtarget) const; SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td index d62833037db5..075e093e41a1 100644 --- a/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/lib/Target/PowerPC/PPCInstr64Bit.td @@ -369,6 +369,8 @@ let Defs = [X1], Uses = [X1] in def DYNALLOC8 : Pseudo<(outs g8rc:$result), (ins g8rc:$negsize, memri:$fpsi),"#DYNALLOC8", [(set i64:$result, (PPCdynalloc i64:$negsize, iaddr:$fpsi))]>; +def DYNAREAOFFSET8 : Pseudo<(outs i64imm:$result), (ins memri:$fpsi), "#DYNAREAOFFSET8", + [(set i64:$result, (PPCdynareaoffset iaddr:$fpsi))]>; let Defs = [LR8] in { def MTLR8 : XFXForm_7_ext<31, 467, 8, (outs), (ins g8rc:$rS), diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td index cc1af1a7132f..6c4364aad331 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.td +++ b/lib/Target/PowerPC/PPCInstrInfo.td @@ -226,7 +226,9 @@ def PPCcr6unset : SDNode<"PPCISD::CR6UNSET", SDTNone, // Instructions to support dynamic alloca. def SDTDynOp : SDTypeProfile<1, 2, []>; +def SDTDynAreaOp : SDTypeProfile<1, 1, []>; def PPCdynalloc : SDNode<"PPCISD::DYNALLOC", SDTDynOp, [SDNPHasChain]>; +def PPCdynareaoffset : SDNode<"PPCISD::DYNAREAOFFSET", SDTDynAreaOp, [SDNPHasChain]>; //===----------------------------------------------------------------------===// // PowerPC specific transformation functions and pattern fragments. @@ -1029,6 +1031,8 @@ let Defs = [R1], Uses = [R1] in def DYNALLOC : Pseudo<(outs gprc:$result), (ins gprc:$negsize, memri:$fpsi), "#DYNALLOC", [(set i32:$result, (PPCdynalloc i32:$negsize, iaddr:$fpsi))]>; +def DYNAREAOFFSET : Pseudo<(outs i32imm:$result), (ins memri:$fpsi), "#DYNAREAOFFSET", + [(set i32:$result, (PPCdynareaoffset iaddr:$fpsi))]>; // SELECT_CC_* - Used to implement the SELECT_CC DAG operation. Expanded after // instruction selection into a branch sequence. diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp index 6d53f876c062..934bdf622418 100644 --- a/lib/Target/PowerPC/PPCRegisterInfo.cpp +++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -430,6 +430,27 @@ void PPCRegisterInfo::lowerDynamicAlloc(MachineBasicBlock::iterator II) const { MBB.erase(II); } +void PPCRegisterInfo::lowerDynamicAreaOffset( + MachineBasicBlock::iterator II) const { + // Get the instruction. + MachineInstr &MI = *II; + // Get the instruction's basic block. + MachineBasicBlock &MBB = *MI.getParent(); + // Get the basic block's function. + MachineFunction &MF = *MBB.getParent(); + // Get the frame info. + MachineFrameInfo *MFI = MF.getFrameInfo(); + const PPCSubtarget &Subtarget = MF.getSubtarget(); + // Get the instruction info. 
+ const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); + + unsigned maxCallFrameSize = MFI->getMaxCallFrameSize(); + DebugLoc dl = MI.getDebugLoc(); + BuildMI(MBB, II, dl, TII.get(PPC::LI), MI.getOperand(0).getReg()) + .addImm(maxCallFrameSize); + MBB.erase(II); +} + /// lowerCRSpilling - Generate the code for spilling a CR register. Instead of /// reserving a whole register (R0), we scrounge for one here. This generates /// code like this: @@ -754,6 +775,11 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // Get the instruction opcode. unsigned OpC = MI.getOpcode(); + if ((OpC == PPC::DYNAREAOFFSET || OpC == PPC::DYNAREAOFFSET8)) { + lowerDynamicAreaOffset(II); + return; + } + // Special case for dynamic alloca. if (FPSI && FrameIndex == FPSI && (OpC == PPC::DYNALLOC || OpC == PPC::DYNALLOC8)) { diff --git a/lib/Target/PowerPC/PPCRegisterInfo.h b/lib/Target/PowerPC/PPCRegisterInfo.h index 1b1e160d836c..b15fde83c9f3 100644 --- a/lib/Target/PowerPC/PPCRegisterInfo.h +++ b/lib/Target/PowerPC/PPCRegisterInfo.h @@ -101,6 +101,7 @@ class PPCRegisterInfo : public PPCGenRegisterInfo { } void lowerDynamicAlloc(MachineBasicBlock::iterator II) const; + void lowerDynamicAreaOffset(MachineBasicBlock::iterator II) const; void lowerCRSpilling(MachineBasicBlock::iterator II, unsigned FrameIndex) const; void lowerCRRestore(MachineBasicBlock::iterator II, diff --git a/lib/Target/Sparc/DelaySlotFiller.cpp b/lib/Target/Sparc/DelaySlotFiller.cpp index 38bff44e7542..c689b7f7201e 100644 --- a/lib/Target/Sparc/DelaySlotFiller.cpp +++ b/lib/Target/Sparc/DelaySlotFiller.cpp @@ -122,6 +122,8 @@ bool Filler::runOnMachineBasicBlock(MachineBasicBlock &MBB) { continue; } + // TODO: If we ever want to support v7, this needs to be extended + // to cover all floating point operations. if (!Subtarget->isV9() && (MI->getOpcode() == SP::FCMPS || MI->getOpcode() == SP::FCMPD || MI->getOpcode() == SP::FCMPQ)) { diff --git a/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp b/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp index 728910422166..110316ba57b2 100644 --- a/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp +++ b/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp @@ -162,9 +162,9 @@ static void SortBlocks(MachineFunction &MF, const MachineLoopInfo &MLI) { SmallPtrSet Visited; SmallVector Stack; - MachineBasicBlock *Entry = &*MF.begin(); - Visited.insert(Entry); - Stack.push_back(POStackEntry(Entry, MF, MLI)); + MachineBasicBlock *EntryBlock = &*MF.begin(); + Visited.insert(EntryBlock); + Stack.push_back(POStackEntry(EntryBlock, MF, MLI)); for (;;) { POStackEntry &Entry = Stack.back(); @@ -220,7 +220,7 @@ static void SortBlocks(MachineFunction &MF, const MachineLoopInfo &MLI) { #endif } -static int GetLoopDepth(const MachineLoop *Loop) { +static unsigned GetLoopDepth(const MachineLoop *Loop) { return Loop ? Loop->getLoopDepth() : 0; } @@ -249,12 +249,12 @@ static void PlaceBlockMarkers(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPos; MachineLoop *HeaderLoop = MLI.getLoopFor(Header); - int MBBLoopDepth = GetLoopDepth(MLI.getLoopFor(&MBB)); - int HeaderLoopDepth = GetLoopDepth(HeaderLoop); + unsigned MBBLoopDepth = GetLoopDepth(MLI.getLoopFor(&MBB)); + unsigned HeaderLoopDepth = GetLoopDepth(HeaderLoop); if (HeaderLoopDepth > MBBLoopDepth) { // The nearest common dominating point is more deeply nested. Insert the // BLOCK just above the LOOP. 
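    // A small worked example of the walk below, assuming MBBLoopDepth == 1 and
    // HeaderLoopDepth == 3 (illustrative values only): the loop body executes
    // HeaderLoopDepth - 1 - MBBLoopDepth == 1 time, so HeaderLoop ends up at
    // depth MBBLoopDepth + 1 and the BLOCK is emitted at the top of that loop's
    // header, i.e. "just above the LOOP" as the comment above describes.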
- for (int i = 0; i < HeaderLoopDepth - 1 - MBBLoopDepth; ++i) + for (unsigned i = 0; i < HeaderLoopDepth - 1 - MBBLoopDepth; ++i) HeaderLoop = HeaderLoop->getParentLoop(); Header = HeaderLoop->getHeader(); InsertPos = Header->begin(); @@ -341,7 +341,7 @@ bool WebAssemblyCFGStackify::runOnMachineFunction(MachineFunction &MF) { PlaceMarkers(MF, MLI, TII, MDT); #ifndef NDEBUG - // Verify that block and loop beginnings and endings are in FIFO order, and + // Verify that block and loop beginnings and endings are in LIFO order, and // that all references to blocks are to blocks on the stack at the point of // the reference. SmallVector, 0> Stack; diff --git a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 8f1a06d46305..2485df1ab5d2 100644 --- a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -353,12 +353,10 @@ SDValue WebAssemblyTargetLowering::LowerReturn( // Record the number and types of the return values. for (const ISD::OutputArg &Out : Outs) { - if (Out.Flags.isByVal()) - fail(DL, DAG, "WebAssembly hasn't implemented byval results"); + assert(!Out.Flags.isByVal() && "byval is not valid for return values"); + assert(!Out.Flags.isNest() && "nest is not valid for return values"); if (Out.Flags.isInAlloca()) fail(DL, DAG, "WebAssembly hasn't implemented inalloca results"); - if (Out.Flags.isNest()) - fail(DL, DAG, "WebAssembly hasn't implemented nest results"); if (Out.Flags.isInConsecutiveRegs()) fail(DL, DAG, "WebAssembly hasn't implemented cons regs results"); if (Out.Flags.isInConsecutiveRegsLast()) diff --git a/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h b/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h index 4760f0d576e4..62c5f33cfad7 100644 --- a/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h +++ b/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h @@ -37,7 +37,7 @@ class WebAssemblyFunctionInfo final : public MachineFunctionInfo { /// determined or made to meet the stack requirements: /// - single use (per path) /// - single def (per path) - /// - defined and used in FIFO order with other stack registers + /// - defined and used in LIFO order with other stack registers BitVector VRegStackified; public: diff --git a/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp b/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp index c3847dd9fcb4..bdccc8577c5e 100644 --- a/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp +++ b/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp @@ -117,7 +117,7 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) { break; // Iterate through the inputs in reverse order, since we'll be pulling - // operands off the stack in FIFO order. + // operands off the stack in LIFO order. bool AnyStackified = false; for (MachineOperand &Op : reverse(Insert->uses())) { // We're only interested in explicit virtual register operands. diff --git a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td index 4057ff7a9b43..80a83fa76b57 100644 --- a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td +++ b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td @@ -40,7 +40,7 @@ def F32_0 : WebAssemblyReg<"%f32.0">; def F64_0 : WebAssemblyReg<"%f64.0">; // The expression stack "register". This is an opaque entity which serves to -// order uses and defs that must remain in FIFO order. +// order uses and defs that must remain in LIFO order. 
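// A minimal sketch of the LIFO constraint (illustrative values, not real
// registers): if %a is pushed onto the expression stack and %b is pushed after
// it, %b must be consumed before %a; consuming %a first (FIFO-style) would not
// match a stack machine's pop order.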
def EXPR_STACK : WebAssemblyReg<"STACK">; // The incoming arguments "register". This is an opaque entity which serves to diff --git a/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp b/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp index d0735b84de60..3a7f50e3b142 100644 --- a/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp +++ b/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp @@ -69,7 +69,8 @@ bool WebAssemblyStoreResults::runOnMachineFunction(MachineFunction &MF) { const MachineRegisterInfo &MRI = MF.getRegInfo(); MachineDominatorTree &MDT = getAnalysis(); - for (auto &MBB : MF) + for (auto &MBB : MF) { + DEBUG(dbgs() << "Basic Block: " << MBB.getName() << '\n'); for (auto &MI : MBB) switch (MI.getOpcode()) { default: @@ -94,9 +95,12 @@ bool WebAssemblyStoreResults::runOnMachineFunction(MachineFunction &MF) { ->getFirstTerminator(); if (&MI == Where || !MDT.dominates(&MI, Where)) continue; + DEBUG(dbgs() << "Setting operand " << O << " in " << *Where << + " from " << MI <<"\n"); O.setReg(ToReg); } } + } return true; } diff --git a/lib/Target/X86/X86ExpandPseudo.cpp b/lib/Target/X86/X86ExpandPseudo.cpp index 6015d3465de3..83a62b731b54 100644 --- a/lib/Target/X86/X86ExpandPseudo.cpp +++ b/lib/Target/X86/X86ExpandPseudo.cpp @@ -19,7 +19,7 @@ #include "X86InstrInfo.h" #include "X86MachineFunctionInfo.h" #include "X86Subtarget.h" -#include "llvm/Analysis/LibCallSemantics.h" +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/CodeGen/Passes.h" // For IDs of passes that are preserved. #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp index f4f7f0cf33b2..682f75c7f51c 100644 --- a/lib/Target/X86/X86FrameLowering.cpp +++ b/lib/Target/X86/X86FrameLowering.cpp @@ -18,7 +18,7 @@ #include "X86Subtarget.h" #include "X86TargetMachine.h" #include "llvm/ADT/SmallSet.h" -#include "llvm/Analysis/LibCallSemantics.h" +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -204,8 +204,12 @@ static bool isEAXLiveIn(MachineFunction &MF) { return false; } -/// Check whether or not the terminators of \p MBB needs to read EFLAGS. -static bool terminatorsNeedFlagsAsInput(const MachineBasicBlock &MBB) { +/// Check if the flags need to be preserved before the terminators. +/// This would be the case, if the eflags is live-in of the region +/// composed by the terminators or live-out of that region, without +/// being defined by a terminator. +static bool +flagsNeedToBePreservedBeforeTheTerminators(const MachineBasicBlock &MBB) { for (const MachineInstr &MI : MBB.terminators()) { bool BreakNext = false; for (const MachineOperand &MO : MI.operands()) { @@ -215,15 +219,27 @@ static bool terminatorsNeedFlagsAsInput(const MachineBasicBlock &MBB) { if (Reg != X86::EFLAGS) continue; - // This terminator needs an eflag that is not defined - // by a previous terminator. + // This terminator needs an eflags that is not defined + // by a previous another terminator: + // EFLAGS is live-in of the region composed by the terminators. if (!MO.isDef()) return true; + // This terminator defines the eflags, i.e., we don't need to preserve it. + // However, we still need to check this specific terminator does not + // read a live-in value. BreakNext = true; } + // We found a definition of the eflags, no need to preserve them. 
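  // Summarizing the three possible outcomes of this scan (illustrative; the
  // conditional-jump terminators such as JE/JNE are the usual EFLAGS readers):
  //  - a terminator reads EFLAGS before any terminator defines it
  //    -> EFLAGS is live-in to the terminator region, preserve it (above);
  //  - a terminator defines EFLAGS before any terminator reads it
  //    -> the old value is dead here, nothing to preserve;
  //  - no terminator touches EFLAGS but a successor has it live-in
  //    -> it is live-out of the block, preserve it (checked below).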
if (BreakNext) - break; + return false; } + + // None of the terminators use or define the eflags. + // Check if they are live-out, that would imply we need to preserve them. + for (const MachineBasicBlock *Succ : MBB.successors()) + if (Succ->isLiveIn(X86::EFLAGS)) + return true; + return false; } @@ -306,7 +322,11 @@ MachineInstrBuilder X86FrameLowering::BuildStackAdjustment( // is tricky. bool UseLEA; if (!InEpilogue) { - UseLEA = STI.useLeaForSP(); + // Check if inserting the prologue at the beginning + // of MBB would require to use LEA operations. + // We need to use LEA operations if EFLAGS is live in, because + // it means an instruction will read it before it gets defined. + UseLEA = STI.useLeaForSP() || MBB.isLiveIn(X86::EFLAGS); } else { // If we can use LEA for SP but we shouldn't, check that none // of the terminators uses the eflags. Otherwise we will insert @@ -315,10 +335,10 @@ MachineInstrBuilder X86FrameLowering::BuildStackAdjustment( // and is an optimization anyway. UseLEA = canUseLEAForSPInEpilogue(*MBB.getParent()); if (UseLEA && !STI.useLeaForSP()) - UseLEA = terminatorsNeedFlagsAsInput(MBB); + UseLEA = flagsNeedToBePreservedBeforeTheTerminators(MBB); // If that assert breaks, that means we do not do the right thing // in canUseAsEpilogue. - assert((UseLEA || !terminatorsNeedFlagsAsInput(MBB)) && + assert((UseLEA || !flagsNeedToBePreservedBeforeTheTerminators(MBB)) && "We shouldn't have allowed this insertion point"); } @@ -2566,10 +2586,10 @@ bool X86FrameLowering::canUseAsEpilogue(const MachineBasicBlock &MBB) const { return true; // If we cannot use LEA to adjust SP, we may need to use ADD, which - // clobbers the EFLAGS. Check that none of the terminators reads the - // EFLAGS, and if one uses it, conservatively assume this is not + // clobbers the EFLAGS. Check that we do not need to preserve it, + // otherwise, conservatively assume this is not // safe to insert the epilogue here. - return !terminatorsNeedFlagsAsInput(MBB); + return !flagsNeedToBePreservedBeforeTheTerminators(MBB); } MachineBasicBlock::iterator X86FrameLowering::restoreWin32EHStackPointers( diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index fa8c9cb136f5..17573733b3ec 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -26,7 +26,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringSwitch.h" -#include "llvm/Analysis/LibCallSemantics.h" +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -1145,7 +1145,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i32, Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i64, Custom); - if (Subtarget->hasFMA() || Subtarget->hasFMA4() || Subtarget->hasAVX512()) { + if (Subtarget->hasAnyFMA()) { setOperationAction(ISD::FMA, MVT::v8f32, Legal); setOperationAction(ISD::FMA, MVT::v4f64, Legal); setOperationAction(ISD::FMA, MVT::v4f32, Legal); @@ -4301,7 +4301,7 @@ static SDValue getConstVector(ArrayRef Values, MVT VT, } /// Returns a vector of specified type with all zero elements. 
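/// A minimal usage sketch for the MVT-only signature below (assuming the
/// caller already holds a legal, simple type, e.g. via getSimpleValueType()):
///   SDValue Zero = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);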
-static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget, +static SDValue getZeroVector(MVT VT, const X86Subtarget *Subtarget, SelectionDAG &DAG, SDLoc dl) { assert(VT.isVector() && "Expected a vector type"); @@ -6428,7 +6428,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { SmallVector V(NumElems); if (NumElems == 4 && NumZero > 0) { for (unsigned i = 0; i < 4; ++i) { - bool isZero = !(NonZeros & (1 << i)); + bool isZero = !(NonZeros & (1ULL << i)); if (isZero) V[i] = getZeroVector(VT, Subtarget, DAG, dl); else @@ -16525,6 +16525,24 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget DAG.getConstant(X86CC, dl, MVT::i8), Cond); return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } + case COMI_RM: { // Comparison intrinsics with Sae + SDValue LHS = Op.getOperand(1); + SDValue RHS = Op.getOperand(2); + SDValue CC = Op.getOperand(3); + SDValue Sae = Op.getOperand(4); + auto ComiType = TranslateX86ConstCondToX86CC(CC); + // choose between ordered and unordered (comi/ucomi) + unsigned comiOp = std::get<0>(ComiType) ? IntrData->Opc0 : IntrData->Opc1; + SDValue Cond; + if (cast(Sae)->getZExtValue() != + X86::STATIC_ROUNDING::CUR_DIRECTION) + Cond = DAG.getNode(comiOp, dl, MVT::i32, LHS, RHS, Sae); + else + Cond = DAG.getNode(comiOp, dl, MVT::i32, LHS, RHS); + SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, + DAG.getConstant(std::get<1>(ComiType), dl, MVT::i8), Cond); + return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); + } case VSHIFT: return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(), Op.getOperand(1), Op.getOperand(2), DAG); @@ -16800,7 +16818,7 @@ static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); SDValue Segment = DAG.getRegister(0, MVT::i32); if (Src.getOpcode() == ISD::UNDEF) - Src = getZeroVector(Op.getValueType(), Subtarget, DAG, dl); + Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl); SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain}; SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) }; @@ -20415,7 +20433,7 @@ bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; } bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { - if (!(Subtarget->hasFMA() || Subtarget->hasFMA4() || Subtarget->hasAVX512())) + if (!Subtarget->hasAnyFMA()) return false; VT = VT.getScalarType(); @@ -22480,7 +22498,7 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, ShuffleVectorSDNode *SVOp = cast(N); SDValue V1 = SVOp->getOperand(0); SDValue V2 = SVOp->getOperand(1); - EVT VT = SVOp->getValueType(0); + MVT VT = SVOp->getSimpleValueType(0); unsigned NumElems = VT.getVectorNumElements(); if (V1.getOpcode() == ISD::CONCAT_VECTORS && @@ -23363,7 +23381,7 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, return AddSub; // Combine 256-bit vector shuffles. This is only profitable when in AVX mode - if (Subtarget->hasFp256() && VT.is256BitVector() && + if (TLI.isTypeLegal(VT) && Subtarget->hasFp256() && VT.is256BitVector() && N->getOpcode() == ISD::VECTOR_SHUFFLE) return PerformShuffleCombine256(N, DAG, DCI, Subtarget); @@ -24838,7 +24856,7 @@ static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG, // the element size. The constant shift amount will be // encoded as a 8-bit immediate. 
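  // For example, with VT == v4i32 the per-element width is 32 bits, so any
  // constant logical shift amount of 32 or more clears every lane and the
  // whole shift can be folded to a zero vector by the check below.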
if (ShiftAmt.trunc(8).uge(MaxAmount)) - return getZeroVector(VT, Subtarget, DAG, DL); + return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, DL); } return SDValue(); @@ -25755,8 +25773,8 @@ static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG, SmallVector ShuffleVec(NumElems * SizeRatio, -1); for (unsigned i = 0; i != NumElems; ++i) ShuffleVec[i] = i * SizeRatio; - for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i) - ShuffleVec[i] = NumElems*SizeRatio; + for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i) + ShuffleVec[i] = NumElems * SizeRatio; NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask, DAG.getConstant(0, dl, WideVecVT), &ShuffleVec[0]); @@ -25837,8 +25855,8 @@ static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG, "WideVecVT should be legal"); SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec, - DAG.getUNDEF(WideVecVT), - &ShuffleVec[0]); + DAG.getUNDEF(WideVecVT), + &ShuffleVec[0]); SDValue NewMask; SDValue Mask = Mst->getMask(); @@ -25870,8 +25888,9 @@ static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG, NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops); } - return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal, Mst->getBasePtr(), - NewMask, StVT, Mst->getMemOperand(), false); + return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal, + Mst->getBasePtr(), NewMask, StVT, + Mst->getMemOperand(), false); } /// PerformSTORECombine - Do target-specific dag combines on STORE nodes. static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, @@ -26299,24 +26318,40 @@ static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG, static SDValue PerformFNEGCombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { EVT VT = N->getValueType(0); + EVT SVT = VT.getScalarType(); SDValue Arg = N->getOperand(0); + SDLoc DL(N); + + // Let legalize expand this if it isn't a legal type yet. + if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) + return SDValue(); + + // If we're negating a FMUL node on a target with FMA, then we can avoid the + // use of a constant by performing (-0 - A*B) instead. + // FIXME: Check rounding control flags as well once it becomes available. + if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) && + Arg->getFlags()->hasNoSignedZeros() && Subtarget->hasAnyFMA()) { + SDValue Zero = DAG.getConstantFP(0.0, DL, VT); + return DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0), + Arg.getOperand(1), Zero); + } // If we're negating a FMA node, then we can adjust the // instruction to include the extra negation. 
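  // The folds below follow the usual x86 FMA identities (a sketch, assuming
  // FMADD = A*B + C, FMSUB = A*B - C, FNMADD = -(A*B) + C, FNMSUB = -(A*B) - C):
  //   -(A*B + C)    = FNMSUB(A, B, C)      -(A*B - C)    = FNMADD(A, B, C)
  //   -(-(A*B) + C) = FMSUB(A, B, C)       -(-(A*B) - C) = FMADD(A, B, C)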
if (Arg.hasOneUse()) { switch (Arg.getOpcode()) { - case X86ISD::FMADD: - return DAG.getNode(X86ISD::FNMSUB, SDLoc(N), VT, Arg.getOperand(0), - Arg.getOperand(1), Arg.getOperand(2)); - case X86ISD::FMSUB: - return DAG.getNode(X86ISD::FNMADD, SDLoc(N), VT, Arg.getOperand(0), - Arg.getOperand(1), Arg.getOperand(2)); - case X86ISD::FNMADD: - return DAG.getNode(X86ISD::FMSUB, SDLoc(N), VT, Arg.getOperand(0), - Arg.getOperand(1), Arg.getOperand(2)); - case X86ISD::FNMSUB: - return DAG.getNode(X86ISD::FMADD, SDLoc(N), VT, Arg.getOperand(0), - Arg.getOperand(1), Arg.getOperand(2)); + case X86ISD::FMADD: + return DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0), + Arg.getOperand(1), Arg.getOperand(2)); + case X86ISD::FMSUB: + return DAG.getNode(X86ISD::FNMADD, DL, VT, Arg.getOperand(0), + Arg.getOperand(1), Arg.getOperand(2)); + case X86ISD::FNMADD: + return DAG.getNode(X86ISD::FMSUB, DL, VT, Arg.getOperand(0), + Arg.getOperand(1), Arg.getOperand(2)); + case X86ISD::FNMSUB: + return DAG.getNode(X86ISD::FMADD, DL, VT, Arg.getOperand(0), + Arg.getOperand(1), Arg.getOperand(2)); } } return SDValue(); @@ -26631,9 +26666,7 @@ static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG, return SDValue(); EVT ScalarVT = VT.getScalarType(); - if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || - (!Subtarget->hasFMA() && !Subtarget->hasFMA4() && - !Subtarget->hasAVX512())) + if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget->hasAnyFMA()) return SDValue(); SDValue A = N->getOperand(0); @@ -26732,18 +26765,18 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG, if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB) if (isNullConstant(LHS.getOperand(0)) && LHS.hasOneUse()) { - SDValue addV = DAG.getNode(ISD::ADD, DL, LHS.getValueType(), RHS, - LHS.getOperand(1)); - return DAG.getSetCC(DL, N->getValueType(0), addV, - DAG.getConstant(0, DL, addV.getValueType()), CC); - } + SDValue addV = DAG.getNode(ISD::ADD, DL, LHS.getValueType(), RHS, + LHS.getOperand(1)); + return DAG.getSetCC(DL, N->getValueType(0), addV, + DAG.getConstant(0, DL, addV.getValueType()), CC); + } if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB) if (isNullConstant(RHS.getOperand(0)) && RHS.hasOneUse()) { - SDValue addV = DAG.getNode(ISD::ADD, DL, RHS.getValueType(), LHS, - RHS.getOperand(1)); - return DAG.getSetCC(DL, N->getValueType(0), addV, - DAG.getConstant(0, DL, addV.getValueType()), CC); - } + SDValue addV = DAG.getNode(ISD::ADD, DL, RHS.getValueType(), LHS, + RHS.getOperand(1)); + return DAG.getSetCC(DL, N->getValueType(0), addV, + DAG.getConstant(0, DL, addV.getValueType()), CC); + } if (VT.getScalarType() == MVT::i1 && (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) { diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index 16b1f3b59b0d..d15d0dc96e6f 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -5605,6 +5605,29 @@ let Predicates = [HasAVX512] in { EVEX, EVEX_V128, EVEX_CD8<32, CD8VH>; } } + +// Unordered/Ordered scalar fp compare with Sea and set EFLAGS +multiclass avx512_ord_cmp_sae opc, X86VectorVTInfo _, SDNode OpNode, + string OpcodeStr> { + def rb: AVX512, EVEX, EVEX_B, VEX_LIG, EVEX_V128, + Sched<[WriteFAdd]>; +} + +let Defs = [EFLAGS], Predicates = [HasAVX512] in { + defm VUCOMISSZ : avx512_ord_cmp_sae<0x2E, v4f32x_info, X86ucomiSae, "vucomiss">, + AVX512PSIi8Base, EVEX_CD8<32, CD8VT1>; + defm VUCOMISDZ : avx512_ord_cmp_sae<0x2E, v2f64x_info, 
X86ucomiSae, "vucomisd">, + AVX512PDIi8Base, VEX_W, EVEX_CD8<64, CD8VT1>; + defm VCOMISSZ : avx512_ord_cmp_sae<0x2F, v4f32x_info, X86comiSae, "vcomiss">, + AVX512PSIi8Base, EVEX_CD8<32, CD8VT1>; + defm VCOMISDZ : avx512_ord_cmp_sae<0x2F, v2f64x_info, X86comiSae, "vcomisd">, + AVX512PDIi8Base, VEX_W, EVEX_CD8<64, CD8VT1>; +} + let Defs = [EFLAGS], Predicates = [HasAVX512] in { defm VUCOMISSZ : sse12_ord_cmp<0x2E, FR32X, X86cmp, f32, f32mem, loadf32, "ucomiss">, PS, EVEX, VEX_LIG, @@ -5911,12 +5934,12 @@ multiclass avx512_sqrt_scalar opc, string OpcodeStr,X86VectorVTInfo _, EVEX_B, EVEX_RC; let isCodeGenOnly = 1 in { - def r : SI; let mayLoad = 1 in - def m : SI; } diff --git a/lib/Target/X86/X86InstrFPStack.td b/lib/Target/X86/X86InstrFPStack.td index 51648c6c567e..03ae21125b0e 100644 --- a/lib/Target/X86/X86InstrFPStack.td +++ b/lib/Target/X86/X86InstrFPStack.td @@ -137,69 +137,99 @@ def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, RFP80:$src2), TwoArgFP, // The FopST0 series are not included here because of the irregularities // in where the 'r' goes in assembly output. // These instructions cannot address 80-bit memory. -multiclass FPBinary { +multiclass FPBinary { // ST(0) = ST(0) + [mem] def _Fp32m : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, f32mem:$src2), OneArgFPRW, - [(set RFP32:$dst, - (OpNode RFP32:$src1, (loadf32 addr:$src2)))]>; + [!if(Forward, + (set RFP32:$dst, + (OpNode RFP32:$src1, (loadf32 addr:$src2))), + (set RFP32:$dst, + (OpNode (loadf32 addr:$src2), RFP32:$src1)))]>; def _Fp64m : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, f64mem:$src2), OneArgFPRW, - [(set RFP64:$dst, - (OpNode RFP64:$src1, (loadf64 addr:$src2)))]>; + [!if(Forward, + (set RFP64:$dst, + (OpNode RFP64:$src1, (loadf64 addr:$src2))), + (set RFP64:$dst, + (OpNode (loadf64 addr:$src2), RFP64:$src1)))]>; def _Fp64m32: FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, f32mem:$src2), OneArgFPRW, - [(set RFP64:$dst, - (OpNode RFP64:$src1, (f64 (extloadf32 addr:$src2))))]>; + [!if(Forward, + (set RFP64:$dst, + (OpNode RFP64:$src1, (f64 (extloadf32 addr:$src2)))), + (set RFP64:$dst, + (OpNode (f64 (extloadf32 addr:$src2)), RFP64:$src1)))]>; def _Fp80m32: FpI_<(outs RFP80:$dst), (ins RFP80:$src1, f32mem:$src2), OneArgFPRW, - [(set RFP80:$dst, - (OpNode RFP80:$src1, (f80 (extloadf32 addr:$src2))))]>; + [!if(Forward, + (set RFP80:$dst, + (OpNode RFP80:$src1, (f80 (extloadf32 addr:$src2)))), + (set RFP80:$dst, + (OpNode (f80 (extloadf32 addr:$src2)), RFP80:$src1)))]>; def _Fp80m64: FpI_<(outs RFP80:$dst), (ins RFP80:$src1, f64mem:$src2), OneArgFPRW, - [(set RFP80:$dst, - (OpNode RFP80:$src1, (f80 (extloadf64 addr:$src2))))]>; + [!if(Forward, + (set RFP80:$dst, + (OpNode RFP80:$src1, (f80 (extloadf64 addr:$src2)))), + (set RFP80:$dst, + (OpNode (f80 (extloadf64 addr:$src2)), RFP80:$src1)))]>; +let mayLoad = 1 in def _F32m : FPI<0xD8, fp, (outs), (ins f32mem:$src), - !strconcat("f", asmstring, "{s}\t$src")> { - let mayLoad = 1; -} + !strconcat("f", asmstring, "{s}\t$src")>; +let mayLoad = 1 in def _F64m : FPI<0xDC, fp, (outs), (ins f64mem:$src), - !strconcat("f", asmstring, "{l}\t$src")> { - let mayLoad = 1; -} + !strconcat("f", asmstring, "{l}\t$src")>; // ST(0) = ST(0) + [memint] def _FpI16m32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i16mem:$src2), OneArgFPRW, - [(set RFP32:$dst, (OpNode RFP32:$src1, - (X86fild addr:$src2, i16)))]>; + [!if(Forward, + (set RFP32:$dst, + (OpNode RFP32:$src1, (X86fild addr:$src2, i16))), + (set RFP32:$dst, + (OpNode (X86fild addr:$src2, i16), RFP32:$src1)))]>; def _FpI32m32 : 
FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i32mem:$src2), OneArgFPRW, - [(set RFP32:$dst, (OpNode RFP32:$src1, - (X86fild addr:$src2, i32)))]>; + [!if(Forward, + (set RFP32:$dst, + (OpNode RFP32:$src1, (X86fild addr:$src2, i32))), + (set RFP32:$dst, + (OpNode (X86fild addr:$src2, i32), RFP32:$src1)))]>; def _FpI16m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i16mem:$src2), OneArgFPRW, - [(set RFP64:$dst, (OpNode RFP64:$src1, - (X86fild addr:$src2, i16)))]>; + [!if(Forward, + (set RFP64:$dst, + (OpNode RFP64:$src1, (X86fild addr:$src2, i16))), + (set RFP64:$dst, + (OpNode (X86fild addr:$src2, i16), RFP64:$src1)))]>; def _FpI32m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i32mem:$src2), OneArgFPRW, - [(set RFP64:$dst, (OpNode RFP64:$src1, - (X86fild addr:$src2, i32)))]>; + [!if(Forward, + (set RFP64:$dst, + (OpNode RFP64:$src1, (X86fild addr:$src2, i32))), + (set RFP64:$dst, + (OpNode (X86fild addr:$src2, i32), RFP64:$src1)))]>; def _FpI16m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, i16mem:$src2), - OneArgFPRW, - [(set RFP80:$dst, (OpNode RFP80:$src1, - (X86fild addr:$src2, i16)))]>; + OneArgFPRW, + [!if(Forward, + (set RFP80:$dst, + (OpNode RFP80:$src1, (X86fild addr:$src2, i16))), + (set RFP80:$dst, + (OpNode (X86fild addr:$src2, i16), RFP80:$src1)))]>; def _FpI32m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, i32mem:$src2), - OneArgFPRW, - [(set RFP80:$dst, (OpNode RFP80:$src1, - (X86fild addr:$src2, i32)))]>; + OneArgFPRW, + [!if(Forward, + (set RFP80:$dst, + (OpNode RFP80:$src1, (X86fild addr:$src2, i32))), + (set RFP80:$dst, + (OpNode (X86fild addr:$src2, i32), RFP80:$src1)))]>; +let mayLoad = 1 in def _FI16m : FPI<0xDE, fp, (outs), (ins i16mem:$src), - !strconcat("fi", asmstring, "{s}\t$src")> { - let mayLoad = 1; -} + !strconcat("fi", asmstring, "{s}\t$src")>; +let mayLoad = 1 in def _FI32m : FPI<0xDA, fp, (outs), (ins i32mem:$src), - !strconcat("fi", asmstring, "{l}\t$src")> { - let mayLoad = 1; -} + !strconcat("fi", asmstring, "{l}\t$src")>; } let Defs = [FPSW] in { @@ -213,14 +243,14 @@ defm DIV : FPBinary_rr; let SchedRW = [WriteFAddLd] in { defm ADD : FPBinary; defm SUB : FPBinary; -defm SUBR: FPBinary; +defm SUBR: FPBinary; } let SchedRW = [WriteFMulLd] in { defm MUL : FPBinary; } let SchedRW = [WriteFDivLd] in { defm DIV : FPBinary; -defm DIVR: FPBinary; +defm DIVR: FPBinary; } } @@ -306,7 +336,7 @@ def FCOMP64m : FPI<0xDC, MRM3m, (outs), (ins f64mem:$src), "fcomp{l}\t$src">; def FRSTORm : FPI<0xDD, MRM4m, (outs f32mem:$dst), (ins), "frstor\t$dst">; def FSAVEm : FPI<0xDD, MRM6m, (outs f32mem:$dst), (ins), "fnsave\t$dst">; -def FNSTSWm : FPI<0xDD, MRM7m, (outs f32mem:$dst), (ins), "fnstsw\t$dst">; +def FNSTSWm : FPI<0xDD, MRM7m, (outs i16mem:$dst), (ins), "fnstsw\t$dst">; def FICOM16m : FPI<0xDE, MRM2m, (outs), (ins i16mem:$src), "ficom{s}\t$src">; def FICOMP16m: FPI<0xDE, MRM3m, (outs), (ins i16mem:$src), "ficomp{s}\t$src">; diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td index 25f247e9d620..b456460a5bb5 100644 --- a/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -38,6 +38,8 @@ def bc_mmx : PatFrag<(ops node:$in), (x86mmx (bitconvert node:$in))>; def SDTX86VFCMP : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<1, 2>, SDTCisFP<1>, SDTCisVT<3, i8>, SDTCisVec<1>]>; +def SDTX86CmpTestSae : SDTypeProfile<1, 3, [SDTCisVT<0, i32>, + SDTCisSameAs<1, 2>, SDTCisInt<3>]>; def X86fmin : SDNode<"X86ISD::FMIN", SDTFPBinOp>; def X86fmax : SDNode<"X86ISD::FMAX", SDTFPBinOp>; @@ -66,7 +68,9 @@ 
def X86fhsub : SDNode<"X86ISD::FHSUB", SDTFPBinOp>; def X86hadd : SDNode<"X86ISD::HADD", SDTIntBinOp>; def X86hsub : SDNode<"X86ISD::HSUB", SDTIntBinOp>; def X86comi : SDNode<"X86ISD::COMI", SDTX86CmpTest>; +def X86comiSae : SDNode<"X86ISD::COMI", SDTX86CmpTestSae>; def X86ucomi : SDNode<"X86ISD::UCOMI", SDTX86CmpTest>; +def X86ucomiSae: SDNode<"X86ISD::UCOMI", SDTX86CmpTestSae>; def X86cmps : SDNode<"X86ISD::FSETCC", SDTX86Cmps>; //def X86cmpsd : SDNode<"X86ISD::FSETCCsd", SDTX86Cmpsd>; def X86cvtdq2pd: SDNode<"X86ISD::CVTDQ2PD", diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index aaeef465bf50..12da3a9319e6 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -3517,23 +3517,23 @@ unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands(MachineInstr *MI, bool IsIntrinOpcode; isFMA3(Opc, &IsIntrinOpcode); - unsigned GroupsNum; + size_t GroupsNum; const unsigned (*OpcodeGroups)[3]; if (IsIntrinOpcode) { - GroupsNum = sizeof(IntrinOpcodeGroups) / sizeof(IntrinOpcodeGroups[0]); + GroupsNum = array_lengthof(IntrinOpcodeGroups); OpcodeGroups = IntrinOpcodeGroups; } else { - GroupsNum = sizeof(RegularOpcodeGroups) / sizeof(RegularOpcodeGroups[0]); + GroupsNum = array_lengthof(RegularOpcodeGroups); OpcodeGroups = RegularOpcodeGroups; } const unsigned *FoundOpcodesGroup = nullptr; - unsigned FormIndex; + size_t FormIndex; // Look for the input opcode in the corresponding opcodes table. - unsigned GroupIndex = 0; - for (; GroupIndex < GroupsNum && !FoundOpcodesGroup; GroupIndex++) { - for (FormIndex = 0; FormIndex < FormsNum; FormIndex++) { + for (size_t GroupIndex = 0; GroupIndex < GroupsNum && !FoundOpcodesGroup; + ++GroupIndex) { + for (FormIndex = 0; FormIndex < FormsNum; ++FormIndex) { if (OpcodeGroups[GroupIndex][FormIndex] == Opc) { FoundOpcodesGroup = OpcodeGroups[GroupIndex]; break; @@ -6715,16 +6715,16 @@ static const uint16_t ReplaceableInstrsAVX2[][3] = { // domains, but they require a bit more work than just switching opcodes. 
static const uint16_t *lookup(unsigned opcode, unsigned domain) { - for (unsigned i = 0, e = array_lengthof(ReplaceableInstrs); i != e; ++i) - if (ReplaceableInstrs[i][domain-1] == opcode) - return ReplaceableInstrs[i]; + for (const uint16_t (&Row)[3] : ReplaceableInstrs) + if (Row[domain-1] == opcode) + return Row; return nullptr; } static const uint16_t *lookupAVX2(unsigned opcode, unsigned domain) { - for (unsigned i = 0, e = array_lengthof(ReplaceableInstrsAVX2); i != e; ++i) - if (ReplaceableInstrsAVX2[i][domain-1] == opcode) - return ReplaceableInstrsAVX2[i]; + for (const uint16_t (&Row)[3] : ReplaceableInstrsAVX2) + if (Row[domain-1] == opcode) + return Row; return nullptr; } diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h index 80e55d04c1f3..bb2f7248b0e9 100644 --- a/lib/Target/X86/X86IntrinsicsInfo.h +++ b/lib/Target/X86/X86IntrinsicsInfo.h @@ -20,7 +20,7 @@ enum IntrinsicType { INTR_NO_TYPE, GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, ADX, FPCLASS, FPCLASSS, INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_2OP_IMM8, INTR_TYPE_3OP, INTR_TYPE_4OP, - CMP_MASK, CMP_MASK_CC,CMP_MASK_SCALAR_CC, VSHIFT, VSHIFT_MASK, COMI, + CMP_MASK, CMP_MASK_CC,CMP_MASK_SCALAR_CC, VSHIFT, VSHIFT_MASK, COMI, COMI_RM, INTR_TYPE_1OP_MASK, INTR_TYPE_1OP_MASK_RM, INTR_TYPE_2OP_MASK, INTR_TYPE_2OP_MASK_RM, INTR_TYPE_2OP_IMM8_MASK, INTR_TYPE_3OP_MASK, INTR_TYPE_3OP_MASK_RM, INTR_TYPE_3OP_IMM8_MASK, @@ -1630,6 +1630,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_rsqrt28_ps, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0), X86_INTRINSIC_DATA(avx512_rsqrt28_sd, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28, 0), X86_INTRINSIC_DATA(avx512_rsqrt28_ss, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28, 0), + X86_INTRINSIC_DATA(avx512_vcomi_sd, COMI_RM, X86ISD::COMI, X86ISD::UCOMI), + X86_INTRINSIC_DATA(avx512_vcomi_ss, COMI_RM, X86ISD::COMI, X86ISD::UCOMI), X86_INTRINSIC_DATA(avx_hadd_pd_256, INTR_TYPE_2OP, X86ISD::FHADD, 0), X86_INTRINSIC_DATA(avx_hadd_ps_256, INTR_TYPE_2OP, X86ISD::FHADD, 0), X86_INTRINSIC_DATA(avx_hsub_pd_256, INTR_TYPE_2OP, X86ISD::FHSUB, 0), @@ -1821,6 +1823,95 @@ static void verifyIntrinsicTables() { "Intrinsic data tables should have unique entries"); } +// X86 specific compare constants. 
+// They must be kept in synch with avxintrin.h +#define _X86_CMP_EQ_OQ 0x00 /* Equal (ordered, non-signaling) */ +#define _X86_CMP_LT_OS 0x01 /* Less-than (ordered, signaling) */ +#define _X86_CMP_LE_OS 0x02 /* Less-than-or-equal (ordered, signaling) */ +#define _X86_CMP_UNORD_Q 0x03 /* Unordered (non-signaling) */ +#define _X86_CMP_NEQ_UQ 0x04 /* Not-equal (unordered, non-signaling) */ +#define _X86_CMP_NLT_US 0x05 /* Not-less-than (unordered, signaling) */ +#define _X86_CMP_NLE_US 0x06 /* Not-less-than-or-equal (unordered, signaling) */ +#define _X86_CMP_ORD_Q 0x07 /* Ordered (nonsignaling) */ +#define _X86_CMP_EQ_UQ 0x08 /* Equal (unordered, non-signaling) */ +#define _X86_CMP_NGE_US 0x09 /* Not-greater-than-or-equal (unord, signaling) */ +#define _X86_CMP_NGT_US 0x0a /* Not-greater-than (unordered, signaling) */ +#define _X86_CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling) */ +#define _X86_CMP_NEQ_OQ 0x0c /* Not-equal (ordered, non-signaling) */ +#define _X86_CMP_GE_OS 0x0d /* Greater-than-or-equal (ordered, signaling) */ +#define _X86_CMP_GT_OS 0x0e /* Greater-than (ordered, signaling) */ +#define _X86_CMP_TRUE_UQ 0x0f /* True (unordered, non-signaling) */ +#define _X86_CMP_EQ_OS 0x10 /* Equal (ordered, signaling) */ +#define _X86_CMP_LT_OQ 0x11 /* Less-than (ordered, non-signaling) */ +#define _X86_CMP_LE_OQ 0x12 /* Less-than-or-equal (ordered, non-signaling) */ +#define _X86_CMP_UNORD_S 0x13 /* Unordered (signaling) */ +#define _X86_CMP_NEQ_US 0x14 /* Not-equal (unordered, signaling) */ +#define _X86_CMP_NLT_UQ 0x15 /* Not-less-than (unordered, non-signaling) */ +#define _X86_CMP_NLE_UQ 0x16 /* Not-less-than-or-equal (unord, non-signaling) */ +#define _X86_CMP_ORD_S 0x17 /* Ordered (signaling) */ +#define _X86_CMP_EQ_US 0x18 /* Equal (unordered, signaling) */ +#define _X86_CMP_NGE_UQ 0x19 /* Not-greater-than-or-equal (unord, non-sign) */ +#define _X86_CMP_NGT_UQ 0x1a /* Not-greater-than (unordered, non-signaling) */ +#define _X86_CMP_FALSE_OS 0x1b /* False (ordered, signaling) */ +#define _X86_CMP_NEQ_OS 0x1c /* Not-equal (ordered, signaling) */ +#define _X86_CMP_GE_OQ 0x1d /* Greater-than-or-equal (ordered, non-signaling) */ +#define _X86_CMP_GT_OQ 0x1e /* Greater-than (ordered, non-signaling) */ +#define _X86_CMP_TRUE_US 0x1f /* True (unordered, signaling) */ + +/* +* Get comparison modifier from _mm_comi_round_sd/ss intrinsic +* Return tuple +*/ +static std::tuple TranslateX86ConstCondToX86CC(SDValue &imm) { + ConstantSDNode *CImm = dyn_cast(imm); + unsigned IntImm = CImm->getZExtValue(); + // On a floating point condition, the flags are set as follows: + // ZF PF CF op + // 0 | 0 | 0 | X > Y + // 0 | 0 | 1 | X < Y + // 1 | 0 | 0 | X == Y + // 1 | 1 | 1 | unordered + switch (IntImm) { + default: llvm_unreachable("Invalid floating point compare value for Comi!"); + case _X86_CMP_EQ_OQ: // 0x00 - Equal (ordered, nonsignaling) + case _X86_CMP_EQ_OS: // 0x10 - Equal (ordered, signaling) + return std::make_tuple(true, X86::COND_E); + case _X86_CMP_EQ_UQ: // 0x08 - Equal (unordered, non-signaling) + case _X86_CMP_EQ_US: // 0x18 - Equal (unordered, signaling) + return std::make_tuple(false , X86::COND_E); + case _X86_CMP_LT_OS: // 0x01 - Less-than (ordered, signaling) + case _X86_CMP_LT_OQ: // 0x11 - Less-than (ordered, nonsignaling) + return std::make_tuple(true, X86::COND_B); + case _X86_CMP_NGE_US: // 0x09 - Not-greater-than-or-equal (unordered, signaling) + case _X86_CMP_NGE_UQ: // 0x19 - Not-greater-than-or-equal (unordered, nonsignaling) + return std::make_tuple(false , 
X86::COND_B); + case _X86_CMP_LE_OS: // 0x02 - Less-than-or-equal (ordered, signaling) + case _X86_CMP_LE_OQ: // 0x12 - Less-than-or-equal (ordered, nonsignaling) + return std::make_tuple(true, X86::COND_BE); + case _X86_CMP_NGT_US: // 0x0A - Not-greater-than (unordered, signaling) + case _X86_CMP_NGT_UQ: // 0x1A - Not-greater-than (unordered, nonsignaling) + return std::make_tuple(false, X86::COND_BE); + case _X86_CMP_GT_OS: // 0x0E - Greater-than (ordered, signaling) + case _X86_CMP_GT_OQ: // 0x1E - Greater-than (ordered, nonsignaling) + return std::make_tuple(true, X86::COND_A); + case _X86_CMP_NLE_US: // 0x06 - Not-less-than-or-equal (unordered,signaling) + case _X86_CMP_NLE_UQ: // 0x16 - Not-less-than-or-equal (unordered, nonsignaling) + return std::make_tuple(false, X86::COND_A); + case _X86_CMP_GE_OS: // 0x0D - Greater-than-or-equal (ordered, signaling) + case _X86_CMP_GE_OQ: // 0x1D - Greater-than-or-equal (ordered, nonsignaling) + return std::make_tuple(true, X86::COND_AE); + case _X86_CMP_NLT_US: // 0x05 - Not-less-than (unordered, signaling) + case _X86_CMP_NLT_UQ: // 0x15 - Not-less-than (unordered, nonsignaling) + return std::make_tuple(false, X86::COND_AE); + case _X86_CMP_NEQ_OQ: // 0x0C - Not-equal (ordered, non-signaling) + case _X86_CMP_NEQ_OS: // 0x1C - Not-equal (ordered, signaling) + return std::make_tuple(true, X86::COND_NE); + case _X86_CMP_NEQ_UQ: // 0x04 - Not-equal (unordered, nonsignaling) + case _X86_CMP_NEQ_US: // 0x14 - Not-equal (unordered, signaling) + return std::make_tuple(false, X86::COND_NE); + } +} + } // End llvm namespace #endif diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td index 12f38c7946a8..ceeb57d0cc4c 100644 --- a/lib/Target/X86/X86RegisterInfo.td +++ b/lib/Target/X86/X86RegisterInfo.td @@ -375,7 +375,7 @@ def GR32_TC : RegisterClass<"X86", [i32], 32, (add EAX, ECX, EDX)>; def GR64_TC : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, RSI, RDI, R8, R9, R11, RIP)>; def GR64_TCW64 : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, - R8, R9, R10, R11)>; + R8, R9, R10, R11, RIP)>; // GR8_NOREX - GR8 registers which do not require a REX prefix. def GR8_NOREX : RegisterClass<"X86", [i8], 8, diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index 5d6088439578..fd896c2857f6 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -354,9 +354,11 @@ class X86Subtarget final : public X86GenSubtargetInfo { bool hasXSAVEC() const { return HasXSAVEC; } bool hasXSAVES() const { return HasXSAVES; } bool hasPCLMUL() const { return HasPCLMUL; } - bool hasFMA() const { return HasFMA; } - // FIXME: Favor FMA when both are enabled. Is this the right thing to do? - bool hasFMA4() const { return HasFMA4 && !HasFMA; } + // Prefer FMA4 to FMA - its better for commutation/memory folding and + // has equal or better performance on all supported targets. 
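  // Illustrative outcome of the accessors below for a subtarget that reports
  // both feature bits (HasFMA == HasFMA4 == true): hasFMA() -> false,
  // hasFMA4() -> true, hasAnyFMA() -> true, so FMA4 patterns are selected
  // while FMA formation in general stays enabled.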
+ bool hasFMA() const { return HasFMA && !HasFMA4; } + bool hasFMA4() const { return HasFMA4; } + bool hasAnyFMA() const { return hasFMA() || hasFMA4() || hasAVX512(); } bool hasXOP() const { return HasXOP; } bool hasTBM() const { return HasTBM; } bool hasMOVBE() const { return HasMOVBE; } diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp index c648750b202e..cf7a826ea85d 100644 --- a/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/lib/Target/X86/X86TargetTransformInfo.cpp @@ -528,17 +528,31 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); - static const TypeConversionCostTblEntry AVX512ConversionTbl[] = { + static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = { + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, + { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 }, + { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 }, + { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 }, + { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 }, + + { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 }, + { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, 1 }, + { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, 1 }, + { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 1 }, + { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, 1 }, + { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, 1 }, + }; + + static const TypeConversionCostTblEntry AVX512FConversionTbl[] = { { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 1 }, { ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, 3 }, { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 1 }, - { ISD::FP_ROUND, MVT::v16f32, MVT::v8f64, 3 }, { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 1 }, { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 1 }, { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 1 }, { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 1 }, - { ISD::TRUNCATE, MVT::v16i32, MVT::v8i64, 4 }, // v16i1 -> v16i32 - load + broadcast { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, @@ -548,16 +562,46 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 1 }, { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 }, { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 }, - { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v16i32, 3 }, - { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v16i32, 3 }, + { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 1 }, + { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 1 }, + { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 1 }, { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 }, { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 2 }, { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 2 }, { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 }, { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 }, + { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i8, 2 }, { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 }, { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 }, + + { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 }, + { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 2 }, + { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 2 }, + { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 }, + { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, + { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 }, + { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 }, + { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 }, + { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i8, 2 }, + { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 2 }, + { ISD::UINT_TO_FP, 
MVT::v8f32, MVT::v8i16, 2 }, + { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i8, 2 }, + { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i16, 2 }, + { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 }, + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 2 }, + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 5 }, + { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 2 }, + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 }, + { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 12 }, + { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 26 }, + + { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 }, + { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 }, + { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 1 }, + { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, 1 }, }; static const TypeConversionCostTblEntry AVX2ConversionTbl[] = { @@ -693,12 +737,6 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { return LTSrc.first * Entry->Cost; } - if (ST->hasAVX512()) { - if (const auto *Entry = ConvertCostTableLookup(AVX512ConversionTbl, ISD, - LTDest.second, LTSrc.second)) - return Entry->Cost; - } - EVT SrcTy = TLI->getValueType(DL, Src); EVT DstTy = TLI->getValueType(DL, Dst); @@ -706,6 +744,18 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { if (!SrcTy.isSimple() || !DstTy.isSimple()) return BaseT::getCastInstrCost(Opcode, Dst, Src); + if (ST->hasDQI()) + if (const auto *Entry = ConvertCostTableLookup(AVX512DQConversionTbl, ISD, + DstTy.getSimpleVT(), + SrcTy.getSimpleVT())) + return Entry->Cost; + + if (ST->hasAVX512()) + if (const auto *Entry = ConvertCostTableLookup(AVX512FConversionTbl, ISD, + DstTy.getSimpleVT(), + SrcTy.getSimpleVT())) + return Entry->Cost; + if (ST->hasAVX2()) { if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD, DstTy.getSimpleVT(), diff --git a/lib/Target/X86/X86WinEHState.cpp b/lib/Target/X86/X86WinEHState.cpp index 8a5aa40bc7f2..0276f3969b48 100644 --- a/lib/Target/X86/X86WinEHState.cpp +++ b/lib/Target/X86/X86WinEHState.cpp @@ -15,7 +15,7 @@ //===----------------------------------------------------------------------===// #include "X86.h" -#include "llvm/Analysis/LibCallSemantics.h" +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/WinEHFuncInfo.h" diff --git a/lib/Transforms/IPO/FunctionImport.cpp b/lib/Transforms/IPO/FunctionImport.cpp index ab0f7114957b..c2359a8a172e 100644 --- a/lib/Transforms/IPO/FunctionImport.cpp +++ b/lib/Transforms/IPO/FunctionImport.cpp @@ -51,7 +51,7 @@ static std::unique_ptr loadFile(const std::string &FileName, } // Get a Module for \p FileName from the cache, or load it lazily. -Module &FunctionImporter::getOrLoadModule(StringRef FileName) { +Module &ModuleLazyLoaderCache::operator()(StringRef FileName) { auto &Module = ModuleMap[FileName]; if (!Module) Module = loadFile(FileName, Context); @@ -66,7 +66,6 @@ static void findExternalCalls(const Function &F, StringSet<> &CalledFunctions, for (auto &BB : F) { for (auto &I : BB) { if (isa(I)) { - DEBUG(dbgs() << "Found a call: '" << I << "'\n"); auto CalledFunction = cast(I).getCalledFunction(); // Insert any new external calls that have not already been // added to set/worklist. @@ -81,29 +80,14 @@ static void findExternalCalls(const Function &F, StringSet<> &CalledFunctions, } } -// Automatically import functions in Module \p M based on the summaries index. -// -// The current implementation imports every called functions that exists in the -// summaries index. 
-bool FunctionImporter::importFunctions(Module &M) { - assert(&Context == &M.getContext()); - - bool Changed = false; - - /// First step is collecting the called external functions. - StringSet<> CalledFunctions; - SmallVector Worklist; - for (auto &F : M) { - if (F.isDeclaration() || F.hasFnAttribute(Attribute::OptimizeNone)) - continue; - findExternalCalls(F, CalledFunctions, Worklist); - } - - /// Second step: for every call to an external function, try to import it. - - // Linker that will be used for importing function - Linker L(&M, DiagnosticHandler); - +// Helper function: given a worklist and an index, will process all the worklist +// and import them based on the summary information +static unsigned ProcessImportWorklist( + Module &DestModule, SmallVector &Worklist, + StringSet<> &CalledFunctions, Linker &TheLinker, + const FunctionInfoIndex &Index, + std::function &LazyModuleLoader) { + unsigned ImportCount = 0; while (!Worklist.empty()) { auto CalledFunctionName = Worklist.pop_back_val(); DEBUG(dbgs() << "Process import for " << CalledFunctionName << "\n"); @@ -124,35 +108,32 @@ bool FunctionImporter::importFunctions(Module &M) { auto *Summary = Info->functionSummary(); if (!Summary) { // FIXME: in case we are lazyloading summaries, we can do it now. - dbgs() << "Missing summary for " << CalledFunctionName - << ", error at import?\n"; + DEBUG(dbgs() << "Missing summary for " << CalledFunctionName + << ", error at import?\n"); llvm_unreachable("Missing summary"); } if (Summary->instCount() > ImportInstrLimit) { - dbgs() << "Skip import of " << CalledFunctionName << " with " - << Summary->instCount() << " instructions (limit " - << ImportInstrLimit << ")\n"; + DEBUG(dbgs() << "Skip import of " << CalledFunctionName << " with " + << Summary->instCount() << " instructions (limit " + << ImportInstrLimit << ")\n"); continue; } - // - // No profitability notion right now, just import all the time... - // - // Get the module path from the summary. auto FileName = Summary->modulePath(); DEBUG(dbgs() << "Importing " << CalledFunctionName << " from " << FileName << "\n"); // Get the module for the import (potentially from the cache). - auto &Module = getOrLoadModule(FileName); + auto &Module = LazyModuleLoader(FileName); + assert(&Module.getContext() == &DestModule.getContext()); // The function that we will import! GlobalValue *SGV = Module.getNamedValue(CalledFunctionName); StringRef ImportFunctionName = CalledFunctionName; if (!SGV) { - // Might be local in source Module, promoted/renamed in dest Module M. + // Might be local in source Module, promoted/renamed in DestModule. std::pair Split = CalledFunctionName.split(".llvm."); SGV = Module.getNamedValue(Split.first); @@ -186,19 +167,55 @@ bool FunctionImporter::importFunctions(Module &M) { } // Link in the specified function. - if (L.linkInModule(&Module, Linker::Flags::None, &Index, F)) + DenseSet FunctionsToImport; + FunctionsToImport.insert(F); + if (TheLinker.linkInModule(Module, Linker::Flags::None, &Index, + &FunctionsToImport)) report_fatal_error("Function Import: link error"); // Process the newly imported function and add callees to the worklist. - GlobalValue *NewGV = M.getNamedValue(ImportFunctionName); + GlobalValue *NewGV = DestModule.getNamedValue(ImportFunctionName); assert(NewGV); Function *NewF = dyn_cast(NewGV); assert(NewF); findExternalCalls(*NewF, CalledFunctions, Worklist); + ++ImportCount; + } + return ImportCount; +} + +// Automatically import functions in Module \p DestModule based on the summaries +// index. 
+// +// The current implementation imports every called functions that exists in the +// summaries index. +bool FunctionImporter::importFunctions(Module &DestModule) { + DEBUG(errs() << "Starting import for Module " + << DestModule.getModuleIdentifier() << "\n"); + unsigned ImportedCount = 0; - Changed = true; + /// First step is collecting the called external functions. + StringSet<> CalledFunctions; + SmallVector Worklist; + for (auto &F : DestModule) { + if (F.isDeclaration() || F.hasFnAttribute(Attribute::OptimizeNone)) + continue; + findExternalCalls(F, CalledFunctions, Worklist); } - return Changed; + if (Worklist.empty()) + return false; + + /// Second step: for every call to an external function, try to import it. + + // Linker that will be used for importing function + Linker TheLinker(DestModule, DiagnosticHandler); + + ImportedCount += ProcessImportWorklist(DestModule, Worklist, CalledFunctions, + TheLinker, Index, getLazyModule); + + DEBUG(errs() << "Imported " << ImportedCount << " functions for Module " + << DestModule.getModuleIdentifier() << "\n"); + return ImportedCount; } /// Summary file to use for function importing when using -function-import from @@ -259,7 +276,10 @@ class FunctionImportPass : public ModulePass { } // Perform the import now. - FunctionImporter Importer(M.getContext(), *Index, diagnosticHandler); + ModuleLazyLoaderCache Loader(M.getContext()); + FunctionImporter Importer(*Index, diagnosticHandler, + [&](StringRef Name) + -> Module &{ return Loader(Name); }); return Importer.importFunctions(M); return false; diff --git a/lib/Transforms/IPO/PruneEH.cpp b/lib/Transforms/IPO/PruneEH.cpp index a0b740563693..714e1d6e42d2 100644 --- a/lib/Transforms/IPO/PruneEH.cpp +++ b/lib/Transforms/IPO/PruneEH.cpp @@ -21,7 +21,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/CallGraphSCCPass.h" -#include "llvm/Analysis/LibCallSemantics.h" +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index f72089e6c8ef..2bf6faa47b93 100644 --- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -1962,14 +1962,14 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, case ICmpInst::ICMP_EQ: if (LHS->getOperand(0) == RHS->getOperand(0)) { // if LHSCst and RHSCst differ only by one bit: - // (A == C1 || A == C2) -> (A & ~(C1 ^ C2)) == C1 + // (A == C1 || A == C2) -> (A | (C1 ^ C2)) == C2 assert(LHSCst->getValue().ule(LHSCst->getValue())); APInt Xor = LHSCst->getValue() ^ RHSCst->getValue(); if (Xor.isPowerOf2()) { - Value *NegCst = Builder->getInt(~Xor); - Value *And = Builder->CreateAnd(LHS->getOperand(0), NegCst); - return Builder->CreateICmp(ICmpInst::ICMP_EQ, And, LHSCst); + Value *Cst = Builder->getInt(Xor); + Value *Or = Builder->CreateOr(LHS->getOperand(0), Cst); + return Builder->CreateICmp(ICmpInst::ICMP_EQ, Or, RHSCst); } } diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp index 26088bbe018a..d2341c8c05db 100644 --- a/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -1942,20 +1942,27 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) { // Mark any parameters that are known to be non-null with the nonnull // attribute. 
This is helpful for inlining calls to functions with null // checks on their arguments. + SmallVector Indices; unsigned ArgNo = 0; + for (Value *V : CS.args()) { if (V->getType()->isPointerTy() && !CS.paramHasAttr(ArgNo+1, Attribute::NonNull) && - isKnownNonNullAt(V, CS.getInstruction(), DT, TLI)) { - AttributeSet AS = CS.getAttributes(); - AS = AS.addAttribute(CS.getInstruction()->getContext(), ArgNo+1, - Attribute::NonNull); - CS.setAttributes(AS); - Changed = true; - } + isKnownNonNullAt(V, CS.getInstruction(), DT, TLI)) + Indices.push_back(ArgNo + 1); ArgNo++; } + assert(ArgNo == CS.arg_size() && "sanity check"); + if (!Indices.empty()) { + AttributeSet AS = CS.getAttributes(); + LLVMContext &Ctx = CS.getInstruction()->getContext(); + AS = AS.addAttribute(Ctx, Indices, + Attribute::get(Ctx, Attribute::NonNull)); + CS.setAttributes(AS); + Changed = true; + } + // If the callee is a pointer to a function, attempt to move any casts to the // arguments of the call/invoke. Value *Callee = CS.getCalledValue(); diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp index e95a65510ecc..74c5148f9f89 100644 --- a/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -42,9 +42,9 @@ #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/InstructionSimplify.h" -#include "llvm/Analysis/LibCallSemantics.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/TargetLibraryInfo.h" diff --git a/lib/Transforms/Instrumentation/SafeStack.cpp b/lib/Transforms/Instrumentation/SafeStack.cpp index 6071ca5a8754..4441663fc6de 100644 --- a/lib/Transforms/Instrumentation/SafeStack.cpp +++ b/lib/Transforms/Instrumentation/SafeStack.cpp @@ -57,6 +57,7 @@ STATISTIC(NumUnsafeStackRestorePointsFunctions, STATISTIC(NumAllocas, "Total number of allocas"); STATISTIC(NumUnsafeStaticAllocas, "Number of unsafe static allocas"); STATISTIC(NumUnsafeDynamicAllocas, "Number of unsafe dynamic allocas"); +STATISTIC(NumUnsafeByValArguments, "Number of unsafe byval arguments"); STATISTIC(NumUnsafeStackRestorePoints, "Number of setjmps and landingpads"); } // namespace llvm @@ -68,14 +69,14 @@ namespace { /// /// The implementation simply replaces all mentions of the alloca with zero. class AllocaOffsetRewriter : public SCEVRewriteVisitor { - const AllocaInst *AI; + const Value *AllocaPtr; public: - AllocaOffsetRewriter(ScalarEvolution &SE, const AllocaInst *AI) - : SCEVRewriteVisitor(SE), AI(AI) {} + AllocaOffsetRewriter(ScalarEvolution &SE, const Value *AllocaPtr) + : SCEVRewriteVisitor(SE), AllocaPtr(AllocaPtr) {} const SCEV *visitUnknown(const SCEVUnknown *Expr) { - if (Expr->getValue() == AI) + if (Expr->getValue() == AllocaPtr) return SE.getZero(Expr->getType()); return Expr; } @@ -115,9 +116,14 @@ class SafeStack : public FunctionPass { /// given function and append them to the respective vectors. void findInsts(Function &F, SmallVectorImpl &StaticAllocas, SmallVectorImpl &DynamicAllocas, + SmallVectorImpl &ByValArguments, SmallVectorImpl &Returns, SmallVectorImpl &StackRestorePoints); + /// \brief Calculate the allocation size of a given alloca. Returns 0 if the + /// size can not be statically determined. 
+ uint64_t getStaticAllocaAllocationSize(const AllocaInst* AI); + /// \brief Allocate space for all static allocas in \p StaticAllocas, /// replace allocas with pointers into the unsafe stack and generate code to /// restore the stack pointer before all return instructions in \p Returns. @@ -126,6 +132,7 @@ class SafeStack : public FunctionPass { /// allocas are allocated. Value *moveStaticAllocasToUnsafeStack(IRBuilder<> &IRB, Function &F, ArrayRef StaticAllocas, + ArrayRef ByValArguments, ArrayRef Returns); /// \brief Generate code to restore the stack after all stack restore points @@ -145,11 +152,12 @@ class SafeStack : public FunctionPass { AllocaInst *DynamicTop, ArrayRef DynamicAllocas); - bool IsSafeStackAlloca(const AllocaInst *AI); + bool IsSafeStackAlloca(const Value *AllocaPtr, uint64_t AllocaSize); bool IsMemIntrinsicSafe(const MemIntrinsic *MI, const Use &U, - const AllocaInst *AI); - bool IsAccessSafe(Value *Addr, uint64_t Size, const AllocaInst *AI); + const Value *AllocaPtr, uint64_t AllocaSize); + bool IsAccessSafe(Value *Addr, uint64_t Size, const Value *AllocaPtr, + uint64_t AllocaSize); public: static char ID; // Pass identification, replacement for typeid. @@ -177,21 +185,34 @@ class SafeStack : public FunctionPass { bool runOnFunction(Function &F) override; }; // class SafeStack -bool SafeStack::IsAccessSafe(Value *Addr, uint64_t Size, const AllocaInst *AI) { - AllocaOffsetRewriter Rewriter(*SE, AI); +uint64_t SafeStack::getStaticAllocaAllocationSize(const AllocaInst* AI) { + uint64_t Size = DL->getTypeAllocSize(AI->getAllocatedType()); + if (AI->isArrayAllocation()) { + auto C = dyn_cast(AI->getArraySize()); + if (!C) + return 0; + Size *= C->getZExtValue(); + } + return Size; +} + +bool SafeStack::IsAccessSafe(Value *Addr, uint64_t AccessSize, + const Value *AllocaPtr, uint64_t AllocaSize) { + AllocaOffsetRewriter Rewriter(*SE, AllocaPtr); const SCEV *Expr = Rewriter.visit(SE->getSCEV(Addr)); uint64_t BitWidth = SE->getTypeSizeInBits(Expr->getType()); ConstantRange AccessStartRange = SE->getUnsignedRange(Expr); ConstantRange SizeRange = - ConstantRange(APInt(BitWidth, 0), APInt(BitWidth, Size)); + ConstantRange(APInt(BitWidth, 0), APInt(BitWidth, AccessSize)); ConstantRange AccessRange = AccessStartRange.add(SizeRange); - ConstantRange AllocaRange = ConstantRange( - APInt(BitWidth, 0), - APInt(BitWidth, DL->getTypeStoreSize(AI->getAllocatedType()))); + ConstantRange AllocaRange = + ConstantRange(APInt(BitWidth, 0), APInt(BitWidth, AllocaSize)); bool Safe = AllocaRange.contains(AccessRange); - DEBUG(dbgs() << "[SafeStack] Alloca " << *AI << "\n" + DEBUG(dbgs() << "[SafeStack] " + << (isa(AllocaPtr) ? "Alloca " : "ByValArgument ") + << *AllocaPtr << "\n" << " Access " << *Addr << "\n" << " SCEV " << *Expr << " U: " << SE->getUnsignedRange(Expr) @@ -204,36 +225,38 @@ bool SafeStack::IsAccessSafe(Value *Addr, uint64_t Size, const AllocaInst *AI) { } bool SafeStack::IsMemIntrinsicSafe(const MemIntrinsic *MI, const Use &U, - const AllocaInst *AI) { + const Value *AllocaPtr, + uint64_t AllocaSize) { // All MemIntrinsics have destination address in Arg0 and size in Arg2. if (MI->getRawDest() != U) return true; const auto *Len = dyn_cast(MI->getLength()); // Non-constant size => unsafe. FIXME: try SCEV getRange. 
if (!Len) return false; - return IsAccessSafe(U, Len->getZExtValue(), AI); + return IsAccessSafe(U, Len->getZExtValue(), AllocaPtr, AllocaSize); } -/// Check whether a given alloca instruction (AI) should be put on the safe +/// Check whether a given allocation must be put on the safe /// stack or not. The function analyzes all uses of AI and checks whether it is /// only accessed in a memory safe way (as decided statically). -bool SafeStack::IsSafeStackAlloca(const AllocaInst *AI) { +bool SafeStack::IsSafeStackAlloca(const Value *AllocaPtr, uint64_t AllocaSize) { // Go through all uses of this alloca and check whether all accesses to the // allocated object are statically known to be memory safe and, hence, the // object can be placed on the safe stack. SmallPtrSet Visited; - SmallVector WorkList; - WorkList.push_back(AI); + SmallVector WorkList; + WorkList.push_back(AllocaPtr); // A DFS search through all uses of the alloca in bitcasts/PHI/GEPs/etc. while (!WorkList.empty()) { - const Instruction *V = WorkList.pop_back_val(); + const Value *V = WorkList.pop_back_val(); for (const Use &UI : V->uses()) { auto I = cast(UI.getUser()); assert(V == UI.get()); switch (I->getOpcode()) { case Instruction::Load: { - if (!IsAccessSafe(UI, DL->getTypeStoreSize(I->getType()), AI)) + if (!IsAccessSafe(UI, DL->getTypeStoreSize(I->getType()), AllocaPtr, + AllocaSize)) return false; break; } @@ -243,13 +266,13 @@ bool SafeStack::IsSafeStackAlloca(const AllocaInst *AI) { case Instruction::Store: { if (V == I->getOperand(0)) { // Stored the pointer - conservatively assume it may be unsafe. - DEBUG(dbgs() << "[SafeStack] Unsafe alloca: " << *AI + DEBUG(dbgs() << "[SafeStack] Unsafe alloca: " << *AllocaPtr << "\n store of address: " << *I << "\n"); return false; } - if (!IsAccessSafe( - UI, DL->getTypeStoreSize(I->getOperand(0)->getType()), AI)) + if (!IsAccessSafe(UI, DL->getTypeStoreSize(I->getOperand(0)->getType()), + AllocaPtr, AllocaSize)) return false; break; } @@ -269,8 +292,8 @@ bool SafeStack::IsSafeStackAlloca(const AllocaInst *AI) { } if (const MemIntrinsic *MI = dyn_cast(I)) { - if (!IsMemIntrinsicSafe(MI, UI, AI)) { - DEBUG(dbgs() << "[SafeStack] Unsafe alloca: " << *AI + if (!IsMemIntrinsicSafe(MI, UI, AllocaPtr, AllocaSize)) { + DEBUG(dbgs() << "[SafeStack] Unsafe alloca: " << *AllocaPtr << "\n unsafe memintrinsic: " << *I << "\n"); return false; @@ -288,9 +311,9 @@ bool SafeStack::IsSafeStackAlloca(const AllocaInst *AI) { ImmutableCallSite::arg_iterator B = CS.arg_begin(), E = CS.arg_end(); for (ImmutableCallSite::arg_iterator A = B; A != E; ++A) if (A->get() == V) - if (!(CS.doesNotCapture(A - B) && - (CS.doesNotAccessMemory(A - B) || CS.doesNotAccessMemory()))) { - DEBUG(dbgs() << "[SafeStack] Unsafe alloca: " << *AI + if (!(CS.doesNotCapture(A - B) && (CS.doesNotAccessMemory(A - B) || + CS.doesNotAccessMemory()))) { + DEBUG(dbgs() << "[SafeStack] Unsafe alloca: " << *AllocaPtr << "\n unsafe call: " << *I << "\n"); return false; } @@ -341,13 +364,15 @@ Value *SafeStack::getOrCreateUnsafeStackPtr(IRBuilder<> &IRB, Function &F) { void SafeStack::findInsts(Function &F, SmallVectorImpl &StaticAllocas, SmallVectorImpl &DynamicAllocas, + SmallVectorImpl &ByValArguments, SmallVectorImpl &Returns, SmallVectorImpl &StackRestorePoints) { for (Instruction &I : instructions(&F)) { if (auto AI = dyn_cast(&I)) { ++NumAllocas; - if (IsSafeStackAlloca(AI)) + uint64_t Size = getStaticAllocaAllocationSize(AI); + if (IsSafeStackAlloca(AI, Size)) continue; if (AI->isStaticAlloca()) { @@ -372,6 +397,17 @@ void 
SafeStack::findInsts(Function &F, "gcroot intrinsic not compatible with safestack attribute"); } } + for (Argument &Arg : F.args()) { + if (!Arg.hasByValAttr()) + continue; + uint64_t Size = + DL->getTypeStoreSize(Arg.getType()->getPointerElementType()); + if (IsSafeStackAlloca(&Arg, Size)) + continue; + + ++NumUnsafeByValArguments; + ByValArguments.push_back(&Arg); + } } AllocaInst * @@ -406,7 +442,7 @@ SafeStack::createStackRestorePoints(IRBuilder<> &IRB, Function &F, for (Instruction *I : StackRestorePoints) { ++NumUnsafeStackRestorePoints; - IRB.SetInsertPoint(cast(I->getNextNode())); + IRB.SetInsertPoint(I->getNextNode()); Value *CurrentTop = DynamicTop ? IRB.CreateLoad(DynamicTop) : StaticTop; IRB.CreateStore(CurrentTop, UnsafeStackPtr); } @@ -414,11 +450,10 @@ SafeStack::createStackRestorePoints(IRBuilder<> &IRB, Function &F, return DynamicTop; } -Value * -SafeStack::moveStaticAllocasToUnsafeStack(IRBuilder<> &IRB, Function &F, - ArrayRef StaticAllocas, - ArrayRef Returns) { - if (StaticAllocas.empty()) +Value *SafeStack::moveStaticAllocasToUnsafeStack( + IRBuilder<> &IRB, Function &F, ArrayRef StaticAllocas, + ArrayRef ByValArguments, ArrayRef Returns) { + if (StaticAllocas.empty() && ByValArguments.empty()) return nullptr; DIBuilder DIB(*F.getParent()); @@ -440,6 +475,13 @@ SafeStack::moveStaticAllocasToUnsafeStack(IRBuilder<> &IRB, Function &F, // Compute maximum alignment among static objects on the unsafe stack. unsigned MaxAlignment = 0; + for (Argument *Arg : ByValArguments) { + Type *Ty = Arg->getType()->getPointerElementType(); + unsigned Align = std::max((unsigned)DL->getPrefTypeAlignment(Ty), + Arg->getParamAlignment()); + if (Align > MaxAlignment) + MaxAlignment = Align; + } for (AllocaInst *AI : StaticAllocas) { Type *Ty = AI->getAllocatedType(); unsigned Align = @@ -451,22 +493,51 @@ SafeStack::moveStaticAllocasToUnsafeStack(IRBuilder<> &IRB, Function &F, if (MaxAlignment > StackAlignment) { // Re-align the base pointer according to the max requested alignment. assert(isPowerOf2_32(MaxAlignment)); - IRB.SetInsertPoint(cast(BasePointer->getNextNode())); + IRB.SetInsertPoint(BasePointer->getNextNode()); BasePointer = cast(IRB.CreateIntToPtr( IRB.CreateAnd(IRB.CreatePtrToInt(BasePointer, IntPtrTy), ConstantInt::get(IntPtrTy, ~uint64_t(MaxAlignment - 1))), StackPtrTy)); } - // Allocate space for every unsafe static AllocaInst on the unsafe stack. int64_t StaticOffset = 0; // Current stack top. + IRB.SetInsertPoint(BasePointer->getNextNode()); + + for (Argument *Arg : ByValArguments) { + Type *Ty = Arg->getType()->getPointerElementType(); + + uint64_t Size = DL->getTypeStoreSize(Ty); + if (Size == 0) + Size = 1; // Don't create zero-sized stack objects. + + // Ensure the object is properly aligned. + unsigned Align = std::max((unsigned)DL->getPrefTypeAlignment(Ty), + Arg->getParamAlignment()); + + // Add alignment. + // NOTE: we ensure that BasePointer itself is aligned to >= Align. + StaticOffset += Size; + StaticOffset = RoundUpToAlignment(StaticOffset, Align); + + Value *Off = IRB.CreateGEP(BasePointer, // BasePointer is i8* + ConstantInt::get(Int32Ty, -StaticOffset)); + Value *NewArg = IRB.CreateBitCast(Off, Arg->getType(), + Arg->getName() + ".unsafe-byval"); + + // Replace alloc with the new location. 
+ replaceDbgDeclare(Arg, BasePointer, BasePointer->getNextNode(), DIB, + /*Deref=*/true, -StaticOffset); + Arg->replaceAllUsesWith(NewArg); + IRB.SetInsertPoint(cast(NewArg)->getNextNode()); + IRB.CreateMemCpy(Off, Arg, Size, Arg->getParamAlignment()); + } + + // Allocate space for every unsafe static AllocaInst on the unsafe stack. for (AllocaInst *AI : StaticAllocas) { IRB.SetInsertPoint(AI); - auto CArraySize = cast(AI->getArraySize()); Type *Ty = AI->getAllocatedType(); - - uint64_t Size = DL->getTypeAllocSize(Ty) * CArraySize->getZExtValue(); + uint64_t Size = getStaticAllocaAllocationSize(AI); if (Size == 0) Size = 1; // Don't create zero-sized stack objects. @@ -497,7 +568,7 @@ SafeStack::moveStaticAllocasToUnsafeStack(IRBuilder<> &IRB, Function &F, StaticOffset = RoundUpToAlignment(StaticOffset, StackAlignment); // Update shadow stack pointer in the function epilogue. - IRB.SetInsertPoint(cast(BasePointer->getNextNode())); + IRB.SetInsertPoint(BasePointer->getNextNode()); Value *StaticTop = IRB.CreateGEP(BasePointer, ConstantInt::get(Int32Ty, -StaticOffset), @@ -609,6 +680,7 @@ bool SafeStack::runOnFunction(Function &F) { SmallVector StaticAllocas; SmallVector DynamicAllocas; + SmallVector ByValArguments; SmallVector Returns; // Collect all points where stack gets unwound and needs to be restored @@ -620,13 +692,15 @@ bool SafeStack::runOnFunction(Function &F) { // Find all static and dynamic alloca instructions that must be moved to the // unsafe stack, all return instructions and stack restore points. - findInsts(F, StaticAllocas, DynamicAllocas, Returns, StackRestorePoints); + findInsts(F, StaticAllocas, DynamicAllocas, ByValArguments, Returns, + StackRestorePoints); if (StaticAllocas.empty() && DynamicAllocas.empty() && - StackRestorePoints.empty()) + ByValArguments.empty() && StackRestorePoints.empty()) return false; // Nothing to do in this function. - if (!StaticAllocas.empty() || !DynamicAllocas.empty()) + if (!StaticAllocas.empty() || !DynamicAllocas.empty() || + !ByValArguments.empty()) ++NumUnsafeStackFunctions; // This function has the unsafe stack. if (!StackRestorePoints.empty()) @@ -636,7 +710,8 @@ bool SafeStack::runOnFunction(Function &F) { UnsafeStackPtr = getOrCreateUnsafeStackPtr(IRB, F); // The top of the unsafe stack after all unsafe static allocas are allocated. - Value *StaticTop = moveStaticAllocasToUnsafeStack(IRB, F, StaticAllocas, Returns); + Value *StaticTop = moveStaticAllocasToUnsafeStack(IRB, F, StaticAllocas, + ByValArguments, Returns); // Safe stack object that stores the current unsafe stack top. It is updated // as unsafe dynamic (non-constant-sized) allocas are allocated and freed. 
diff --git a/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/lib/Transforms/Instrumentation/SanitizerCoverage.cpp index 3430ee01035e..cbdcc5e6cfe1 100644 --- a/lib/Transforms/Instrumentation/SanitizerCoverage.cpp +++ b/lib/Transforms/Instrumentation/SanitizerCoverage.cpp @@ -31,7 +31,7 @@ #include "llvm/Transforms/Instrumentation.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/Analysis/LibCallSemantics.h" +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugInfo.h" @@ -156,7 +156,9 @@ class SanitizerCoverageModule : public ModulePass { void SetNoSanitizeMetadata(Instruction *I); void InjectCoverageAtBlock(Function &F, BasicBlock &BB, bool UseCalls); unsigned NumberOfInstrumentedBlocks() { - return SanCovFunction->getNumUses() + SanCovWithCheckFunction->getNumUses(); + return SanCovFunction->getNumUses() + + SanCovWithCheckFunction->getNumUses() + SanCovTraceBB->getNumUses() + + SanCovTraceEnter->getNumUses(); } Function *SanCovFunction; Function *SanCovWithCheckFunction; @@ -211,12 +213,10 @@ bool SanitizerCoverageModule::runOnModule(Module &M) { StringRef(""), StringRef(""), /*hasSideEffects=*/true); - if (Options.TraceBB) { - SanCovTraceEnter = checkSanitizerInterfaceFunction( - M.getOrInsertFunction(kSanCovTraceEnter, VoidTy, Int32PtrTy, nullptr)); - SanCovTraceBB = checkSanitizerInterfaceFunction( - M.getOrInsertFunction(kSanCovTraceBB, VoidTy, Int32PtrTy, nullptr)); - } + SanCovTraceEnter = checkSanitizerInterfaceFunction( + M.getOrInsertFunction(kSanCovTraceEnter, VoidTy, Int32PtrTy, nullptr)); + SanCovTraceBB = checkSanitizerInterfaceFunction( + M.getOrInsertFunction(kSanCovTraceBB, VoidTy, Int32PtrTy, nullptr)); // At this point we create a dummy array of guards because we don't // know how many elements we will need. @@ -466,7 +466,9 @@ void SanitizerCoverageModule::InjectCoverageAtBlock(Function &F, BasicBlock &BB, ConstantInt::get(IntptrTy, (1 + NumberOfInstrumentedBlocks()) * 4)); Type *Int32PtrTy = PointerType::getUnqual(IRB.getInt32Ty()); GuardP = IRB.CreateIntToPtr(GuardP, Int32PtrTy); - if (UseCalls) { + if (Options.TraceBB) { + IRB.CreateCall(IsEntryBB ? SanCovTraceEnter : SanCovTraceBB, GuardP); + } else if (UseCalls) { IRB.CreateCall(SanCovWithCheckFunction, GuardP); } else { LoadInst *Load = IRB.CreateLoad(GuardP); @@ -495,13 +497,6 @@ void SanitizerCoverageModule::InjectCoverageAtBlock(Function &F, BasicBlock &BB, SetNoSanitizeMetadata(LI); SetNoSanitizeMetadata(SI); } - - if (Options.TraceBB) { - // Experimental support for tracing. - // Insert a callback with the same guard variable as used for coverage. - IRB.SetInsertPoint(&*IP); - IRB.CreateCall(IsEntryBB ? SanCovTraceEnter : SanCovTraceBB, GuardP); - } } char SanitizerCoverageModule::ID = 0; diff --git a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp index d7e02b16a28e..686bd4071104 100644 --- a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp +++ b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @@ -307,27 +307,31 @@ bool CorrelatedValuePropagation::processSwitch(SwitchInst *SI) { /// processCallSite - Infer nonnull attributes for the arguments at the /// specified callsite. 
bool CorrelatedValuePropagation::processCallSite(CallSite CS) { - bool Changed = false; - + SmallVector Indices; unsigned ArgNo = 0; + for (Value *V : CS.args()) { PointerType *Type = dyn_cast(V->getType()); if (Type && !CS.paramHasAttr(ArgNo + 1, Attribute::NonNull) && LVI->getPredicateAt(ICmpInst::ICMP_EQ, V, ConstantPointerNull::get(Type), - CS.getInstruction()) == LazyValueInfo::False) { - AttributeSet AS = CS.getAttributes(); - AS = AS.addAttribute(CS.getInstruction()->getContext(), ArgNo + 1, - Attribute::NonNull); - CS.setAttributes(AS); - Changed = true; - } + CS.getInstruction()) == LazyValueInfo::False) + Indices.push_back(ArgNo + 1); ArgNo++; } + assert(ArgNo == CS.arg_size() && "sanity check"); - return Changed; + if (Indices.empty()) + return false; + + AttributeSet AS = CS.getAttributes(); + LLVMContext &Ctx = CS.getInstruction()->getContext(); + AS = AS.addAttribute(Ctx, Indices, Attribute::get(Ctx, Attribute::NonNull)); + CS.setAttributes(AS); + + return true; } Constant *CorrelatedValuePropagation::getConstantAt(Value *V, Instruction *At) { diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index a3658ed64976..9dc41ba2f328 100644 --- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -257,6 +257,10 @@ static unsigned getStoreStride(const SCEVAddRecExpr *StoreEv) { } bool LoopIdiomRecognize::isLegalStore(StoreInst *SI) { + // Don't touch volatile stores. + if (!SI->isSimple()) + return false; + Value *StoredVal = SI->getValueOperand(); Value *StorePtr = SI->getPointerOperand(); @@ -287,10 +291,6 @@ void LoopIdiomRecognize::collectStores(BasicBlock *BB) { if (!SI) continue; - // Don't touch volatile stores. - if (!SI->isSimple()) - continue; - // Make sure this is a strided store with a constant stride. if (!isLegalStore(SI)) continue; diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp index 0bd5fa9f8777..ba79b32ac3d5 100644 --- a/lib/Transforms/Utils/Local.cpp +++ b/lib/Transforms/Utils/Local.cpp @@ -20,8 +20,8 @@ #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/Analysis/InstructionSimplify.h" -#include "llvm/Analysis/LibCallSemantics.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/CFG.h" @@ -1136,9 +1136,10 @@ DbgDeclareInst *llvm::FindAllocaDbgDeclare(Value *V) { return nullptr; } -bool llvm::replaceDbgDeclareForAlloca(AllocaInst *AI, Value *NewAllocaAddress, - DIBuilder &Builder, bool Deref, int Offset) { - DbgDeclareInst *DDI = FindAllocaDbgDeclare(AI); +bool llvm::replaceDbgDeclare(Value *Address, Value *NewAddress, + Instruction *InsertBefore, DIBuilder &Builder, + bool Deref, int Offset) { + DbgDeclareInst *DDI = FindAllocaDbgDeclare(Address); if (!DDI) return false; DebugLoc Loc = DDI->getDebugLoc(); @@ -1168,12 +1169,17 @@ bool llvm::replaceDbgDeclareForAlloca(AllocaInst *AI, Value *NewAllocaAddress, // Insert llvm.dbg.declare immediately after the original alloca, and remove // old llvm.dbg.declare. 
- Builder.insertDeclare(NewAllocaAddress, DIVar, DIExpr, Loc, - AI->getNextNode()); + Builder.insertDeclare(NewAddress, DIVar, DIExpr, Loc, InsertBefore); DDI->eraseFromParent(); return true; } +bool llvm::replaceDbgDeclareForAlloca(AllocaInst *AI, Value *NewAllocaAddress, + DIBuilder &Builder, bool Deref, int Offset) { + return replaceDbgDeclare(AI, NewAllocaAddress, AI->getNextNode(), Builder, + Deref, Offset); +} + /// changeToUnreachable - Insert an unreachable instruction before the specified /// instruction, making it and the rest of the code in the block dead. static void changeToUnreachable(Instruction *I, bool UseLLVMTrap) { diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp index 47e587fab7b6..83afb1a65ac0 100644 --- a/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -1322,6 +1322,15 @@ Value *LibCallSimplifier::optimizeLog(CallInst *CI, IRBuilder<> &B) { return B.CreateFMul(OpC->getArgOperand(1), EmitUnaryFloatFnCall(OpC->getOperand(0), Callee->getName(), B, Callee->getAttributes()), "mul"); + + // log(exp2(y)) -> y*log(2) + if (F && Name == "log" && TLI->getLibFunc(F->getName(), Func) && + TLI->has(Func) && Func == LibFunc::exp2) + return B.CreateFMul( + OpC->getArgOperand(0), + EmitUnaryFloatFnCall(ConstantFP::get(CI->getType(), 2.0), + Callee->getName(), B, Callee->getAttributes()), + "logmul"); return Ret; } @@ -2301,7 +2310,6 @@ void LibCallSimplifier::replaceAllUsesWith(Instruction *I, Value *With) { // log, logf, logl: // * log(exp(x)) -> x // * log(exp(y)) -> y*log(e) -// * log(exp2(y)) -> y*log(2) // * log(exp10(y)) -> y*log(10) // * log(sqrt(x)) -> 0.5*log(x) // diff --git a/lib/Transforms/Utils/ValueMapper.cpp b/lib/Transforms/Utils/ValueMapper.cpp index 0a63c1d5153c..00a8984845dd 100644 --- a/lib/Transforms/Utils/ValueMapper.cpp +++ b/lib/Transforms/Utils/ValueMapper.cpp @@ -41,9 +41,9 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags, if (Value *NewV = Materializer->materializeDeclFor(const_cast(V))) { VM[V] = NewV; - if (auto *GV = dyn_cast(V)) - Materializer->materializeInitFor(cast(NewV), - const_cast(GV)); + if (auto *NewGV = dyn_cast(NewV)) + Materializer->materializeInitFor( + NewGV, const_cast(cast(V))); return NewV; } } diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index c956a55a1009..c5b8b5b073d6 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -5188,7 +5188,7 @@ LoopVectorizationCostModel::calculateRegisterUsage( continue; } - // Count the number of live interals. + // Count the number of live intervals.
unsigned RegUsage = 0; for (auto Inst : OpenIntervals) RegUsage += GetRegUsage(Inst->getType(), VFs[j]); diff --git a/test/Analysis/CostModel/AMDGPU/extractelement.ll b/test/Analysis/CostModel/AMDGPU/extractelement.ll new file mode 100644 index 000000000000..c328d7686466 --- /dev/null +++ b/test/Analysis/CostModel/AMDGPU/extractelement.ll @@ -0,0 +1,110 @@ +; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck %s + +; CHECK: 'extractelement_v2i32' +; CHECK: estimated cost of 0 for {{.*}} extractelement <2 x i32> +define void @extractelement_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr) { + %vec = load <2 x i32>, <2 x i32> addrspace(1)* %vaddr + %elt = extractelement <2 x i32> %vec, i32 1 + store i32 %elt, i32 addrspace(1)* %out + ret void +} + +; CHECK: 'extractelement_v2f32' +; CHECK: estimated cost of 0 for {{.*}} extractelement <2 x float> +define void @extractelement_v2f32(float addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) { + %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr + %elt = extractelement <2 x float> %vec, i32 1 + store float %elt, float addrspace(1)* %out + ret void +} + +; CHECK: 'extractelement_v3i32' +; CHECK: estimated cost of 0 for {{.*}} extractelement <3 x i32> +define void @extractelement_v3i32(i32 addrspace(1)* %out, <3 x i32> addrspace(1)* %vaddr) { + %vec = load <3 x i32>, <3 x i32> addrspace(1)* %vaddr + %elt = extractelement <3 x i32> %vec, i32 1 + store i32 %elt, i32 addrspace(1)* %out + ret void +} + +; CHECK: 'extractelement_v4i32' +; CHECK: estimated cost of 0 for {{.*}} extractelement <4 x i32> +define void @extractelement_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %vaddr) { + %vec = load <4 x i32>, <4 x i32> addrspace(1)* %vaddr + %elt = extractelement <4 x i32> %vec, i32 1 + store i32 %elt, i32 addrspace(1)* %out + ret void +} + +; CHECK: 'extractelement_v8i32' +; CHECK: estimated cost of 0 for {{.*}} extractelement <8 x i32> +define void @extractelement_v8i32(i32 addrspace(1)* %out, <8 x i32> addrspace(1)* %vaddr) { + %vec = load <8 x i32>, <8 x i32> addrspace(1)* %vaddr + %elt = extractelement <8 x i32> %vec, i32 1 + store i32 %elt, i32 addrspace(1)* %out + ret void +} + +; FIXME: Should be non-0 +; CHECK: 'extractelement_v8i32_dynindex' +; CHECK: estimated cost of 2 for {{.*}} extractelement <8 x i32> +define void @extractelement_v8i32_dynindex(i32 addrspace(1)* %out, <8 x i32> addrspace(1)* %vaddr, i32 %idx) { + %vec = load <8 x i32>, <8 x i32> addrspace(1)* %vaddr + %elt = extractelement <8 x i32> %vec, i32 %idx + store i32 %elt, i32 addrspace(1)* %out + ret void +} + +; CHECK: 'extractelement_v2i64' +; CHECK: estimated cost of 0 for {{.*}} extractelement <2 x i64> +define void @extractelement_v2i64(i64 addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr) { + %vec = load <2 x i64>, <2 x i64> addrspace(1)* %vaddr + %elt = extractelement <2 x i64> %vec, i64 1 + store i64 %elt, i64 addrspace(1)* %out + ret void +} + +; CHECK: 'extractelement_v3i64' +; CHECK: estimated cost of 0 for {{.*}} extractelement <3 x i64> +define void @extractelement_v3i64(i64 addrspace(1)* %out, <3 x i64> addrspace(1)* %vaddr) { + %vec = load <3 x i64>, <3 x i64> addrspace(1)* %vaddr + %elt = extractelement <3 x i64> %vec, i64 1 + store i64 %elt, i64 addrspace(1)* %out + ret void +} + +; CHECK: 'extractelement_v4i64' +; CHECK: estimated cost of 0 for {{.*}} extractelement <4 x i64> +define void @extractelement_v4i64(i64 addrspace(1)* %out, <4 x i64> addrspace(1)* %vaddr) { + %vec = load <4 x i64>, <4 x i64> 
addrspace(1)* %vaddr + %elt = extractelement <4 x i64> %vec, i64 1 + store i64 %elt, i64 addrspace(1)* %out + ret void +} + +; CHECK: 'extractelement_v8i64' +; CHECK: estimated cost of 0 for {{.*}} extractelement <8 x i64> +define void @extractelement_v8i64(i64 addrspace(1)* %out, <8 x i64> addrspace(1)* %vaddr) { + %vec = load <8 x i64>, <8 x i64> addrspace(1)* %vaddr + %elt = extractelement <8 x i64> %vec, i64 1 + store i64 %elt, i64 addrspace(1)* %out + ret void +} + +; CHECK: 'extractelement_v4i8' +; CHECK: estimated cost of 0 for {{.*}} extractelement <4 x i8> +define void @extractelement_v4i8(i8 addrspace(1)* %out, <4 x i8> addrspace(1)* %vaddr) { + %vec = load <4 x i8>, <4 x i8> addrspace(1)* %vaddr + %elt = extractelement <4 x i8> %vec, i8 1 + store i8 %elt, i8 addrspace(1)* %out + ret void +} + +; CHECK: 'extractelement_v2i16' +; CHECK: estimated cost of 0 for {{.*}} extractelement <2 x i16> +define void @extractelement_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) { + %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr + %elt = extractelement <2 x i16> %vec, i16 1 + store i16 %elt, i16 addrspace(1)* %out + ret void +} diff --git a/test/Analysis/CostModel/AMDGPU/lit.local.cfg b/test/Analysis/CostModel/AMDGPU/lit.local.cfg new file mode 100644 index 000000000000..2a665f06be72 --- /dev/null +++ b/test/Analysis/CostModel/AMDGPU/lit.local.cfg @@ -0,0 +1,2 @@ +if not 'AMDGPU' in config.root.targets: + config.unsupported = True diff --git a/test/Analysis/CostModel/X86/cast.ll b/test/Analysis/CostModel/X86/cast.ll index fb16af635f07..c518587c0e1a 100644 --- a/test/Analysis/CostModel/X86/cast.ll +++ b/test/Analysis/CostModel/X86/cast.ll @@ -84,11 +84,11 @@ define i32 @zext_sext(<8 x i1> %in) { ;CHECK-AVX: cost of 4 {{.*}} zext %D = zext <4 x i32> undef to <4 x i64> - ;CHECK-AVX512: cost of 3 {{.*}} %D1 = zext - %D1 = zext <16 x i32> undef to <16 x i64> + ;CHECK-AVX512: cost of 1 {{.*}} %D1 = zext + %D1 = zext <8 x i32> undef to <8 x i64> - ;CHECK-AVX512: cost of 3 {{.*}} %D2 = sext - %D2 = sext <16 x i32> undef to <16 x i64> + ;CHECK-AVX512: cost of 1 {{.*}} %D2 = sext + %D2 = sext <8 x i32> undef to <8 x i64> ;CHECK-AVX512: cost of 1 {{.*}} %D3 = zext %D3 = zext <16 x i16> undef to <16 x i32> @@ -118,9 +118,11 @@ define i32 @zext_sext(<8 x i1> %in) { ;CHECK_AVX512: cost of 1 {{.*}} G = trunc %G = trunc <8 x i64> undef to <8 x i32> - ;CHECK-AVX512: cost of 4 {{.*}} %G1 = trunc - %G1 = trunc <16 x i64> undef to <16 x i32> + ;CHECK-AVX512: cost of 1 {{.*}} %G1 = trunc + %G1 = trunc <16 x i32> undef to <16 x i16> + ;CHECK-AVX512: cost of 1 {{.*}} %G2 = trunc + %G2 = trunc <16 x i32> undef to <16 x i8> ret i32 undef } @@ -207,38 +209,40 @@ define void @uitofp4(<4 x i1> %a, <4 x i8> %b, <4 x i16> %c, <4 x i32> %d) { ; CHECK: cost of 2 {{.*}} uitofp %C2 = uitofp <4 x i16> %c to <4 x double> - ; CHECK: cost of 6 {{.*}} uitofp + ; CHECK-AVX2: cost of 6 {{.*}} uitofp %D1 = uitofp <4 x i32> %d to <4 x float> - ; CHECK: cost of 6 {{.*}} uitofp + ; CHECK-AVX2: cost of 6 {{.*}} uitofp %D2 = uitofp <4 x i32> %d to <4 x double> ret void } define void @uitofp8(<8 x i1> %a, <8 x i8> %b, <8 x i16> %c, <8 x i32> %d) { ; CHECK-LABEL: for function 'uitofp8' - ; CHECK: cost of 6 {{.*}} uitofp + ; CHECK-AVX2: cost of 6 {{.*}} uitofp %A1 = uitofp <8 x i1> %a to <8 x float> - ; CHECK: cost of 5 {{.*}} uitofp + ; CHECK-AVX2: cost of 5 {{.*}} uitofp + ; CHECK-AVX512: cost of 2 {{.*}} uitofp %B1 = uitofp <8 x i8> %b to <8 x float> - ; CHECK: cost of 5 {{.*}} uitofp + ; CHECK-AVX2: cost of 5 {{.*}} 
uitofp + ; CHECK-AVX512: cost of 2 {{.*}} uitofp %C1 = uitofp <8 x i16> %c to <8 x float> ; CHECK-AVX2: cost of 8 {{.*}} uitofp - ; CHECK-AVX512: cost of 8 {{.*}} uitofp + ; CHECK-AVX512: cost of 1 {{.*}} uitofp ; CHECK-AVX: cost of 9 {{.*}} uitofp %D1 = uitofp <8 x i32> %d to <8 x float> ret void } -define void @fp_conv(<8 x float> %a, <16 x float>%b) { +define void @fp_conv(<8 x float> %a, <16 x float>%b, <4 x float> %c) { ;CHECK-LABEL: for function 'fp_conv' ; CHECK-AVX512: cost of 1 {{.*}} fpext %A1 = fpext <8 x float> %a to <8 x double> - ; CHECK-AVX512: cost of 3 {{.*}} fpext - %A2 = fpext <16 x float> %b to <16 x double> + ; CHECK-AVX512: cost of 1 {{.*}} fpext + %A2 = fpext <4 x float> %c to <4 x double> ; CHECK-AVX2: cost of 3 {{.*}} %A3 = fpext ; CHECK-AVX512: cost of 1 {{.*}} %A3 = fpext @@ -248,7 +252,7 @@ define void @fp_conv(<8 x float> %a, <16 x float>%b) { ; CHECK-AVX512: cost of 1 {{.*}} %A4 = fptrunc %A4 = fptrunc <8 x double> undef to <8 x float> - ; CHECK-AVX512: cost of 3 {{.*}} %A5 = fptrunc - %A5 = fptrunc <16 x double> undef to <16 x float> + ; CHECK-AVX512: cost of 1 {{.*}} %A5 = fptrunc + %A5 = fptrunc <4 x double> undef to <4 x float> ret void } diff --git a/test/Analysis/CostModel/X86/sitofp.ll b/test/Analysis/CostModel/X86/sitofp.ll index dcd0088d0df7..9913a4896912 100644 --- a/test/Analysis/CostModel/X86/sitofp.ll +++ b/test/Analysis/CostModel/X86/sitofp.ll @@ -4,656 +4,656 @@ ; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+avx512f -cost-model -analyze < %s | FileCheck --check-prefix=AVX512F %s define <2 x double> @sitofpv2i8v2double(<2 x i8> %a) { - ; SSE2: sitofpv2i8v2double + ; SSE2-LABEL: sitofpv2i8v2double ; SSE2: cost of 20 {{.*}} sitofp ; - ; AVX1: sitofpv2i8v2double + ; AVX1-LABEL: sitofpv2i8v2double ; AVX1: cost of 4 {{.*}} sitofp ; - ; AVX2: sitofpv2i8v2double + ; AVX2-LABEL: sitofpv2i8v2double ; AVX2: cost of 4 {{.*}} sitofp ; - ; AVX512F: sitofpv2i8v2double + ; AVX512F-LABEL: sitofpv2i8v2double ; AVX512F: cost of 4 {{.*}} sitofp %1 = sitofp <2 x i8> %a to <2 x double> ret <2 x double> %1 } define <4 x double> @sitofpv4i8v4double(<4 x i8> %a) { - ; SSE2: sitofpv4i8v4double + ; SSE2-LABEL: sitofpv4i8v4double ; SSE2: cost of 40 {{.*}} sitofp ; - ; AVX1: sitofpv4i8v4double + ; AVX1-LABEL: sitofpv4i8v4double ; AVX1: cost of 3 {{.*}} sitofp ; - ; AVX2: sitofpv4i8v4double + ; AVX2-LABEL: sitofpv4i8v4double ; AVX2: cost of 3 {{.*}} sitofp ; - ; AVX512F: sitofpv4i8v4double + ; AVX512F-LABEL: sitofpv4i8v4double ; AVX512F: cost of 3 {{.*}} sitofp %1 = sitofp <4 x i8> %a to <4 x double> ret <4 x double> %1 } define <8 x double> @sitofpv8i8v8double(<8 x i8> %a) { - ; SSE2: sitofpv8i8v8double + ; SSE2-LABEL: sitofpv8i8v8double ; SSE2: cost of 80 {{.*}} sitofp ; - ; AVX1: sitofpv8i8v8double + ; AVX1-LABEL: sitofpv8i8v8double ; AVX1: cost of 20 {{.*}} sitofp ; - ; AVX2: sitofpv8i8v8double + ; AVX2-LABEL: sitofpv8i8v8double ; AVX2: cost of 20 {{.*}} sitofp ; - ; AVX512F: sitofpv8i8v8double + ; AVX512F-LABEL: sitofpv8i8v8double ; AVX512F: cost of 2 {{.*}} sitofp %1 = sitofp <8 x i8> %a to <8 x double> ret <8 x double> %1 } define <16 x double> @sitofpv16i8v16double(<16 x i8> %a) { - ; SSE2: sitofpv16i8v16double + ; SSE2-LABEL: sitofpv16i8v16double ; SSE2: cost of 160 {{.*}} sitofp ; - ; AVX1: sitofpv16i8v16double + ; AVX1-LABEL: sitofpv16i8v16double ; AVX1: cost of 40 {{.*}} sitofp ; - ; AVX2: sitofpv16i8v16double + ; AVX2-LABEL: sitofpv16i8v16double ; AVX2: cost of 40 {{.*}} sitofp ; - ; AVX512F: sitofpv16i8v16double + ; AVX512F-LABEL: sitofpv16i8v16double ; 
AVX512F: cost of 44 {{.*}} sitofp %1 = sitofp <16 x i8> %a to <16 x double> ret <16 x double> %1 } define <32 x double> @sitofpv32i8v32double(<32 x i8> %a) { - ; SSE2: sitofpv32i8v32double + ; SSE2-LABEL: sitofpv32i8v32double ; SSE2: cost of 320 {{.*}} sitofp ; - ; AVX1: sitofpv32i8v32double + ; AVX1-LABEL: sitofpv32i8v32double ; AVX1: cost of 80 {{.*}} sitofp ; - ; AVX2: sitofpv32i8v32double + ; AVX2-LABEL: sitofpv32i8v32double ; AVX2: cost of 80 {{.*}} sitofp ; - ; AVX512F: sitofpv32i8v32double + ; AVX512F-LABEL: sitofpv32i8v32double ; AVX512F: cost of 88 {{.*}} sitofp %1 = sitofp <32 x i8> %a to <32 x double> ret <32 x double> %1 } define <2 x double> @sitofpv2i16v2double(<2 x i16> %a) { - ; SSE2: sitofpv2i16v2double + ; SSE2-LABEL: sitofpv2i16v2double ; SSE2: cost of 20 {{.*}} sitofp ; - ; AVX1: sitofpv2i16v2double + ; AVX1-LABEL: sitofpv2i16v2double ; AVX1: cost of 4 {{.*}} sitofp ; - ; AVX2: sitofpv2i16v2double + ; AVX2-LABEL: sitofpv2i16v2double ; AVX2: cost of 4 {{.*}} sitofp ; - ; AVX512F: sitofpv2i16v2double + ; AVX512F-LABEL: sitofpv2i16v2double ; AVX512F: cost of 4 {{.*}} sitofp %1 = sitofp <2 x i16> %a to <2 x double> ret <2 x double> %1 } define <4 x double> @sitofpv4i16v4double(<4 x i16> %a) { - ; SSE2: sitofpv4i16v4double + ; SSE2-LABEL: sitofpv4i16v4double ; SSE2: cost of 40 {{.*}} sitofp ; - ; AVX1: sitofpv4i16v4double + ; AVX1-LABEL: sitofpv4i16v4double ; AVX1: cost of 3 {{.*}} sitofp ; - ; AVX2: sitofpv4i16v4double + ; AVX2-LABEL: sitofpv4i16v4double ; AVX2: cost of 3 {{.*}} sitofp ; - ; AVX512F: sitofpv4i16v4double + ; AVX512F-LABEL: sitofpv4i16v4double ; AVX512F: cost of 3 {{.*}} sitofp %1 = sitofp <4 x i16> %a to <4 x double> ret <4 x double> %1 } define <8 x double> @sitofpv8i16v8double(<8 x i16> %a) { - ; SSE2: sitofpv8i16v8double + ; SSE2-LABEL: sitofpv8i16v8double ; SSE2: cost of 80 {{.*}} sitofp ; - ; AVX1: sitofpv8i16v8double + ; AVX1-LABEL: sitofpv8i16v8double ; AVX1: cost of 20 {{.*}} sitofp ; - ; AVX2: sitofpv8i16v8double + ; AVX2-LABEL: sitofpv8i16v8double ; AVX2: cost of 20 {{.*}} sitofp ; - ; AVX512F: sitofpv8i16v8double + ; AVX512F-LABEL: sitofpv8i16v8double ; AVX512F: cost of 2 {{.*}} sitofp %1 = sitofp <8 x i16> %a to <8 x double> ret <8 x double> %1 } define <16 x double> @sitofpv16i16v16double(<16 x i16> %a) { - ; SSE2: sitofpv16i16v16double + ; SSE2-LABEL: sitofpv16i16v16double ; SSE2: cost of 160 {{.*}} sitofp ; - ; AVX1: sitofpv16i16v16double + ; AVX1-LABEL: sitofpv16i16v16double ; AVX1: cost of 40 {{.*}} sitofp ; - ; AVX2: sitofpv16i16v16double + ; AVX2-LABEL: sitofpv16i16v16double ; AVX2: cost of 40 {{.*}} sitofp ; - ; AVX512F: sitofpv16i16v16double + ; AVX512F-LABEL: sitofpv16i16v16double ; AVX512F: cost of 44 {{.*}} sitofp %1 = sitofp <16 x i16> %a to <16 x double> ret <16 x double> %1 } define <32 x double> @sitofpv32i16v32double(<32 x i16> %a) { - ; SSE2: sitofpv32i16v32double + ; SSE2-LABEL: sitofpv32i16v32double ; SSE2: cost of 320 {{.*}} sitofp ; - ; AVX1: sitofpv32i16v32double + ; AVX1-LABEL: sitofpv32i16v32double ; AVX1: cost of 80 {{.*}} sitofp ; - ; AVX2: sitofpv32i16v32double + ; AVX2-LABEL: sitofpv32i16v32double ; AVX2: cost of 80 {{.*}} sitofp ; - ; AVX512F: sitofpv32i16v32double + ; AVX512F-LABEL: sitofpv32i16v32double ; AVX512F: cost of 88 {{.*}} sitofp %1 = sitofp <32 x i16> %a to <32 x double> ret <32 x double> %1 } define <2 x double> @sitofpv2i32v2double(<2 x i32> %a) { - ; SSE2: sitofpv2i32v2double + ; SSE2-LABEL: sitofpv2i32v2double ; SSE2: cost of 20 {{.*}} sitofp ; - ; AVX1: sitofpv2i32v2double + ; AVX1-LABEL: 
sitofpv2i32v2double ; AVX1: cost of 4 {{.*}} sitofp ; - ; AVX2: sitofpv2i32v2double + ; AVX2-LABEL: sitofpv2i32v2double ; AVX2: cost of 4 {{.*}} sitofp ; - ; AVX512F: sitofpv2i32v2double + ; AVX512F-LABEL: sitofpv2i32v2double ; AVX512F: cost of 4 {{.*}} sitofp %1 = sitofp <2 x i32> %a to <2 x double> ret <2 x double> %1 } define <4 x double> @sitofpv4i32v4double(<4 x i32> %a) { - ; SSE2: sitofpv4i32v4double + ; SSE2-LABEL: sitofpv4i32v4double ; SSE2: cost of 40 {{.*}} sitofp ; - ; AVX1: sitofpv4i32v4double + ; AVX1-LABEL: sitofpv4i32v4double ; AVX1: cost of 1 {{.*}} sitofp ; - ; AVX2: sitofpv4i32v4double + ; AVX2-LABEL: sitofpv4i32v4double ; AVX2: cost of 1 {{.*}} sitofp ; - ; AVX512F: sitofpv4i32v4double + ; AVX512F-LABEL: sitofpv4i32v4double ; AVX512F: cost of 1 {{.*}} sitofp %1 = sitofp <4 x i32> %a to <4 x double> ret <4 x double> %1 } define <8 x double> @sitofpv8i32v8double(<8 x i32> %a) { - ; SSE2: sitofpv8i32v8double + ; SSE2-LABEL: sitofpv8i32v8double ; SSE2: cost of 80 {{.*}} sitofp ; - ; AVX1: sitofpv8i32v8double + ; AVX1-LABEL: sitofpv8i32v8double ; AVX1: cost of 20 {{.*}} sitofp ; - ; AVX2: sitofpv8i32v8double + ; AVX2-LABEL: sitofpv8i32v8double ; AVX2: cost of 20 {{.*}} sitofp ; - ; AVX512F: sitofpv8i32v8double + ; AVX512F-LABEL: sitofpv8i32v8double ; AVX512F: cost of 1 {{.*}} sitofp %1 = sitofp <8 x i32> %a to <8 x double> ret <8 x double> %1 } define <16 x double> @sitofpv16i32v16double(<16 x i32> %a) { - ; SSE2: sitofpv16i32v16double + ; SSE2-LABEL: sitofpv16i32v16double ; SSE2: cost of 160 {{.*}} sitofp ; - ; AVX1: sitofpv16i32v16double + ; AVX1-LABEL: sitofpv16i32v16double ; AVX1: cost of 40 {{.*}} sitofp ; - ; AVX2: sitofpv16i32v16double + ; AVX2-LABEL: sitofpv16i32v16double ; AVX2: cost of 40 {{.*}} sitofp ; - ; AVX512F: sitofpv16i32v16double + ; AVX512F-LABEL: sitofpv16i32v16double ; AVX512F: cost of 44 {{.*}} sitofp %1 = sitofp <16 x i32> %a to <16 x double> ret <16 x double> %1 } define <32 x double> @sitofpv32i32v32double(<32 x i32> %a) { - ; SSE2: sitofpv32i32v32double + ; SSE2-LABEL: sitofpv32i32v32double ; SSE2: cost of 320 {{.*}} sitofp ; - ; AVX1: sitofpv32i32v32double + ; AVX1-LABEL: sitofpv32i32v32double ; AVX1: cost of 80 {{.*}} sitofp ; - ; AVX2: sitofpv32i32v32double + ; AVX2-LABEL: sitofpv32i32v32double ; AVX2: cost of 80 {{.*}} sitofp ; - ; AVX512F: sitofpv32i32v32double + ; AVX512F-LABEL: sitofpv32i32v32double ; AVX512F: cost of 88 {{.*}} sitofp %1 = sitofp <32 x i32> %a to <32 x double> ret <32 x double> %1 } define <2 x double> @sitofpv2i64v2double(<2 x i64> %a) { - ; SSE2: sitofpv2i64v2double + ; SSE2-LABEL: sitofpv2i64v2double ; SSE2: cost of 20 {{.*}} sitofp ; - ; AVX1: sitofpv2i64v2double + ; AVX1-LABEL: sitofpv2i64v2double ; AVX1: cost of 4 {{.*}} sitofp ; - ; AVX2: sitofpv2i64v2double + ; AVX2-LABEL: sitofpv2i64v2double ; AVX2: cost of 4 {{.*}} sitofp ; - ; AVX512F: sitofpv2i64v2double + ; AVX512F-LABEL: sitofpv2i64v2double ; AVX512F: cost of 4 {{.*}} sitofp %1 = sitofp <2 x i64> %a to <2 x double> ret <2 x double> %1 } define <4 x double> @sitofpv4i64v4double(<4 x i64> %a) { - ; SSE2: sitofpv4i64v4double + ; SSE2-LABEL: sitofpv4i64v4double ; SSE2: cost of 40 {{.*}} sitofp ; - ; AVX1: sitofpv4i64v4double + ; AVX1-LABEL: sitofpv4i64v4double ; AVX1: cost of 10 {{.*}} sitofp ; - ; AVX2: sitofpv4i64v4double + ; AVX2-LABEL: sitofpv4i64v4double ; AVX2: cost of 10 {{.*}} sitofp ; - ; AVX512F: sitofpv4i64v4double + ; AVX512F-LABEL: sitofpv4i64v4double ; AVX512F: cost of 10 {{.*}} sitofp %1 = sitofp <4 x i64> %a to <4 x double> ret <4 x double> %1 } 
define <8 x double> @sitofpv8i64v8double(<8 x i64> %a) { - ; SSE2: sitofpv8i64v8double + ; SSE2-LABEL: sitofpv8i64v8double ; SSE2: cost of 80 {{.*}} sitofp ; - ; AVX1: sitofpv8i64v8double + ; AVX1-LABEL: sitofpv8i64v8double ; AVX1: cost of 20 {{.*}} sitofp ; - ; AVX2: sitofpv8i64v8double + ; AVX2-LABEL: sitofpv8i64v8double ; AVX2: cost of 20 {{.*}} sitofp ; - ; AVX512F: sitofpv8i64v8double + ; AVX512F-LABEL: sitofpv8i64v8double ; AVX512F: cost of 22 {{.*}} sitofp %1 = sitofp <8 x i64> %a to <8 x double> ret <8 x double> %1 } define <16 x double> @sitofpv16i64v16double(<16 x i64> %a) { - ; SSE2: sitofpv16i64v16double + ; SSE2-LABEL: sitofpv16i64v16double ; SSE2: cost of 160 {{.*}} sitofp ; - ; AVX1: sitofpv16i64v16double + ; AVX1-LABEL: sitofpv16i64v16double ; AVX1: cost of 40 {{.*}} sitofp ; - ; AVX2: sitofpv16i64v16double + ; AVX2-LABEL: sitofpv16i64v16double ; AVX2: cost of 40 {{.*}} sitofp ; - ; AVX512F: sitofpv16i64v16double + ; AVX512F-LABEL: sitofpv16i64v16double ; AVX512F: cost of 44 {{.*}} sitofp %1 = sitofp <16 x i64> %a to <16 x double> ret <16 x double> %1 } define <32 x double> @sitofpv32i64v32double(<32 x i64> %a) { - ; SSE2: sitofpv32i64v32double + ; SSE2-LABEL: sitofpv32i64v32double ; SSE2: cost of 320 {{.*}} sitofp ; - ; AVX1: sitofpv32i64v32double + ; AVX1-LABEL: sitofpv32i64v32double ; AVX1: cost of 80 {{.*}} sitofp ; - ; AVX2: sitofpv32i64v32double + ; AVX2-LABEL: sitofpv32i64v32double ; AVX2: cost of 80 {{.*}} sitofp ; - ; AVX512F: sitofpv32i64v32double + ; AVX512F-LABEL: sitofpv32i64v32double ; AVX512F: cost of 88 {{.*}} sitofp %1 = sitofp <32 x i64> %a to <32 x double> ret <32 x double> %1 } define <2 x float> @sitofpv2i8v2float(<2 x i8> %a) { - ; SSE2: sitofpv2i8v2float + ; SSE2-LABEL: sitofpv2i8v2float ; SSE2: cost of 15 {{.*}} sitofp ; - ; AVX1: sitofpv2i8v2float + ; AVX1-LABEL: sitofpv2i8v2float ; AVX1: cost of 4 {{.*}} sitofp ; - ; AVX2: sitofpv2i8v2float + ; AVX2-LABEL: sitofpv2i8v2float ; AVX2: cost of 4 {{.*}} sitofp ; - ; AVX512F: sitofpv2i8v2float + ; AVX512F-LABEL: sitofpv2i8v2float ; AVX512F: cost of 4 {{.*}} sitofp %1 = sitofp <2 x i8> %a to <2 x float> ret <2 x float> %1 } define <4 x float> @sitofpv4i8v4float(<4 x i8> %a) { - ; SSE2: sitofpv4i8v4float + ; SSE2-LABEL: sitofpv4i8v4float ; SSE2: cost of 15 {{.*}} sitofp ; - ; AVX1: sitofpv4i8v4float + ; AVX1-LABEL: sitofpv4i8v4float ; AVX1: cost of 3 {{.*}} sitofp ; - ; AVX2: sitofpv4i8v4float + ; AVX2-LABEL: sitofpv4i8v4float ; AVX2: cost of 3 {{.*}} sitofp ; - ; AVX512F: sitofpv4i8v4float + ; AVX512F-LABEL: sitofpv4i8v4float ; AVX512F: cost of 3 {{.*}} sitofp %1 = sitofp <4 x i8> %a to <4 x float> ret <4 x float> %1 } define <8 x float> @sitofpv8i8v8float(<8 x i8> %a) { - ; SSE2: sitofpv8i8v8float + ; SSE2-LABEL: sitofpv8i8v8float ; SSE2: cost of 15 {{.*}} sitofp ; - ; AVX1: sitofpv8i8v8float + ; AVX1-LABEL: sitofpv8i8v8float ; AVX1: cost of 8 {{.*}} sitofp ; - ; AVX2: sitofpv8i8v8float + ; AVX2-LABEL: sitofpv8i8v8float ; AVX2: cost of 8 {{.*}} sitofp ; - ; AVX512F: sitofpv8i8v8float + ; AVX512F-LABEL: sitofpv8i8v8float ; AVX512F: cost of 8 {{.*}} sitofp %1 = sitofp <8 x i8> %a to <8 x float> ret <8 x float> %1 } define <16 x float> @sitofpv16i8v16float(<16 x i8> %a) { - ; SSE2: sitofpv16i8v16float + ; SSE2-LABEL: sitofpv16i8v16float ; SSE2: cost of 8 {{.*}} sitofp ; - ; AVX1: sitofpv16i8v16float + ; AVX1-LABEL: sitofpv16i8v16float ; AVX1: cost of 44 {{.*}} sitofp ; - ; AVX2: sitofpv16i8v16float + ; AVX2-LABEL: sitofpv16i8v16float ; AVX2: cost of 44 {{.*}} sitofp ; - ; AVX512F: sitofpv16i8v16float + ; 
AVX512F-LABEL: sitofpv16i8v16float ; AVX512F: cost of 2 {{.*}} sitofp %1 = sitofp <16 x i8> %a to <16 x float> ret <16 x float> %1 } define <32 x float> @sitofpv32i8v32float(<32 x i8> %a) { - ; SSE2: sitofpv32i8v32float + ; SSE2-LABEL: sitofpv32i8v32float ; SSE2: cost of 16 {{.*}} sitofp ; - ; AVX1: sitofpv32i8v32float + ; AVX1-LABEL: sitofpv32i8v32float ; AVX1: cost of 88 {{.*}} sitofp ; - ; AVX2: sitofpv32i8v32float + ; AVX2-LABEL: sitofpv32i8v32float ; AVX2: cost of 88 {{.*}} sitofp ; - ; AVX512F: sitofpv32i8v32float + ; AVX512F-LABEL: sitofpv32i8v32float ; AVX512F: cost of 92 {{.*}} sitofp %1 = sitofp <32 x i8> %a to <32 x float> ret <32 x float> %1 } define <2 x float> @sitofpv2i16v2float(<2 x i16> %a) { - ; SSE2: sitofpv2i16v2float + ; SSE2-LABEL: sitofpv2i16v2float ; SSE2: cost of 15 {{.*}} sitofp ; - ; AVX1: sitofpv2i16v2float + ; AVX1-LABEL: sitofpv2i16v2float ; AVX1: cost of 4 {{.*}} sitofp ; - ; AVX2: sitofpv2i16v2float + ; AVX2-LABEL: sitofpv2i16v2float ; AVX2: cost of 4 {{.*}} sitofp ; - ; AVX512F: sitofpv2i16v2float + ; AVX512F-LABEL: sitofpv2i16v2float ; AVX512F: cost of 4 {{.*}} sitofp %1 = sitofp <2 x i16> %a to <2 x float> ret <2 x float> %1 } define <4 x float> @sitofpv4i16v4float(<4 x i16> %a) { - ; SSE2: sitofpv4i16v4float + ; SSE2-LABEL: sitofpv4i16v4float ; SSE2: cost of 15 {{.*}} sitofp ; - ; AVX1: sitofpv4i16v4float + ; AVX1-LABEL: sitofpv4i16v4float ; AVX1: cost of 3 {{.*}} sitofp ; - ; AVX2: sitofpv4i16v4float + ; AVX2-LABEL: sitofpv4i16v4float ; AVX2: cost of 3 {{.*}} sitofp ; - ; AVX512F: sitofpv4i16v4float + ; AVX512F-LABEL: sitofpv4i16v4float ; AVX512F: cost of 3 {{.*}} sitofp %1 = sitofp <4 x i16> %a to <4 x float> ret <4 x float> %1 } define <8 x float> @sitofpv8i16v8float(<8 x i16> %a) { - ; SSE2: sitofpv8i16v8float + ; SSE2-LABEL: sitofpv8i16v8float ; SSE2: cost of 15 {{.*}} sitofp ; - ; AVX1: sitofpv8i16v8float + ; AVX1-LABEL: sitofpv8i16v8float ; AVX1: cost of 5 {{.*}} sitofp ; - ; AVX2: sitofpv8i16v8float + ; AVX2-LABEL: sitofpv8i16v8float ; AVX2: cost of 5 {{.*}} sitofp ; - ; AVX512F: sitofpv8i16v8float + ; AVX512F-LABEL: sitofpv8i16v8float ; AVX512F: cost of 5 {{.*}} sitofp %1 = sitofp <8 x i16> %a to <8 x float> ret <8 x float> %1 } define <16 x float> @sitofpv16i16v16float(<16 x i16> %a) { - ; SSE2: sitofpv16i16v16float + ; SSE2-LABEL: sitofpv16i16v16float ; SSE2: cost of 30 {{.*}} sitofp ; - ; AVX1: sitofpv16i16v16float + ; AVX1-LABEL: sitofpv16i16v16float ; AVX1: cost of 44 {{.*}} sitofp ; - ; AVX2: sitofpv16i16v16float + ; AVX2-LABEL: sitofpv16i16v16float ; AVX2: cost of 44 {{.*}} sitofp ; - ; AVX512F: sitofpv16i16v16float + ; AVX512F-LABEL: sitofpv16i16v16float ; AVX512F: cost of 2 {{.*}} sitofp %1 = sitofp <16 x i16> %a to <16 x float> ret <16 x float> %1 } define <32 x float> @sitofpv32i16v32float(<32 x i16> %a) { - ; SSE2: sitofpv32i16v32float + ; SSE2-LABEL: sitofpv32i16v32float ; SSE2: cost of 60 {{.*}} sitofp ; - ; AVX1: sitofpv32i16v32float + ; AVX1-LABEL: sitofpv32i16v32float ; AVX1: cost of 88 {{.*}} sitofp ; - ; AVX2: sitofpv32i16v32float + ; AVX2-LABEL: sitofpv32i16v32float ; AVX2: cost of 88 {{.*}} sitofp ; - ; AVX512F: sitofpv32i16v32float - ; AVX512F: cost of 2 {{.*}} sitofp + ; AVX512F-LABEL: sitofpv32i16v32float + ; AVX512F: cost of 92 {{.*}} sitofp %1 = sitofp <32 x i16> %a to <32 x float> ret <32 x float> %1 } define <2 x float> @sitofpv2i32v2float(<2 x i32> %a) { - ; SSE2: sitofpv2i32v2float + ; SSE2-LABEL: sitofpv2i32v2float ; SSE2: cost of 15 {{.*}} sitofp ; - ; AVX1: sitofpv2i32v2float + ; AVX1-LABEL: sitofpv2i32v2float ; 
AVX1: cost of 4 {{.*}} sitofp ; - ; AVX2: sitofpv2i32v2float + ; AVX2-LABEL: sitofpv2i32v2float ; AVX2: cost of 4 {{.*}} sitofp ; - ; AVX512F: sitofpv2i32v2float + ; AVX512F-LABEL: sitofpv2i32v2float ; AVX512F: cost of 4 {{.*}} sitofp %1 = sitofp <2 x i32> %a to <2 x float> ret <2 x float> %1 } define <4 x float> @sitofpv4i32v4float(<4 x i32> %a) { - ; SSE2: sitofpv4i32v4float + ; SSE2-LABEL: sitofpv4i32v4float ; SSE2: cost of 15 {{.*}} sitofp ; - ; AVX1: sitofpv4i32v4float + ; AVX1-LABEL: sitofpv4i32v4float ; AVX1: cost of 1 {{.*}} sitofp ; - ; AVX2: sitofpv4i32v4float + ; AVX2-LABEL: sitofpv4i32v4float ; AVX2: cost of 1 {{.*}} sitofp ; - ; AVX512F: sitofpv4i32v4float + ; AVX512F-LABEL: sitofpv4i32v4float ; AVX512F: cost of 1 {{.*}} sitofp %1 = sitofp <4 x i32> %a to <4 x float> ret <4 x float> %1 } define <8 x float> @sitofpv8i32v8float(<8 x i32> %a) { - ; SSE2: sitofpv8i32v8float + ; SSE2-LABEL: sitofpv8i32v8float ; SSE2: cost of 30 {{.*}} sitofp ; - ; AVX1: sitofpv8i32v8float + ; AVX1-LABEL: sitofpv8i32v8float ; AVX1: cost of 1 {{.*}} sitofp ; - ; AVX2: sitofpv8i32v8float + ; AVX2-LABEL: sitofpv8i32v8float ; AVX2: cost of 1 {{.*}} sitofp ; - ; AVX512F: sitofpv8i32v8float + ; AVX512F-LABEL: sitofpv8i32v8float ; AVX512F: cost of 1 {{.*}} sitofp %1 = sitofp <8 x i32> %a to <8 x float> ret <8 x float> %1 } define <16 x float> @sitofpv16i32v16float(<16 x i32> %a) { - ; SSE2: sitofpv16i32v16float + ; SSE2-LABEL: sitofpv16i32v16float ; SSE2: cost of 60 {{.*}} sitofp ; - ; AVX1: sitofpv16i32v16float + ; AVX1-LABEL: sitofpv16i32v16float ; AVX1: cost of 44 {{.*}} sitofp ; - ; AVX2: sitofpv16i32v16float + ; AVX2-LABEL: sitofpv16i32v16float ; AVX2: cost of 44 {{.*}} sitofp ; - ; AVX512F: sitofpv16i32v16float + ; AVX512F-LABEL: sitofpv16i32v16float ; AVX512F: cost of 1 {{.*}} sitofp %1 = sitofp <16 x i32> %a to <16 x float> ret <16 x float> %1 } define <32 x float> @sitofpv32i32v32float(<32 x i32> %a) { - ; SSE2: sitofpv32i32v32float + ; SSE2-LABEL: sitofpv32i32v32float ; SSE2: cost of 120 {{.*}} sitofp ; - ; AVX1: sitofpv32i32v32float + ; AVX1-LABEL: sitofpv32i32v32float ; AVX1: cost of 88 {{.*}} sitofp ; - ; AVX2: sitofpv32i32v32float + ; AVX2-LABEL: sitofpv32i32v32float ; AVX2: cost of 88 {{.*}} sitofp ; - ; AVX512F: sitofpv32i32v32float - ; AVX512F: cost of 1 {{.*}} sitofp + ; AVX512F-LABEL: sitofpv32i32v32float + ; AVX512F: cost of 92 {{.*}} sitofp %1 = sitofp <32 x i32> %a to <32 x float> ret <32 x float> %1 } define <2 x float> @sitofpv2i64v2float(<2 x i64> %a) { - ; SSE2: sitofpv2i64v2float + ; SSE2-LABEL: sitofpv2i64v2float ; SSE2: cost of 15 {{.*}} sitofp ; - ; AVX1: sitofpv2i64v2float + ; AVX1-LABEL: sitofpv2i64v2float ; AVX1: cost of 4 {{.*}} sitofp ; - ; AVX2: sitofpv2i64v2float + ; AVX2-LABEL: sitofpv2i64v2float ; AVX2: cost of 4 {{.*}} sitofp ; - ; AVX512F: sitofpv2i64v2float + ; AVX512F-LABEL: sitofpv2i64v2float ; AVX512F: cost of 4 {{.*}} sitofp %1 = sitofp <2 x i64> %a to <2 x float> ret <2 x float> %1 } define <4 x float> @sitofpv4i64v4float(<4 x i64> %a) { - ; SSE2: sitofpv4i64v4float + ; SSE2-LABEL: sitofpv4i64v4float ; SSE2: cost of 30 {{.*}} sitofp ; - ; AVX1: sitofpv4i64v4float + ; AVX1-LABEL: sitofpv4i64v4float ; AVX1: cost of 10 {{.*}} sitofp ; - ; AVX2: sitofpv4i64v4float + ; AVX2-LABEL: sitofpv4i64v4float ; AVX2: cost of 10 {{.*}} sitofp ; - ; AVX512F: sitofpv4i64v4float + ; AVX512F-LABEL: sitofpv4i64v4float ; AVX512F: cost of 10 {{.*}} sitofp %1 = sitofp <4 x i64> %a to <4 x float> ret <4 x float> %1 } define <8 x float> @sitofpv8i64v8float(<8 x i64> %a) { - ; SSE2: 
sitofpv8i64v8float + ; SSE2-LABEL: sitofpv8i64v8float ; SSE2: cost of 60 {{.*}} sitofp ; - ; AVX1: sitofpv8i64v8float + ; AVX1-LABEL: sitofpv8i64v8float ; AVX1: cost of 22 {{.*}} sitofp ; - ; AVX2: sitofpv8i64v8float + ; AVX2-LABEL: sitofpv8i64v8float ; AVX2: cost of 22 {{.*}} sitofp ; - ; AVX512F: sitofpv8i64v8float + ; AVX512F-LABEL: sitofpv8i64v8float ; AVX512F: cost of 22 {{.*}} sitofp %1 = sitofp <8 x i64> %a to <8 x float> ret <8 x float> %1 } define <16 x float> @sitofpv16i64v16float(<16 x i64> %a) { - ; SSE2: sitofpv16i64v16float + ; SSE2-LABEL: sitofpv16i64v16float ; SSE2: cost of 120 {{.*}} sitofp ; - ; AVX1: sitofpv16i64v16float + ; AVX1-LABEL: sitofpv16i64v16float ; AVX1: cost of 44 {{.*}} sitofp ; - ; AVX2: sitofpv16i64v16float + ; AVX2-LABEL: sitofpv16i64v16float ; AVX2: cost of 44 {{.*}} sitofp ; - ; AVX512F: sitofpv16i64v16float + ; AVX512F-LABEL: sitofpv16i64v16float ; AVX512F: cost of 46 {{.*}} sitofp %1 = sitofp <16 x i64> %a to <16 x float> ret <16 x float> %1 } define <32 x float> @sitofpv32i64v32float(<32 x i64> %a) { - ; SSE2: sitofpv32i64v32float + ; SSE2-LABEL: sitofpv32i64v32float ; SSE2: cost of 240 {{.*}} sitofp ; - ; AVX1: sitofpv32i64v32float + ; AVX1-LABEL: sitofpv32i64v32float ; AVX1: cost of 88 {{.*}} sitofp ; - ; AVX2: sitofpv32i64v32float + ; AVX2-LABEL: sitofpv32i64v32float ; AVX2: cost of 88 {{.*}} sitofp ; - ; AVX512F: sitofpv32i64v32float + ; AVX512F-LABEL: sitofpv32i64v32float ; AVX512F: cost of 92 {{.*}} sitofp %1 = sitofp <32 x i64> %a to <32 x float> ret <32 x float> %1 } define <8 x double> @sitofpv8i1v8double(<8 x double> %a) { - ; SSE2: sitofpv8i1v8double + ; SSE2-LABEL: sitofpv8i1v8double ; SSE2: cost of 80 {{.*}} sitofp ; - ; AVX1: sitofpv8i1v8double + ; AVX1-LABEL: sitofpv8i1v8double ; AVX1: cost of 20 {{.*}} sitofp ; - ; AVX2: sitofpv8i1v8double + ; AVX2-LABEL: sitofpv8i1v8double ; AVX2: cost of 20 {{.*}} sitofp ; - ; AVX512F: sitofpv8i1v8double + ; AVX512F-LABEL: sitofpv8i1v8double ; AVX512F: cost of 4 {{.*}} sitofp %cmpres = fcmp ogt <8 x double> %a, zeroinitializer %1 = sitofp <8 x i1> %cmpres to <8 x double> @@ -661,16 +661,16 @@ define <8 x double> @sitofpv8i1v8double(<8 x double> %a) { } define <16 x float> @sitofpv16i1v16float(<16 x float> %a) { - ; SSE2: sitofpv16i1v16float + ; SSE2-LABEL: sitofpv16i1v16float ; SSE2: cost of 8 {{.*}} sitofp ; - ; AVX1: sitofpv16i1v16float + ; AVX1-LABEL: sitofpv16i1v16float ; AVX1: cost of 44 {{.*}} sitofp ; - ; AVX2: sitofpv16i1v16float + ; AVX2-LABEL: sitofpv16i1v16float ; AVX2: cost of 44 {{.*}} sitofp ; - ; AVX512F: sitofpv16i1v16float + ; AVX512F-LABEL: sitofpv16i1v16float ; AVX512F: cost of 3 {{.*}} sitofp %cmpres = fcmp ogt <16 x float> %a, zeroinitializer %1 = sitofp <16 x i1> %cmpres to <16 x float> diff --git a/test/Analysis/CostModel/X86/uitofp.ll b/test/Analysis/CostModel/X86/uitofp.ll index 9ffc483e3f5a..08e36650bec4 100644 --- a/test/Analysis/CostModel/X86/uitofp.ll +++ b/test/Analysis/CostModel/X86/uitofp.ll @@ -2,644 +2,708 @@ ; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+avx -cost-model -analyze < %s | FileCheck --check-prefix=AVX --check-prefix=AVX1 %s ; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+avx2 -cost-model -analyze < %s | FileCheck --check-prefix=AVX --check-prefix=AVX2 %s ; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+avx512f -cost-model -analyze < %s | FileCheck --check-prefix=AVX512F %s +; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+avx512dq -cost-model -analyze < %s | FileCheck --check-prefix=AVX512DQ %s define <2 x double> @uitofpv2i8v2double(<2 x i8> %a) { - ; 
SSE2: uitofpv2i8v2double + ; SSE2-LABEL: uitofpv2i8v2double ; SSE2: cost of 20 {{.*}} uitofp ; - ; AVX1: uitofpv2i8v2double + ; AVX1-LABEL: uitofpv2i8v2double ; AVX1: cost of 4 {{.*}} uitofp ; - ; AVX2: uitofpv2i8v2double + ; AVX2-LABEL: uitofpv2i8v2double ; AVX2: cost of 4 {{.*}} uitofp ; - ; AVX512F: uitofpv2i8v2double - ; AVX512F: cost of 4 {{.*}} uitofp + ; AVX512F-LABEL: uitofpv2i8v2double + ; AVX512F: cost of 2 {{.*}} uitofp %1 = uitofp <2 x i8> %a to <2 x double> ret <2 x double> %1 } define <4 x double> @uitofpv4i8v4double(<4 x i8> %a) { - ; SSE2: uitofpv4i8v4double + ; SSE2-LABEL: uitofpv4i8v4double ; SSE2: cost of 40 {{.*}} uitofp ; - ; AVX1: uitofpv4i8v4double + ; AVX1-LABEL: uitofpv4i8v4double ; AVX1: cost of 2 {{.*}} uitofp ; - ; AVX2: uitofpv4i8v4double + ; AVX2-LABEL: uitofpv4i8v4double ; AVX2: cost of 2 {{.*}} uitofp ; - ; AVX512F: uitofpv4i8v4double + ; AVX512F-LABEL: uitofpv4i8v4double ; AVX512F: cost of 2 {{.*}} uitofp %1 = uitofp <4 x i8> %a to <4 x double> ret <4 x double> %1 } define <8 x double> @uitofpv8i8v8double(<8 x i8> %a) { - ; SSE2: uitofpv8i8v8double + ; SSE2-LABEL: uitofpv8i8v8double ; SSE2: cost of 80 {{.*}} uitofp ; - ; AVX1: uitofpv8i8v8double + ; AVX1-LABEL: uitofpv8i8v8double ; AVX1: cost of 20 {{.*}} uitofp ; - ; AVX2: uitofpv8i8v8double + ; AVX2-LABEL: uitofpv8i8v8double ; AVX2: cost of 20 {{.*}} uitofp ; - ; AVX512F: uitofpv8i8v8double - ; AVX512F: cost of 22 {{.*}} uitofp + ; AVX512F-LABEL: uitofpv8i8v8double + ; AVX512F: cost of 2 {{.*}} uitofp %1 = uitofp <8 x i8> %a to <8 x double> ret <8 x double> %1 } define <16 x double> @uitofpv16i8v16double(<16 x i8> %a) { - ; SSE2: uitofpv16i8v16double + ; SSE2-LABEL: uitofpv16i8v16double ; SSE2: cost of 160 {{.*}} uitofp ; - ; AVX1: uitofpv16i8v16double + ; AVX1-LABEL: uitofpv16i8v16double ; AVX1: cost of 40 {{.*}} uitofp ; - ; AVX2: uitofpv16i8v16double + ; AVX2-LABEL: uitofpv16i8v16double ; AVX2: cost of 40 {{.*}} uitofp ; - ; AVX512F: uitofpv16i8v16double + ; AVX512F-LABEL: uitofpv16i8v16double ; AVX512F: cost of 44 {{.*}} uitofp %1 = uitofp <16 x i8> %a to <16 x double> ret <16 x double> %1 } define <32 x double> @uitofpv32i8v32double(<32 x i8> %a) { - ; SSE2: uitofpv32i8v32double + ; SSE2-LABEL: uitofpv32i8v32double ; SSE2: cost of 320 {{.*}} uitofp ; - ; AVX1: uitofpv32i8v32double + ; AVX1-LABEL: uitofpv32i8v32double ; AVX1: cost of 80 {{.*}} uitofp ; - ; AVX2: uitofpv32i8v32double + ; AVX2-LABEL: uitofpv32i8v32double ; AVX2: cost of 80 {{.*}} uitofp ; - ; AVX512F: uitofpv32i8v32double + ; AVX512F-LABEL: uitofpv32i8v32double ; AVX512F: cost of 88 {{.*}} uitofp %1 = uitofp <32 x i8> %a to <32 x double> ret <32 x double> %1 } define <2 x double> @uitofpv2i16v2double(<2 x i16> %a) { - ; SSE2: uitofpv2i16v2double + ; SSE2-LABEL: uitofpv2i16v2double ; SSE2: cost of 20 {{.*}} uitofp ; - ; AVX1: uitofpv2i16v2double + ; AVX1-LABEL: uitofpv2i16v2double ; AVX1: cost of 4 {{.*}} uitofp ; - ; AVX2: uitofpv2i16v2double + ; AVX2-LABEL: uitofpv2i16v2double ; AVX2: cost of 4 {{.*}} uitofp ; - ; AVX512F: uitofpv2i16v2double - ; AVX512F: cost of 4 {{.*}} uitofp + ; AVX512F-LABEL: uitofpv2i16v2double + ; AVX512F: cost of 5 {{.*}} uitofp %1 = uitofp <2 x i16> %a to <2 x double> ret <2 x double> %1 } define <4 x double> @uitofpv4i16v4double(<4 x i16> %a) { - ; SSE2: uitofpv4i16v4double + ; SSE2-LABEL: uitofpv4i16v4double ; SSE2: cost of 40 {{.*}} uitofp ; - ; AVX1: uitofpv4i16v4double + ; AVX1-LABEL: uitofpv4i16v4double ; AVX1: cost of 2 {{.*}} uitofp ; - ; AVX2: uitofpv4i16v4double + ; AVX2-LABEL: uitofpv4i16v4double ; 
AVX2: cost of 2 {{.*}} uitofp ; - ; AVX512F: uitofpv4i16v4double + ; AVX512F-LABEL: uitofpv4i16v4double ; AVX512F: cost of 2 {{.*}} uitofp %1 = uitofp <4 x i16> %a to <4 x double> ret <4 x double> %1 } define <8 x double> @uitofpv8i16v8double(<8 x i16> %a) { - ; SSE2: uitofpv8i16v8double + ; SSE2-LABEL: uitofpv8i16v8double ; SSE2: cost of 80 {{.*}} uitofp ; - ; AVX1: uitofpv8i16v8double + ; AVX1-LABEL: uitofpv8i16v8double ; AVX1: cost of 20 {{.*}} uitofp ; - ; AVX2: uitofpv8i16v8double + ; AVX2-LABEL: uitofpv8i16v8double ; AVX2: cost of 20 {{.*}} uitofp ; - ; AVX512F: uitofpv8i16v8double - ; AVX512F: cost of 22 {{.*}} uitofp + ; AVX512F-LABEL: uitofpv8i16v8double + ; AVX512F: cost of 2 {{.*}} uitofp %1 = uitofp <8 x i16> %a to <8 x double> ret <8 x double> %1 } define <16 x double> @uitofpv16i16v16double(<16 x i16> %a) { - ; SSE2: uitofpv16i16v16double + ; SSE2-LABEL: uitofpv16i16v16double ; SSE2: cost of 160 {{.*}} uitofp ; - ; AVX1: uitofpv16i16v16double + ; AVX1-LABEL: uitofpv16i16v16double ; AVX1: cost of 40 {{.*}} uitofp ; - ; AVX2: uitofpv16i16v16double + ; AVX2-LABEL: uitofpv16i16v16double ; AVX2: cost of 40 {{.*}} uitofp ; - ; AVX512F: uitofpv16i16v16double + ; AVX512F-LABEL: uitofpv16i16v16double ; AVX512F: cost of 44 {{.*}} uitofp %1 = uitofp <16 x i16> %a to <16 x double> ret <16 x double> %1 } define <32 x double> @uitofpv32i16v32double(<32 x i16> %a) { - ; SSE2: uitofpv32i16v32double + ; SSE2-LABEL: uitofpv32i16v32double ; SSE2: cost of 320 {{.*}} uitofp ; - ; AVX1: uitofpv32i16v32double + ; AVX1-LABEL: uitofpv32i16v32double ; AVX1: cost of 80 {{.*}} uitofp ; - ; AVX2: uitofpv32i16v32double + ; AVX2-LABEL: uitofpv32i16v32double ; AVX2: cost of 80 {{.*}} uitofp ; - ; AVX512F: uitofpv32i16v32double + ; AVX512F-LABEL: uitofpv32i16v32double ; AVX512F: cost of 88 {{.*}} uitofp %1 = uitofp <32 x i16> %a to <32 x double> ret <32 x double> %1 } define <2 x double> @uitofpv2i32v2double(<2 x i32> %a) { - ; SSE2: uitofpv2i32v2double + ; SSE2-LABEL: uitofpv2i32v2double ; SSE2: cost of 20 {{.*}} uitofp ; - ; AVX1: uitofpv2i32v2double + ; AVX1-LABEL: uitofpv2i32v2double ; AVX1: cost of 4 {{.*}} uitofp ; - ; AVX2: uitofpv2i32v2double + ; AVX2-LABEL: uitofpv2i32v2double ; AVX2: cost of 4 {{.*}} uitofp ; - ; AVX512F: uitofpv2i32v2double + ; AVX512F-LABEL: uitofpv2i32v2double ; AVX512F: cost of 4 {{.*}} uitofp %1 = uitofp <2 x i32> %a to <2 x double> ret <2 x double> %1 } define <4 x double> @uitofpv4i32v4double(<4 x i32> %a) { - ; SSE2: uitofpv4i32v4double + ; SSE2-LABEL: uitofpv4i32v4double ; SSE2: cost of 40 {{.*}} uitofp ; - ; AVX1: uitofpv4i32v4double + ; AVX1-LABEL: uitofpv4i32v4double ; AVX1: cost of 6 {{.*}} uitofp ; - ; AVX2: uitofpv4i32v4double + ; AVX2-LABEL: uitofpv4i32v4double ; AVX2: cost of 6 {{.*}} uitofp ; - ; AVX512F: uitofpv4i32v4double - ; AVX512F: cost of 6 {{.*}} uitofp + ; AVX512F-LABEL: uitofpv4i32v4double + ; AVX512F: cost of 1 {{.*}} uitofp %1 = uitofp <4 x i32> %a to <4 x double> ret <4 x double> %1 } define <8 x double> @uitofpv8i32v8double(<8 x i32> %a) { - ; SSE2: uitofpv8i32v8double + ; SSE2-LABEL: uitofpv8i32v8double ; SSE2: cost of 80 {{.*}} uitofp ; - ; AVX1: uitofpv8i32v8double + ; AVX1-LABEL: uitofpv8i32v8double ; AVX1: cost of 20 {{.*}} uitofp ; - ; AVX2: uitofpv8i32v8double + ; AVX2-LABEL: uitofpv8i32v8double ; AVX2: cost of 20 {{.*}} uitofp ; - ; AVX512F: uitofpv8i32v8double - ; AVX512F: cost of 22 {{.*}} uitofp + ; AVX512F-LABEL: uitofpv8i32v8double + ; AVX512F: cost of 1 {{.*}} uitofp %1 = uitofp <8 x i32> %a to <8 x double> ret <8 x double> %1 } define 
<16 x double> @uitofpv16i32v16double(<16 x i32> %a) { - ; SSE2: uitofpv16i32v16double + ; SSE2-LABEL: uitofpv16i32v16double ; SSE2: cost of 160 {{.*}} uitofp ; - ; AVX1: uitofpv16i32v16double + ; AVX1-LABEL: uitofpv16i32v16double ; AVX1: cost of 40 {{.*}} uitofp ; - ; AVX2: uitofpv16i32v16double + ; AVX2-LABEL: uitofpv16i32v16double ; AVX2: cost of 40 {{.*}} uitofp ; - ; AVX512F: uitofpv16i32v16double + ; AVX512F-LABEL: uitofpv16i32v16double ; AVX512F: cost of 44 {{.*}} uitofp %1 = uitofp <16 x i32> %a to <16 x double> ret <16 x double> %1 } define <32 x double> @uitofpv32i32v32double(<32 x i32> %a) { - ; SSE2: uitofpv32i32v32double + ; SSE2-LABEL: uitofpv32i32v32double ; SSE2: cost of 320 {{.*}} uitofp ; - ; AVX1: uitofpv32i32v32double + ; AVX1-LABEL: uitofpv32i32v32double ; AVX1: cost of 80 {{.*}} uitofp ; - ; AVX2: uitofpv32i32v32double + ; AVX2-LABEL: uitofpv32i32v32double ; AVX2: cost of 80 {{.*}} uitofp ; - ; AVX512F: uitofpv32i32v32double + ; AVX512F-LABEL: uitofpv32i32v32double ; AVX512F: cost of 88 {{.*}} uitofp %1 = uitofp <32 x i32> %a to <32 x double> ret <32 x double> %1 } define <2 x double> @uitofpv2i64v2double(<2 x i64> %a) { - ; SSE2: uitofpv2i64v2double + ; SSE2-LABEL: uitofpv2i64v2double ; SSE2: cost of 20 {{.*}} uitofp ; - ; AVX1: uitofpv2i64v2double + ; AVX1-LABEL: uitofpv2i64v2double ; AVX1: cost of 20 {{.*}} uitofp ; - ; AVX2: uitofpv2i64v2double + ; AVX2-LABEL: uitofpv2i64v2double ; AVX2: cost of 20 {{.*}} uitofp ; - ; AVX512F: uitofpv2i64v2double - ; AVX512F: cost of 20 {{.*}} uitofp + ; AVX512F-LABEL: uitofpv2i64v2double + ; AVX512F: cost of 5 {{.*}} uitofp + ; + ; AVX512DQ: uitofpv2i64v2double + ; AVX512DQ: cost of 1 {{.*}} uitofp %1 = uitofp <2 x i64> %a to <2 x double> ret <2 x double> %1 } define <4 x double> @uitofpv4i64v4double(<4 x i64> %a) { - ; SSE2: uitofpv4i64v4double + ; SSE2-LABEL: uitofpv4i64v4double ; SSE2: cost of 40 {{.*}} uitofp ; - ; AVX1: uitofpv4i64v4double + ; AVX1-LABEL: uitofpv4i64v4double ; AVX1: cost of 40 {{.*}} uitofp ; - ; AVX2: uitofpv4i64v4double + ; AVX2-LABEL: uitofpv4i64v4double ; AVX2: cost of 40 {{.*}} uitofp ; - ; AVX512F: uitofpv4i64v4double - ; AVX512F: cost of 40 {{.*}} uitofp + ; AVX512F-LABEL: uitofpv4i64v4double + ; AVX512F: cost of 12 {{.*}} uitofp + ; + ; AVX512DQ: uitofpv4i64v4double + ; AVX512DQ: cost of 1 {{.*}} uitofp %1 = uitofp <4 x i64> %a to <4 x double> ret <4 x double> %1 } define <8 x double> @uitofpv8i64v8double(<8 x i64> %a) { - ; SSE2: uitofpv8i64v8double + ; SSE2-LABEL: uitofpv8i64v8double ; SSE2: cost of 80 {{.*}} uitofp ; - ; AVX1: uitofpv8i64v8double + ; AVX1-LABEL: uitofpv8i64v8double ; AVX1: cost of 20 {{.*}} uitofp ; - ; AVX2: uitofpv8i64v8double + ; AVX2-LABEL: uitofpv8i64v8double ; AVX2: cost of 20 {{.*}} uitofp ; - ; AVX512F: uitofpv8i64v8double - ; AVX512F: cost of 22 {{.*}} uitofp + ; AVX512F-LABEL: uitofpv8i64v8double + ; AVX512F: cost of 26 {{.*}} uitofp + ; + ; AVX512DQ: uitofpv8i64v8double + ; AVX512DQ: cost of 1 {{.*}} uitofp %1 = uitofp <8 x i64> %a to <8 x double> ret <8 x double> %1 } define <16 x double> @uitofpv16i64v16double(<16 x i64> %a) { - ; SSE2: uitofpv16i64v16double + ; SSE2-LABEL: uitofpv16i64v16double ; SSE2: cost of 160 {{.*}} uitofp ; - ; AVX1: uitofpv16i64v16double + ; AVX1-LABEL: uitofpv16i64v16double ; AVX1: cost of 40 {{.*}} uitofp ; - ; AVX2: uitofpv16i64v16double + ; AVX2-LABEL: uitofpv16i64v16double ; AVX2: cost of 40 {{.*}} uitofp ; - ; AVX512F: uitofpv16i64v16double + ; AVX512F-LABEL: uitofpv16i64v16double ; AVX512F: cost of 44 {{.*}} uitofp + ; + ; AVX512DQ: 
uitofpv16i64v16double + ; AVX512DQ: cost of 44 {{.*}} uitofp %1 = uitofp <16 x i64> %a to <16 x double> ret <16 x double> %1 } define <32 x double> @uitofpv32i64v32double(<32 x i64> %a) { - ; SSE2: uitofpv32i64v32double + ; SSE2-LABEL: uitofpv32i64v32double ; SSE2: cost of 320 {{.*}} uitofp ; - ; AVX1: uitofpv32i64v32double + ; AVX1-LABEL: uitofpv32i64v32double ; AVX1: cost of 80 {{.*}} uitofp ; - ; AVX2: uitofpv32i64v32double + ; AVX2-LABEL: uitofpv32i64v32double ; AVX2: cost of 80 {{.*}} uitofp ; - ; AVX512F: uitofpv32i64v32double + ; AVX512F-LABEL: uitofpv32i64v32double ; AVX512F: cost of 88 {{.*}} uitofp + ; + ; AVX512DQ: uitofpv32i64v32double + ; AVX512DQ: cost of 88 {{.*}} uitofp %1 = uitofp <32 x i64> %a to <32 x double> ret <32 x double> %1 } define <2 x float> @uitofpv2i8v2float(<2 x i8> %a) { - ; SSE2: uitofpv2i8v2float + ; SSE2-LABEL: uitofpv2i8v2float ; SSE2: cost of 15 {{.*}} uitofp ; - ; AVX1: uitofpv2i8v2float + ; AVX1-LABEL: uitofpv2i8v2float ; AVX1: cost of 4 {{.*}} uitofp ; - ; AVX2: uitofpv2i8v2float + ; AVX2-LABEL: uitofpv2i8v2float ; AVX2: cost of 4 {{.*}} uitofp ; - ; AVX512F: uitofpv2i8v2float + ; AVX512F-LABEL: uitofpv2i8v2float ; AVX512F: cost of 4 {{.*}} uitofp %1 = uitofp <2 x i8> %a to <2 x float> ret <2 x float> %1 } define <4 x float> @uitofpv4i8v4float(<4 x i8> %a) { - ; SSE2: uitofpv4i8v4float + ; SSE2-LABEL: uitofpv4i8v4float ; SSE2: cost of 8 {{.*}} uitofp ; - ; AVX1: uitofpv4i8v4float + ; AVX1-LABEL: uitofpv4i8v4float ; AVX1: cost of 2 {{.*}} uitofp ; - ; AVX2: uitofpv4i8v4float + ; AVX2-LABEL: uitofpv4i8v4float ; AVX2: cost of 2 {{.*}} uitofp ; - ; AVX512F: uitofpv4i8v4float + ; AVX512F-LABEL: uitofpv4i8v4float ; AVX512F: cost of 2 {{.*}} uitofp %1 = uitofp <4 x i8> %a to <4 x float> ret <4 x float> %1 } define <8 x float> @uitofpv8i8v8float(<8 x i8> %a) { - ; SSE2: uitofpv8i8v8float + ; SSE2-LABEL: uitofpv8i8v8float ; SSE2: cost of 15 {{.*}} uitofp ; - ; AVX1: uitofpv8i8v8float + ; AVX1-LABEL: uitofpv8i8v8float ; AVX1: cost of 5 {{.*}} uitofp ; - ; AVX2: uitofpv8i8v8float + ; AVX2-LABEL: uitofpv8i8v8float ; AVX2: cost of 5 {{.*}} uitofp ; - ; AVX512F: uitofpv8i8v8float - ; AVX512F: cost of 5 {{.*}} uitofp + ; AVX512F-LABEL: uitofpv8i8v8float + ; AVX512F: cost of 2 {{.*}} uitofp %1 = uitofp <8 x i8> %a to <8 x float> ret <8 x float> %1 } define <16 x float> @uitofpv16i8v16float(<16 x i8> %a) { - ; SSE2: uitofpv16i8v16float + ; SSE2-LABEL: uitofpv16i8v16float ; SSE2: cost of 8 {{.*}} uitofp ; - ; AVX1: uitofpv16i8v16float + ; AVX1-LABEL: uitofpv16i8v16float ; AVX1: cost of 44 {{.*}} uitofp ; - ; AVX2: uitofpv16i8v16float + ; AVX2-LABEL: uitofpv16i8v16float ; AVX2: cost of 44 {{.*}} uitofp ; - ; AVX512F: uitofpv16i8v16float - ; AVX512F: cost of 46 {{.*}} uitofp + ; AVX512F-LABEL: uitofpv16i8v16float + ; AVX512F: cost of 2 {{.*}} uitofp %1 = uitofp <16 x i8> %a to <16 x float> ret <16 x float> %1 } define <32 x float> @uitofpv32i8v32float(<32 x i8> %a) { - ; SSE2: uitofpv32i8v32float + ; SSE2-LABEL: uitofpv32i8v32float ; SSE2: cost of 16 {{.*}} uitofp ; - ; AVX1: uitofpv32i8v32float + ; AVX1-LABEL: uitofpv32i8v32float ; AVX1: cost of 88 {{.*}} uitofp ; - ; AVX2: uitofpv32i8v32float + ; AVX2-LABEL: uitofpv32i8v32float ; AVX2: cost of 88 {{.*}} uitofp ; - ; AVX512F: uitofpv32i8v32float + ; AVX512F-LABEL: uitofpv32i8v32float ; AVX512F: cost of 92 {{.*}} uitofp %1 = uitofp <32 x i8> %a to <32 x float> ret <32 x float> %1 } define <2 x float> @uitofpv2i16v2float(<2 x i16> %a) { - ; SSE2: uitofpv2i16v2float + ; SSE2-LABEL: uitofpv2i16v2float ; SSE2: cost of 15 
{{.*}} uitofp ; - ; AVX1: uitofpv2i16v2float + ; AVX1-LABEL: uitofpv2i16v2float ; AVX1: cost of 4 {{.*}} uitofp ; - ; AVX2: uitofpv2i16v2float + ; AVX2-LABEL: uitofpv2i16v2float ; AVX2: cost of 4 {{.*}} uitofp ; - ; AVX512F: uitofpv2i16v2float + ; AVX512F-LABEL: uitofpv2i16v2float ; AVX512F: cost of 4 {{.*}} uitofp %1 = uitofp <2 x i16> %a to <2 x float> ret <2 x float> %1 } define <4 x float> @uitofpv4i16v4float(<4 x i16> %a) { - ; SSE2: uitofpv4i16v4float + ; SSE2-LABEL: uitofpv4i16v4float ; SSE2: cost of 8 {{.*}} uitofp ; - ; AVX1: uitofpv4i16v4float + ; AVX1-LABEL: uitofpv4i16v4float ; AVX1: cost of 2 {{.*}} uitofp ; - ; AVX2: uitofpv4i16v4float + ; AVX2-LABEL: uitofpv4i16v4float ; AVX2: cost of 2 {{.*}} uitofp ; - ; AVX512F: uitofpv4i16v4float + ; AVX512F-LABEL: uitofpv4i16v4float ; AVX512F: cost of 2 {{.*}} uitofp %1 = uitofp <4 x i16> %a to <4 x float> ret <4 x float> %1 } define <8 x float> @uitofpv8i16v8float(<8 x i16> %a) { - ; SSE2: uitofpv8i16v8float + ; SSE2-LABEL: uitofpv8i16v8float ; SSE2: cost of 15 {{.*}} uitofp ; - ; AVX1: uitofpv8i16v8float + ; AVX1-LABEL: uitofpv8i16v8float ; AVX1: cost of 5 {{.*}} uitofp ; - ; AVX2: uitofpv8i16v8float + ; AVX2-LABEL: uitofpv8i16v8float ; AVX2: cost of 5 {{.*}} uitofp ; - ; AVX512F: uitofpv8i16v8float - ; AVX512F: cost of 5 {{.*}} uitofp + ; AVX512F-LABEL: uitofpv8i16v8float + ; AVX512F: cost of 2 {{.*}} uitofp %1 = uitofp <8 x i16> %a to <8 x float> ret <8 x float> %1 } define <16 x float> @uitofpv16i16v16float(<16 x i16> %a) { - ; SSE2: uitofpv16i16v16float + ; SSE2-LABEL: uitofpv16i16v16float ; SSE2: cost of 30 {{.*}} uitofp ; - ; AVX1: uitofpv16i16v16float + ; AVX1-LABEL: uitofpv16i16v16float ; AVX1: cost of 44 {{.*}} uitofp ; - ; AVX2: uitofpv16i16v16float + ; AVX2-LABEL: uitofpv16i16v16float ; AVX2: cost of 44 {{.*}} uitofp ; - ; AVX512F: uitofpv16i16v16float - ; AVX512F: cost of 46 {{.*}} uitofp + ; AVX512F-LABEL: uitofpv16i16v16float + ; AVX512F: cost of 2 {{.*}} uitofp %1 = uitofp <16 x i16> %a to <16 x float> ret <16 x float> %1 } define <32 x float> @uitofpv32i16v32float(<32 x i16> %a) { - ; SSE2: uitofpv32i16v32float + ; SSE2-LABEL: uitofpv32i16v32float ; SSE2: cost of 60 {{.*}} uitofp ; - ; AVX1: uitofpv32i16v32float + ; AVX1-LABEL: uitofpv32i16v32float ; AVX1: cost of 88 {{.*}} uitofp ; - ; AVX2: uitofpv32i16v32float + ; AVX2-LABEL: uitofpv32i16v32float ; AVX2: cost of 88 {{.*}} uitofp ; - ; AVX512F: uitofpv32i16v32float + ; AVX512F-LABEL: uitofpv32i16v32float ; AVX512F: cost of 92 {{.*}} uitofp %1 = uitofp <32 x i16> %a to <32 x float> ret <32 x float> %1 } define <2 x float> @uitofpv2i32v2float(<2 x i32> %a) { - ; SSE2: uitofpv2i32v2float + ; SSE2-LABEL: uitofpv2i32v2float ; SSE2: cost of 15 {{.*}} uitofp ; - ; AVX1: uitofpv2i32v2float + ; AVX1-LABEL: uitofpv2i32v2float ; AVX1: cost of 4 {{.*}} uitofp ; - ; AVX2: uitofpv2i32v2float + ; AVX2-LABEL: uitofpv2i32v2float ; AVX2: cost of 4 {{.*}} uitofp ; - ; AVX512F: uitofpv2i32v2float - ; AVX512F: cost of 4 {{.*}} uitofp + ; AVX512F-LABEL: uitofpv2i32v2float + ; AVX512F: cost of 2 {{.*}} uitofp %1 = uitofp <2 x i32> %a to <2 x float> ret <2 x float> %1 } define <4 x float> @uitofpv4i32v4float(<4 x i32> %a) { - ; SSE2: uitofpv4i32v4float + ; SSE2-LABEL: uitofpv4i32v4float ; SSE2: cost of 8 {{.*}} uitofp ; - ; AVX1: uitofpv4i32v4float + ; AVX1-LABEL: uitofpv4i32v4float ; AVX1: cost of 6 {{.*}} uitofp ; - ; AVX2: uitofpv4i32v4float + ; AVX2-LABEL: uitofpv4i32v4float ; AVX2: cost of 6 {{.*}} uitofp ; - ; AVX512F: uitofpv4i32v4float - ; AVX512F: cost of 6 {{.*}} uitofp + ; 
AVX512F-LABEL: uitofpv4i32v4float + ; AVX512F: cost of 1 {{.*}} uitofp %1 = uitofp <4 x i32> %a to <4 x float> ret <4 x float> %1 } define <8 x float> @uitofpv8i32v8float(<8 x i32> %a) { - ; SSE2: uitofpv8i32v8float + ; SSE2-LABEL: uitofpv8i32v8float ; SSE2: cost of 16 {{.*}} uitofp ; - ; AVX1: uitofpv8i32v8float + ; AVX1-LABEL: uitofpv8i32v8float ; AVX1: cost of 9 {{.*}} uitofp ; - ; AVX2: uitofpv8i32v8float + ; AVX2-LABEL: uitofpv8i32v8float ; AVX2: cost of 8 {{.*}} uitofp ; - ; AVX512F: uitofpv8i32v8float - ; AVX512F: cost of 8 {{.*}} uitofp + ; AVX512F-LABEL: uitofpv8i32v8float + ; AVX512F: cost of 1 {{.*}} uitofp %1 = uitofp <8 x i32> %a to <8 x float> ret <8 x float> %1 } define <16 x float> @uitofpv16i32v16float(<16 x i32> %a) { - ; SSE2: uitofpv16i32v16float + ; SSE2-LABEL: uitofpv16i32v16float ; SSE2: cost of 32 {{.*}} uitofp ; - ; AVX1: uitofpv16i32v16float + ; AVX1-LABEL: uitofpv16i32v16float ; AVX1: cost of 44 {{.*}} uitofp ; - ; AVX2: uitofpv16i32v16float + ; AVX2-LABEL: uitofpv16i32v16float ; AVX2: cost of 44 {{.*}} uitofp ; - ; AVX512F: uitofpv16i32v16float - ; AVX512F: cost of 46 {{.*}} uitofp + ; AVX512F-LABEL: uitofpv16i32v16float + ; AVX512F: cost of 1 {{.*}} uitofp %1 = uitofp <16 x i32> %a to <16 x float> ret <16 x float> %1 } define <32 x float> @uitofpv32i32v32float(<32 x i32> %a) { - ; SSE2: uitofpv32i32v32float + ; SSE2-LABEL: uitofpv32i32v32float ; SSE2: cost of 64 {{.*}} uitofp ; - ; AVX1: uitofpv32i32v32float + ; AVX1-LABEL: uitofpv32i32v32float ; AVX1: cost of 88 {{.*}} uitofp ; - ; AVX2: uitofpv32i32v32float + ; AVX2-LABEL: uitofpv32i32v32float ; AVX2: cost of 88 {{.*}} uitofp ; - ; AVX512F: uitofpv32i32v32float + ; AVX512F-LABEL: uitofpv32i32v32float ; AVX512F: cost of 92 {{.*}} uitofp %1 = uitofp <32 x i32> %a to <32 x float> ret <32 x float> %1 } define <2 x float> @uitofpv2i64v2float(<2 x i64> %a) { - ; SSE2: uitofpv2i64v2float + ; SSE2-LABEL: uitofpv2i64v2float ; SSE2: cost of 15 {{.*}} uitofp ; - ; AVX1: uitofpv2i64v2float + ; AVX1-LABEL: uitofpv2i64v2float ; AVX1: cost of 4 {{.*}} uitofp ; - ; AVX2: uitofpv2i64v2float + ; AVX2-LABEL: uitofpv2i64v2float ; AVX2: cost of 4 {{.*}} uitofp ; - ; AVX512F: uitofpv2i64v2float + ; AVX512F-LABEL: uitofpv2i64v2float ; AVX512F: cost of 4 {{.*}} uitofp %1 = uitofp <2 x i64> %a to <2 x float> ret <2 x float> %1 } define <4 x float> @uitofpv4i64v4float(<4 x i64> %a) { - ; SSE2: uitofpv4i64v4float + ; SSE2-LABEL: uitofpv4i64v4float ; SSE2: cost of 30 {{.*}} uitofp ; - ; AVX1: uitofpv4i64v4float + ; AVX1-LABEL: uitofpv4i64v4float ; AVX1: cost of 10 {{.*}} uitofp ; - ; AVX2: uitofpv4i64v4float + ; AVX2-LABEL: uitofpv4i64v4float ; AVX2: cost of 10 {{.*}} uitofp ; - ; AVX512F: uitofpv4i64v4float + ; AVX512F-LABEL: uitofpv4i64v4float ; AVX512F: cost of 10 {{.*}} uitofp %1 = uitofp <4 x i64> %a to <4 x float> ret <4 x float> %1 } define <8 x float> @uitofpv8i64v8float(<8 x i64> %a) { - ; SSE2: uitofpv8i64v8float + ; SSE2-LABEL: uitofpv8i64v8float ; SSE2: cost of 60 {{.*}} uitofp ; - ; AVX1: uitofpv8i64v8float + ; AVX1-LABEL: uitofpv8i64v8float ; AVX1: cost of 22 {{.*}} uitofp ; - ; AVX2: uitofpv8i64v8float + ; AVX2-LABEL: uitofpv8i64v8float ; AVX2: cost of 22 {{.*}} uitofp ; - ; AVX512F: uitofpv8i64v8float + ; AVX512F-LABEL: uitofpv8i64v8float ; AVX512F: cost of 22 {{.*}} uitofp %1 = uitofp <8 x i64> %a to <8 x float> ret <8 x float> %1 } define <16 x float> @uitofpv16i64v16float(<16 x i64> %a) { - ; SSE2: uitofpv16i64v16float + ; SSE2-LABEL: uitofpv16i64v16float ; SSE2: cost of 120 {{.*}} uitofp ; - ; AVX1: 
uitofpv16i64v16float + ; AVX1-LABEL: uitofpv16i64v16float ; AVX1: cost of 44 {{.*}} uitofp ; - ; AVX2: uitofpv16i64v16float + ; AVX2-LABEL: uitofpv16i64v16float ; AVX2: cost of 44 {{.*}} uitofp ; - ; AVX512F: uitofpv16i64v16float + ; AVX512F-LABEL: uitofpv16i64v16float ; AVX512F: cost of 46 {{.*}} uitofp %1 = uitofp <16 x i64> %a to <16 x float> ret <16 x float> %1 } define <32 x float> @uitofpv32i64v32float(<32 x i64> %a) { - ; SSE2: uitofpv32i64v32float + ; SSE2-LABEL: uitofpv32i64v32float ; SSE2: cost of 240 {{.*}} uitofp ; - ; AVX1: uitofpv32i64v32float + ; AVX1-LABEL: uitofpv32i64v32float ; AVX1: cost of 88 {{.*}} uitofp ; - ; AVX2: uitofpv32i64v32float + ; AVX2-LABEL: uitofpv32i64v32float ; AVX2: cost of 88 {{.*}} uitofp ; - ; AVX512F: uitofpv32i64v32float + ; AVX512F-LABEL: uitofpv32i64v32float ; AVX512F: cost of 92 {{.*}} uitofp %1 = uitofp <32 x i64> %a to <32 x float> ret <32 x float> %1 } +define <8 x i32> @fptouiv8f32v8i32(<8 x float> %a) { + ; AVX512F-LABEL: fptouiv8f32v8i32 + ; AVX512F: cost of 1 {{.*}} fptoui + %1 = fptoui <8 x float> %a to <8 x i32> + ret <8 x i32> %1 +} + +define <4 x i32> @fptouiv4f32v4i32(<4 x float> %a) { + ; AVX512F-LABEL: fptouiv4f32v4i32 + ; AVX512F: cost of 1 {{.*}} fptoui + %1 = fptoui <4 x float> %a to <4 x i32> + ret <4 x i32> %1 +} + +define <2 x i32> @fptouiv2f32v2i32(<2 x float> %a) { + ; AVX512F-LABEL: fptouiv2f32v2i32 + ; AVX512F: cost of 1 {{.*}} fptoui + %1 = fptoui <2 x float> %a to <2 x i32> + ret <2 x i32> %1 +} + +define <16 x i32> @fptouiv16f32v16i32(<16 x float> %a) { + ; AVX512F-LABEL: fptouiv16f32v16i32 + ; AVX512F: cost of 1 {{.*}} fptoui + %1 = fptoui <16 x float> %a to <16 x i32> + ret <16 x i32> %1 +} + +define <8 x i64> @fptouiv8f32v8i64(<8 x float> %a) { + ; AVX512DQ-LABEL: fptouiv8f32v8i64 + ; AVX512DQ: cost of 1 {{.*}} fptoui + %1 = fptoui <8 x float> %a to <8 x i64> + ret <8 x i64> %1 +} + +define <4 x i64> @fptouiv4f32v4i64(<4 x float> %a) { + ; AVX512DQ-LABEL: fptouiv4f32v4i64 + ; AVX512DQ: cost of 1 {{.*}} fptoui + %1 = fptoui <4 x float> %a to <4 x i64> + ret <4 x i64> %1 +} + +define <2 x i64> @fptouiv2f32v2i64(<2 x float> %a) { + ; AVX512DQ-LABEL: fptouiv2f32v2i64 + ; AVX512DQ: cost of 1 {{.*}} fptoui + %1 = fptoui <2 x float> %a to <2 x i64> + ret <2 x i64> %1 +} diff --git a/test/Analysis/CostModel/X86/vector_gep.ll b/test/Analysis/CostModel/X86/vector_gep.ll new file mode 100644 index 000000000000..e49f25871d66 --- /dev/null +++ b/test/Analysis/CostModel/X86/vector_gep.ll @@ -0,0 +1,17 @@ +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-linux-unknown-unknown -mattr=+avx512f | FileCheck %s + +%struct.S = type { [1000 x i32] } + + +declare <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>) + +define <4 x i32> @foov(<4 x %struct.S*> %s, i64 %base){ + %temp = insertelement <4 x i64> undef, i64 %base, i32 0 + %vector = shufflevector <4 x i64> %temp, <4 x i64> undef, <4 x i32> zeroinitializer +;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds %struct.S + %B = getelementptr inbounds %struct.S, <4 x %struct.S*> %s, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer +;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds [1000 x i32] + %arrayidx = getelementptr inbounds [1000 x i32], <4 x [1000 x i32]*> %B, <4 x i64> zeroinitializer, <4 x i64> %vector + %res = call <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %arrayidx, i32 4, <4 x i1> , <4 x i32> undef) + ret <4 x i32> %res +} diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 9bee504efece..138450ba8e02 
100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -39,6 +39,7 @@ set(LLVM_TEST_DEPENDS llvm-dis llvm-dsymutil llvm-dwarfdump + llvm-dwp llvm-extract llvm-lib llvm-link diff --git a/test/CodeGen/AArch64/arm64-atomic-128.ll b/test/CodeGen/AArch64/arm64-atomic-128.ll index a76cf74a6d0c..44c24c51f0df 100644 --- a/test/CodeGen/AArch64/arm64-atomic-128.ll +++ b/test/CodeGen/AArch64/arm64-atomic-128.ll @@ -173,10 +173,13 @@ define i128 @atomic_load_seq_cst(i128* %p) { ret i128 %r } -define i128 @atomic_load_relaxed(i128* %p) { +define i128 @atomic_load_relaxed(i64, i64, i128* %p) { ; CHECK-LABEL: atomic_load_relaxed: ; CHECK-NOT: dmb -; CHECK: ldxp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0] +; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: ldxp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x2] +; CHECK-NEXT: stxp [[SUCCESS:w[0-9]+]], [[LO]], [[HI]], [x2] +; CHECK: cbnz [[SUCCESS]], [[LABEL]] ; CHECK-NOT: dmb %r = load atomic i128, i128* %p monotonic, align 16 ret i128 %r diff --git a/test/CodeGen/AArch64/arm64-long-shift.ll b/test/CodeGen/AArch64/arm64-long-shift.ll index d5baf16bdd5c..ad89d3ff711b 100644 --- a/test/CodeGen/AArch64/arm64-long-shift.ll +++ b/test/CodeGen/AArch64/arm64-long-shift.ll @@ -2,18 +2,20 @@ define i128 @shl(i128 %r, i128 %s) nounwind readnone { ; CHECK-LABEL: shl: -; CHECK: lsl [[XREG_0:x[0-9]+]], x1, x2 -; CHECK-NEXT: orr w[[XREG_1:[0-9]+]], wzr, #0x40 -; CHECK-NEXT: sub [[XREG_2:x[0-9]+]], x[[XREG_1]], x2 -; CHECK-NEXT: lsr [[XREG_3:x[0-9]+]], x0, [[XREG_2]] -; CHECK-NEXT: orr [[XREG_6:x[0-9]+]], [[XREG_3]], [[XREG_0]] -; CHECK-NEXT: sub [[XREG_4:x[0-9]+]], x2, #64 -; CHECK-NEXT: lsl [[XREG_5:x[0-9]+]], x0, [[XREG_4]] -; CHECK-NEXT: cmp [[XREG_4]], #0 -; CHECK-NEXT: csel x1, [[XREG_5]], [[XREG_6]], ge -; CHECK-NEXT: lsl [[SMALLSHIFT_LO:x[0-9]+]], x0, x2 -; CHECK-NEXT: csel x0, xzr, [[SMALLSHIFT_LO]], ge -; CHECK-NEXT: ret +; CHECK: orr w[[SIXTY_FOUR:[0-9]+]], wzr, #0x40 +; CHECK: sub [[REV_SHIFT:x[0-9]+]], x[[SIXTY_FOUR]], x2 +; CHECK: lsr [[LO_FOR_HI_NORMAL:x[0-9]+]], x0, [[REV_SHIFT]] +; CHECK: cmp x2, #0 +; CHECK: csel [[LO_FOR_HI:x[0-9]+]], xzr, [[LO_FOR_HI_NORMAL]], eq +; CHECK: lsl [[HI_FOR_HI:x[0-9]+]], x1, x2 +; CHECK: orr [[HI_NORMAL:x[0-9]+]], [[LO_FOR_HI]], [[HI_FOR_HI]] +; CHECK: sub [[EXTRA_SHIFT:x[0-9]+]], x2, #64 +; CHECK: lsl [[HI_BIG_SHIFT:x[0-9]+]], x0, [[EXTRA_SHIFT]] +; CHECK: cmp [[EXTRA_SHIFT]], #0 +; CHECK: csel x1, [[HI_BIG_SHIFT]], [[HI_NORMAL]], ge +; CHECK: lsl [[SMALLSHIFT_LO:x[0-9]+]], x0, x2 +; CHECK: csel x0, xzr, [[SMALLSHIFT_LO]], ge +; CHECK: ret %shl = shl i128 %r, %s ret i128 %shl @@ -21,19 +23,21 @@ define i128 @shl(i128 %r, i128 %s) nounwind readnone { define i128 @ashr(i128 %r, i128 %s) nounwind readnone { ; CHECK-LABEL: ashr: -; CHECK: lsr [[XREG_0:x[0-9]+]], x0, x2 -; CHECK-NEXT: orr w[[XREG_1:[0-9]+]], wzr, #0x40 -; CHECK-NEXT: sub [[XREG_2:x[0-9]+]], x[[XREG_1]], x2 -; CHECK-NEXT: lsl [[XREG_3:x[0-9]+]], x1, [[XREG_2]] -; CHECK-NEXT: orr [[XREG_4:x[0-9]+]], [[XREG_0]], [[XREG_3]] -; CHECK-NEXT: sub [[XREG_5:x[0-9]+]], x2, #64 -; CHECK-NEXT: asr [[XREG_6:x[0-9]+]], x1, [[XREG_5]] -; CHECK-NEXT: cmp [[XREG_5]], #0 -; CHECK-NEXT: csel x0, [[XREG_6]], [[XREG_4]], ge -; CHECK-NEXT: asr [[SMALLSHIFT_HI:x[0-9]+]], x1, x2 -; CHECK-NEXT: asr [[BIGSHIFT_HI:x[0-9]+]], x1, #63 -; CHECK-NEXT: csel x1, [[BIGSHIFT_HI]], [[SMALLSHIFT_HI]], ge -; CHECK-NEXT: ret +; CHECK: orr w[[SIXTY_FOUR:[0-9]+]], wzr, #0x40 +; CHECK: sub [[REV_SHIFT:x[0-9]+]], x[[SIXTY_FOUR]], x2 +; CHECK: lsl [[HI_FOR_LO_NORMAL:x[0-9]+]], x1, [[REV_SHIFT]] +; CHECK: cmp x2, #0 +; 
CHECK: csel [[HI_FOR_LO:x[0-9]+]], xzr, [[HI_FOR_LO_NORMAL]], eq +; CHECK: lsr [[LO_FOR_LO:x[0-9]+]], x0, x2 +; CHECK: orr [[LO_NORMAL:x[0-9]+]], [[LO_FOR_LO]], [[HI_FOR_LO]] +; CHECK: sub [[EXTRA_SHIFT:x[0-9]+]], x2, #64 +; CHECK: asr [[LO_BIG_SHIFT:x[0-9]+]], x1, [[EXTRA_SHIFT]] +; CHECK: cmp [[EXTRA_SHIFT]], #0 +; CHECK: csel x0, [[LO_BIG_SHIFT]], [[LO_NORMAL]], ge +; CHECK: asr [[SMALLSHIFT_HI:x[0-9]+]], x1, x2 +; CHECK: asr [[BIGSHIFT_HI:x[0-9]+]], x1, #63 +; CHECK: csel x1, [[BIGSHIFT_HI]], [[SMALLSHIFT_HI]], ge +; CHECK: ret %shr = ashr i128 %r, %s ret i128 %shr @@ -41,18 +45,20 @@ define i128 @ashr(i128 %r, i128 %s) nounwind readnone { define i128 @lshr(i128 %r, i128 %s) nounwind readnone { ; CHECK-LABEL: lshr: -; CHECK: lsr [[XREG_0:x[0-9]+]], x0, x2 -; CHECK-NEXT: orr w[[XREG_1:[0-9]+]], wzr, #0x40 -; CHECK-NEXT: sub [[XREG_2:x[0-9]+]], x[[XREG_1]], x2 -; CHECK-NEXT: lsl [[XREG_3:x[0-9]+]], x1, [[XREG_2]] -; CHECK-NEXT: orr [[XREG_4:x[0-9]+]], [[XREG_0]], [[XREG_3]] -; CHECK-NEXT: sub [[XREG_5:x[0-9]+]], x2, #64 -; CHECK-NEXT: lsr [[XREG_6:x[0-9]+]], x1, [[XREG_5]] -; CHECK-NEXT: cmp [[XREG_5]], #0 -; CHECK-NEXT: csel x0, [[XREG_6]], [[XREG_4]], ge -; CHECK-NEXT: lsr [[SMALLSHIFT_HI:x[0-9]+]], x1, x2 -; CHECK-NEXT: csel x1, xzr, [[SMALLSHIFT_HI]], ge -; CHECK-NEXT: ret +; CHECK: orr w[[SIXTY_FOUR:[0-9]+]], wzr, #0x40 +; CHECK: sub [[REV_SHIFT:x[0-9]+]], x[[SIXTY_FOUR]], x2 +; CHECK: lsl [[HI_FOR_LO_NORMAL:x[0-9]+]], x1, [[REV_SHIFT]] +; CHECK: cmp x2, #0 +; CHECK: csel [[HI_FOR_LO:x[0-9]+]], xzr, [[HI_FOR_LO_NORMAL]], eq +; CHECK: lsr [[LO_FOR_LO:x[0-9]+]], x0, x2 +; CHECK: orr [[LO_NORMAL:x[0-9]+]], [[LO_FOR_LO]], [[HI_FOR_LO]] +; CHECK: sub [[EXTRA_SHIFT:x[0-9]+]], x2, #64 +; CHECK: lsr [[LO_BIG_SHIFT:x[0-9]+]], x1, [[EXTRA_SHIFT]] +; CHECK: cmp [[EXTRA_SHIFT]], #0 +; CHECK: csel x0, [[LO_BIG_SHIFT]], [[LO_NORMAL]], ge +; CHECK: lsr [[SMALLSHIFT_HI:x[0-9]+]], x1, x2 +; CHECK: csel x1, xzr, [[SMALLSHIFT_HI]], ge +; CHECK: ret %shr = lshr i128 %r, %s ret i128 %shr diff --git a/test/CodeGen/AArch64/bitfield-insert.ll b/test/CodeGen/AArch64/bitfield-insert.ll index 9df51dcc4478..509b547a5c82 100644 --- a/test/CodeGen/AArch64/bitfield-insert.ll +++ b/test/CodeGen/AArch64/bitfield-insert.ll @@ -215,3 +215,25 @@ define void @test_32bit_opnd1_better(i32* %existing, i32* %new) { ret void } + +; Tests when all the bits from one operand are not useful +define i32 @test_nouseful_bits(i8 %a, i32 %b) { +; CHECK-LABEL: test_nouseful_bits: +; CHECK: bfi +; CHECK: bfi +; CHECK: bfi +; CHECK-NOT: bfi +; CHECK-NOT: or +; CHECK: lsl + %conv = zext i8 %a to i32 ; 0 0 0 A + %shl = shl i32 %b, 8 ; B2 B1 B0 0 + %or = or i32 %conv, %shl ; B2 B1 B0 A + %shl.1 = shl i32 %or, 8 ; B1 B0 A 0 + %or.1 = or i32 %conv, %shl.1 ; B1 B0 A A + %shl.2 = shl i32 %or.1, 8 ; B0 A A 0 + %or.2 = or i32 %conv, %shl.2 ; B0 A A A + %shl.3 = shl i32 %or.2, 8 ; A A A 0 + %or.3 = or i32 %conv, %shl.3 ; A A A A + %shl.4 = shl i32 %or.3, 8 ; A A A 0 + ret i32 %shl.4 +} diff --git a/test/CodeGen/AArch64/cpus.ll b/test/CodeGen/AArch64/cpus.ll index 1266842fcc6d..a8399f92ebe4 100644 --- a/test/CodeGen/AArch64/cpus.ll +++ b/test/CodeGen/AArch64/cpus.ll @@ -2,6 +2,7 @@ ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=generic 2>&1 | FileCheck %s +; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=cortex-a35 2>&1 | FileCheck %s ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=cortex-a53 2>&1 | FileCheck %s ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=cortex-a57 2>&1 | FileCheck %s ; RUN: llc < %s 
-mtriple=arm64-unknown-unknown -mcpu=cortex-a72 2>&1 | FileCheck %s diff --git a/test/CodeGen/AArch64/remat.ll b/test/CodeGen/AArch64/remat.ll index 8b3e6dd5ad92..a397c339a2d7 100644 --- a/test/CodeGen/AArch64/remat.ll +++ b/test/CodeGen/AArch64/remat.ll @@ -1,3 +1,4 @@ +; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=cortex-a35 -o - %s | FileCheck %s ; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=cortex-a57 -o - %s | FileCheck %s ; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=cortex-a53 -o - %s | FileCheck %s ; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=cortex-a72 -o - %s | FileCheck %s diff --git a/test/CodeGen/AMDGPU/addrspacecast.ll b/test/CodeGen/AMDGPU/addrspacecast.ll new file mode 100644 index 000000000000..9be212feef00 --- /dev/null +++ b/test/CodeGen/AMDGPU/addrspacecast.ll @@ -0,0 +1,66 @@ +; RUN: not llc -O0 -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s 2>&1 | FileCheck -check-prefix=ERROR %s + +; ERROR: unsupported addrspacecast not implemented + +; XUN: llc -O0 -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-NO-PROMOTE %s +; XUN: llc -O0 -march=amdgcn -mcpu=bonaire -mattr=+promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-PROMOTE %s +; XUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=-promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-NO-PROMOTE %s +; XUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=+promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-PROMOTE %s + +; Disable optimizations in case there are optimizations added that +; specialize away generic pointer accesses. + +; CHECK-LABEL: {{^}}branch_use_flat_i32: +; CHECK: flat_store_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} +; CHECK: s_endpgm +define void @branch_use_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %gptr, i32 addrspace(3)* %lptr, i32 %x, i32 %c) #0 { +entry: + %cmp = icmp ne i32 %c, 0 + br i1 %cmp, label %local, label %global + +local: + %flat_local = addrspacecast i32 addrspace(3)* %lptr to i32 addrspace(4)* + br label %end + +global: + %flat_global = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)* + br label %end + +end: + %fptr = phi i32 addrspace(4)* [ %flat_local, %local ], [ %flat_global, %global ] + store i32 %x, i32 addrspace(4)* %fptr, align 4 +; %val = load i32, i32 addrspace(4)* %fptr, align 4 +; store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +; TODO: This should not be zero when registers are used for small +; scratch allocations again. + +; Check for prologue initializing special SGPRs pointing to scratch. 
+; CHECK-LABEL: {{^}}store_flat_scratch: +; CHECK: s_movk_i32 flat_scratch_lo, 0 +; CHECK-NO-PROMOTE: s_movk_i32 flat_scratch_hi, 0x28{{$}} +; CHECK-PROMOTE: s_movk_i32 flat_scratch_hi, 0x0{{$}} +; CHECK: flat_store_dword +; CHECK: s_barrier +; CHECK: flat_load_dword +define void @store_flat_scratch(i32 addrspace(1)* noalias %out, i32) #0 { + %alloca = alloca i32, i32 9, align 4 + %x = call i32 @llvm.r600.read.tidig.x() #3 + %pptr = getelementptr i32, i32* %alloca, i32 %x + %fptr = addrspacecast i32* %pptr to i32 addrspace(4)* + store i32 %x, i32 addrspace(4)* %fptr + ; Dummy call + call void @llvm.AMDGPU.barrier.local() #1 + %reload = load i32, i32 addrspace(4)* %fptr, align 4 + store i32 %reload, i32 addrspace(1)* %out, align 4 + ret void +} + +declare void @llvm.AMDGPU.barrier.local() #1 +declare i32 @llvm.r600.read.tidig.x() #3 + +attributes #0 = { nounwind } +attributes #1 = { nounwind noduplicate } +attributes #3 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll b/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll index 4d70ba837816..1c5bed3b905f 100644 --- a/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll +++ b/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll @@ -30,3 +30,69 @@ endif: done: ret void } + +; OPT-LABEL: @test_sink_noop_addrspacecast_flat_to_global_i32( +; OPT: getelementptr i32, i32 addrspace(4)* %out, +; OPT-CI-NOT: getelementptr +; OPT: br i1 + +; OPT-CI: ptrtoint +; OPT-CI: add +; OPT-CI: inttoptr +; OPT: br label + +; GCN-LABEL: {{^}}test_sink_noop_addrspacecast_flat_to_global_i32: +; CI: buffer_load_dword {{v[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:28 +define void @test_sink_noop_addrspacecast_flat_to_global_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) { +entry: + %out.gep = getelementptr i32, i32 addrspace(4)* %out, i64 999999 + %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 7 + %cast = addrspacecast i32 addrspace(4)* %in.gep to i32 addrspace(1)* + %tmp0 = icmp eq i32 %cond, 0 + br i1 %tmp0, label %endif, label %if + +if: + %tmp1 = load i32, i32 addrspace(1)* %cast + br label %endif + +endif: + %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] + store i32 %x, i32 addrspace(4)* %out.gep + br label %done + +done: + ret void +} + +; OPT-LABEL: @test_sink_noop_addrspacecast_flat_to_constant_i32( +; OPT: getelementptr i32, i32 addrspace(4)* %out, +; OPT-CI-NOT: getelementptr +; OPT: br i1 + +; OPT-CI: ptrtoint +; OPT-CI: add +; OPT-CI: inttoptr +; OPT: br label + +; GCN-LABEL: {{^}}test_sink_noop_addrspacecast_flat_to_constant_i32: +; CI: s_load_dword {{s[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd +define void @test_sink_noop_addrspacecast_flat_to_constant_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) { +entry: + %out.gep = getelementptr i32, i32 addrspace(4)* %out, i64 999999 + %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 7 + %cast = addrspacecast i32 addrspace(4)* %in.gep to i32 addrspace(2)* + %tmp0 = icmp eq i32 %cond, 0 + br i1 %tmp0, label %endif, label %if + +if: + %tmp1 = load i32, i32 addrspace(2)* %cast + br label %endif + +endif: + %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] + store i32 %x, i32 addrspace(4)* %out.gep + br label %done + +done: + ret void +} diff --git a/test/CodeGen/AMDGPU/flat-address-space.ll b/test/CodeGen/AMDGPU/flat-address-space.ll index 571685ca6aeb..4b56d6f19832 100644 --- a/test/CodeGen/AMDGPU/flat-address-space.ll +++ b/test/CodeGen/AMDGPU/flat-address-space.ll @@ -7,32 +7,6 @@ ; specialize away generic pointer accesses. 
-; CHECK-LABEL: {{^}}branch_use_flat_i32: -; CHECK: flat_store_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} -; CHECK: s_endpgm -define void @branch_use_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %gptr, i32 addrspace(3)* %lptr, i32 %x, i32 %c) #0 { -entry: - %cmp = icmp ne i32 %c, 0 - br i1 %cmp, label %local, label %global - -local: - %flat_local = addrspacecast i32 addrspace(3)* %lptr to i32 addrspace(4)* - br label %end - -global: - %flat_global = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)* - br label %end - -end: - %fptr = phi i32 addrspace(4)* [ %flat_local, %local ], [ %flat_global, %global ] - store i32 %x, i32 addrspace(4)* %fptr, align 4 -; %val = load i32, i32 addrspace(4)* %fptr, align 4 -; store i32 %val, i32 addrspace(1)* %out, align 4 - ret void -} - - - ; These testcases might become useless when there are optimizations to ; remove generic pointers. @@ -150,32 +124,6 @@ define void @zextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* ret void } - - -; TODO: This should not be zero when registers are used for small -; scratch allocations again. - -; Check for prologue initializing special SGPRs pointing to scratch. -; CHECK-LABEL: {{^}}store_flat_scratch: -; CHECK: s_movk_i32 flat_scratch_lo, 0 -; CHECK-NO-PROMOTE: s_movk_i32 flat_scratch_hi, 0x28{{$}} -; CHECK-PROMOTE: s_movk_i32 flat_scratch_hi, 0x0{{$}} -; CHECK: flat_store_dword -; CHECK: s_barrier -; CHECK: flat_load_dword -define void @store_flat_scratch(i32 addrspace(1)* noalias %out, i32) #0 { - %alloca = alloca i32, i32 9, align 4 - %x = call i32 @llvm.r600.read.tidig.x() #3 - %pptr = getelementptr i32, i32* %alloca, i32 %x - %fptr = addrspacecast i32* %pptr to i32 addrspace(4)* - store i32 %x, i32 addrspace(4)* %fptr - ; Dummy call - call void @llvm.AMDGPU.barrier.local() #1 - %reload = load i32, i32 addrspace(4)* %fptr, align 4 - store i32 %reload, i32 addrspace(1)* %out, align 4 - ret void -} - declare void @llvm.AMDGPU.barrier.local() #1 declare i32 @llvm.r600.read.tidig.x() #3 diff --git a/test/CodeGen/AMDGPU/hsa-globals.ll b/test/CodeGen/AMDGPU/hsa-globals.ll new file mode 100644 index 000000000000..b6483f0970b0 --- /dev/null +++ b/test/CodeGen/AMDGPU/hsa-globals.ll @@ -0,0 +1,141 @@ +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck --check-prefix=ASM %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=kaveri | llvm-readobj -symbols -s | FileCheck %s --check-prefix=ELF + +@internal_global_program = internal addrspace(1) global i32 0 +@common_global_program = common addrspace(1) global i32 0 +@external_global_program = addrspace(1) global i32 0 + +@internal_global_agent = internal addrspace(1) global i32 0, section ".hsadata_global_agent" +@common_global_agent = common addrspace(1) global i32 0, section ".hsadata_global_agent" +@external_global_agent = addrspace(1) global i32 0, section ".hsadata_global_agent" + +@internal_readonly = internal unnamed_addr addrspace(2) constant i32 0 +@external_readonly = unnamed_addr addrspace(2) constant i32 0 + +define void @test() { + ret void +} + +; ASM: .amdgpu_hsa_module_global internal_global +; ASM: .hsadata_global_program +; ASM: internal_global_program: +; ASM: .long 0 + +; ASM: .amdgpu_hsa_module_global common_global +; ASM: .hsadata_global_program +; ASM: common_global_program: +; ASM: .long 0 + +; ASM: .amdgpu_hsa_program_global external_global +; ASM: .hsadata_global_program +; ASM: external_global_program: +; ASM: .long 0 + +; ASM: 
.amdgpu_hsa_module_global internal_global +; ASM: .hsadata_global_agent +; ASM: internal_global_agent: +; ASM: .long 0 + +; ASM: .amdgpu_hsa_module_global common_global +; ASM: .hsadata_global_agent +; ASM: common_global_agent: +; ASM: .long 0 + +; ASM: .amdgpu_hsa_program_global external_global +; ASM: .hsadata_global_agent +; ASM: external_global_agent: +; ASM: .long 0 + +; ASM: .amdgpu_hsa_module_global internal_readonly +; ASM: .hsarodata_readonly_agent +; ASM: internal_readonly: +; ASM: .long 0 + +; ASM: .amdgpu_hsa_program_global external_readonly +; ASM: .hsarodata_readonly_agent +; ASM: external_readonly: +; ASM: .long 0 + +; ELF: Section { +; ELF: Name: .hsadata_global_program +; ELF: Type: SHT_PROGBITS (0x1) +; ELF: Flags [ (0x100003) +; ELF: SHF_ALLOC (0x2) +; ELF: SHF_AMDGPU_HSA_GLOBAL (0x100000) +; ELF: SHF_WRITE (0x1) +; ELF: ] +; ELF: } + +; ELF: Section { +; ELF: Name: .hsadata_global_agent +; ELF: Type: SHT_PROGBITS (0x1) +; ELF: Flags [ (0x900003) +; ELF: SHF_ALLOC (0x2) +; ELF: SHF_AMDGPU_HSA_AGENT (0x800000) +; ELF: SHF_AMDGPU_HSA_GLOBAL (0x100000) +; ELF: SHF_WRITE (0x1) +; ELF: ] +; ELF: } + +; ELF: Section { +; ELF: Name: .hsarodata_readonly_agent +; ELF: Type: SHT_PROGBITS (0x1) +; ELF: Flags [ (0xA00002) +; ELF: SHF_ALLOC (0x2) +; ELF: SHF_AMDGPU_HSA_AGENT (0x800000) +; ELF: SHF_AMDGPU_HSA_READONLY (0x200000) +; ELF: ] + +; ELF: Symbol { +; ELF: Name: common_global_agent +; ELF: Binding: Local +; ELF: Section: .hsadata_global_agent +; ELF: } + +; ELF: Symbol { +; ELF: Name: common_global_program +; ELF: Binding: Local +; ELF: Section: .hsadata_global_program +; ELF: } + +; ELF: Symbol { +; ELF: Name: internal_global_agent +; ELF: Binding: Local +; ELF: Type: Object +; ELF: Section: .hsadata_global_agent +; ELF: } + +; ELF: Symbol { +; ELF: Name: internal_global_program +; ELF: Binding: Local +; ELF: Type: Object +; ELF: Section: .hsadata_global_program +; ELF: } + +; ELF: Symbol { +; ELF: Name: internal_readonly +; ELF: Binding: Local +; ELF: Type: Object +; ELF: Section: .hsarodata_readonly_agent +; ELF: } + +; ELF: Symbol { +; ELF: Name: external_global_agent +; ELF: Binding: Global +; ELF: Type: Object +; ELF: Section: .hsadata_global_agent +; ELF: } + +; ELF: Symbol { +; ELF: Name: external_global_program +; ELF: Binding: Global +; ELF: Type: Object +; ELF: Section: .hsadata_global_program +; ELF: } + +; ELF: Symbol { +; ELF: Name: external_readonly +; ELF: Binding: Global +; ELF: Type: Object +; ELF: Section: .hsarodata_readonly_agent +; ELF: } diff --git a/test/CodeGen/AMDGPU/hsa-group-segment.ll b/test/CodeGen/AMDGPU/hsa-group-segment.ll new file mode 100644 index 000000000000..1999dc38a6b0 --- /dev/null +++ b/test/CodeGen/AMDGPU/hsa-group-segment.ll @@ -0,0 +1,14 @@ +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck --check-prefix=HSA %s + +@internal_group = internal addrspace(3) global i32 undef +@external_group = addrspace(3) global i32 undef + +define void @test() { +entry: + store i32 0, i32 addrspace(3)* @internal_group + store i32 0, i32 addrspace(3)* @external_group + ret void +} + +; HSA-NOT: internal_group: +; HSA-NOT: external_group: diff --git a/test/CodeGen/AMDGPU/hsa.ll b/test/CodeGen/AMDGPU/hsa.ll index ab87fdbc00da..d9bb586163dc 100644 --- a/test/CodeGen/AMDGPU/hsa.ll +++ b/test/CodeGen/AMDGPU/hsa.ll @@ -38,8 +38,10 @@ ; HSA: .amdgpu_hsa_kernel simple ; HSA: {{^}}simple: ; HSA: .amd_kernel_code_t +; HSA: enable_sgpr_private_segment_buffer = 1 +; HSA: enable_sgpr_kernarg_segment_ptr = 1 ; HSA: .end_amd_kernel_code_t -; HSA: 
s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[0:1], 0x0 +; HSA: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0 ; Make sure we are setting the ATC bit: ; HSA-CI: s_mov_b32 s[[HI:[0-9]]], 0x100f000 diff --git a/test/CodeGen/AMDGPU/kernel-args.ll b/test/CodeGen/AMDGPU/kernel-args.ll index 803b2ecced01..e9d98ac89e72 100644 --- a/test/CodeGen/AMDGPU/kernel-args.ll +++ b/test/CodeGen/AMDGPU/kernel-args.ll @@ -294,8 +294,8 @@ entry: ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X -; SI: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x11 -; VI: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x44 +; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11 +; VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44 define void @v8i32_arg(<8 x i32> addrspace(1)* nocapture %out, <8 x i32> %in) nounwind { entry: store <8 x i32> %in, <8 x i32> addrspace(1)* %out, align 4 @@ -311,7 +311,7 @@ entry: ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X -; SI: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x11 +; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11 define void @v8f32_arg(<8 x float> addrspace(1)* nocapture %out, <8 x float> %in) nounwind { entry: store <8 x float> %in, <8 x float> addrspace(1)* %out, align 4 @@ -413,8 +413,8 @@ entry: ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X -; SI: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19 -; VI: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x64 +; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19 +; VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64 define void @v16i32_arg(<16 x i32> addrspace(1)* nocapture %out, <16 x i32> %in) nounwind { entry: store <16 x i32> %in, <16 x i32> addrspace(1)* %out, align 4 @@ -438,8 +438,8 @@ entry: ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X -; SI: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19 -; VI: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x64 +; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19 +; VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64 define void @v16f32_arg(<16 x float> addrspace(1)* nocapture %out, <16 x float> %in) nounwind { entry: store <16 x float> %in, <16 x float> addrspace(1)* %out, align 4 diff --git a/test/CodeGen/AMDGPU/large-alloca-compute.ll b/test/CodeGen/AMDGPU/large-alloca-compute.ll new file mode 100644 index 000000000000..c348a2e7980f --- /dev/null +++ b/test/CodeGen/AMDGPU/large-alloca-compute.ll @@ -0,0 +1,57 @@ +; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=ALL %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=ALL %s +; RUN: llc -march=amdgcn -mcpu=bonaire -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=GCNHSA -check-prefix=CIHSA -check-prefix=ALL %s +; RUN: llc -march=amdgcn -mcpu=tonga -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=GCNHSA -check-prefix=VIHSA -check-prefix=ALL %s + +; FIXME: align on alloca seems to be ignored for private_segment_alignment + +; ALL-LABEL: {{^}}large_alloca_compute_shader: + +; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GCN: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GCN: s_mov_b32 s10, -1 +; CI: s_mov_b32 s11, 0x80f000 +; VI: s_mov_b32 s11, 0x800000 + + +; GCNHSA: .amd_kernel_code_t + +; 
GCNHSA: compute_pgm_rsrc2_scratch_en = 1 +; GCNHSA: compute_pgm_rsrc2_user_sgpr = 6 +; GCNHSA: compute_pgm_rsrc2_tgid_x_en = 1 +; GCNHSA: compute_pgm_rsrc2_tgid_y_en = 0 +; GCNHSA: compute_pgm_rsrc2_tgid_z_en = 0 +; GCNHSA: compute_pgm_rsrc2_tg_size_en = 0 +; GCNHSA: compute_pgm_rsrc2_tidig_comp_cnt = 0 + +; GCNHSA: enable_sgpr_private_segment_buffer = 1 +; GCNHSA: enable_sgpr_dispatch_ptr = 0 +; GCNHSA: enable_sgpr_queue_ptr = 0 +; GCNHSA: enable_sgpr_kernarg_segment_ptr = 1 +; GCNHSA: enable_sgpr_dispatch_id = 0 +; GCNHSA: enable_sgpr_flat_scratch_init = 0 +; GCNHSA: enable_sgpr_private_segment_size = 0 +; GCNHSA: enable_sgpr_grid_workgroup_count_x = 0 +; GCNHSA: enable_sgpr_grid_workgroup_count_y = 0 +; GCNHSA: enable_sgpr_grid_workgroup_count_z = 0 +; GCNHSA: workitem_private_segment_byte_size = 0 +; GCNHSA: private_segment_alignment = 4 +; GCNHSA: .end_amd_kernel_code_t + + +; GCNHSA: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[0:3], s7 offen +; GCNHSA: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[0:3], s7 offen + +; Scratch size = alloca size + emergency stack slot +; ALL: ; ScratchSize: 32772 +define void @large_alloca_compute_shader(i32 %x, i32 %y) #0 { + %large = alloca [8192 x i32], align 4 + %gep = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 8191 + store volatile i32 %x, i32* %gep + %gep1 = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 %y + %val = load volatile i32, i32* %gep1 + store volatile i32 %val, i32 addrspace(1)* undef + ret void +} + +attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/large-alloca-graphics.ll b/test/CodeGen/AMDGPU/large-alloca-graphics.ll new file mode 100644 index 000000000000..141ee2560152 --- /dev/null +++ b/test/CodeGen/AMDGPU/large-alloca-graphics.ll @@ -0,0 +1,47 @@ +; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=ALL %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=ALL %s + +; ALL-LABEL: {{^}}large_alloca_pixel_shader: +; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GCN: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GCN: s_mov_b32 s10, -1 +; CI: s_mov_b32 s11, 0x80f000 +; VI: s_mov_b32 s11, 0x800000 + +; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen + +; ALL: ; ScratchSize: 32772 +define void @large_alloca_pixel_shader(i32 %x, i32 %y) #1 { + %large = alloca [8192 x i32], align 4 + %gep = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 8191 + store volatile i32 %x, i32* %gep + %gep1 = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 %y + %val = load volatile i32, i32* %gep1 + store volatile i32 %val, i32 addrspace(1)* undef + ret void +} + +; ALL-LABEL: {{^}}large_alloca_pixel_shader_inreg: +; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GCN: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GCN: s_mov_b32 s10, -1 +; CI: s_mov_b32 s11, 0x80f000 +; VI: s_mov_b32 s11, 0x800000 + +; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen + +; ALL: ; ScratchSize: 32772 +define void @large_alloca_pixel_shader_inreg(i32 inreg %x, i32 inreg %y) #1 { + %large = alloca [8192 x i32], align 4 + %gep = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 8191 + store volatile i32 %x, i32* %gep + %gep1 = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 %y + %val = load volatile i32, i32* %gep1 + store volatile i32 
%val, i32 addrspace(1)* undef + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind "ShaderType"="0" } diff --git a/test/CodeGen/AMDGPU/large-alloca.ll b/test/CodeGen/AMDGPU/large-alloca.ll deleted file mode 100644 index e1122da78ef5..000000000000 --- a/test/CodeGen/AMDGPU/large-alloca.ll +++ /dev/null @@ -1,18 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s - -; GCN-LABEL: {{^}}large_alloca: -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen -; GCN: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen -; GCN: ScratchSize: 32776 -define void @large_alloca(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 { - %large = alloca [8192 x i32], align 4 - %gep = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 8191 - store i32 %x, i32* %gep - %gep1 = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 %y - %load = load i32, i32* %gep1 - store i32 %load, i32 addrspace(1)* %out - ret void -} - -attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.read.workdim.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.read.workdim.ll new file mode 100644 index 000000000000..6dc9d050eee6 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.read.workdim.ll @@ -0,0 +1,37 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=SI-NOHSA -check-prefix=GCN-NOHSA -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-NOHSA -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}read_workdim: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], KC0[2].Z + +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xb +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2c +; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN-NOHSA: buffer_store_dword [[VVAL]] +define void @read_workdim(i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.AMDGPU.read.workdim() #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}read_workdim_known_bits: +; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xb +; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2c +; GCN-NOT: 0xff +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN: buffer_store_dword [[VVAL]] +define void @read_workdim_known_bits(i32 addrspace(1)* %out) { +entry: + %dim = call i32 @llvm.AMDGPU.read.workdim() #0 + %shl = shl i32 %dim, 24 + %shr = lshr i32 %shl, 24 + store i32 %shr, i32 addrspace(1)* %out + ret void +} + +declare i32 @llvm.AMDGPU.read.workdim() #0 + +attributes #0 = { readnone } diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll index 719f7ffe0f1c..dc95cd1ee012 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll @@ -2,7 +2,7 @@ ; GCN-LABEL: {{^}}test: ; GCN: enable_sgpr_dispatch_ptr = 1 -; GCN: s_load_dword s{{[0-9]+}}, s[0:1], 0x0 +; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 define void @test(i32 addrspace(1)* %out) { %dispatch_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0 %header_ptr = bitcast i8 
addrspace(2)* %dispatch_ptr to i32 addrspace(2)* diff --git a/test/CodeGen/AMDGPU/llvm.dbg.value.ll b/test/CodeGen/AMDGPU/llvm.dbg.value.ll index c5aba2b76b89..cc109327d929 100644 --- a/test/CodeGen/AMDGPU/llvm.dbg.value.ll +++ b/test/CodeGen/AMDGPU/llvm.dbg.value.ll @@ -1,8 +1,8 @@ ; RUN: llc -O0 -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs < %s | FileCheck %s ; CHECK-LABEL: {{^}}test_debug_value: -; CHECK: s_load_dwordx2 -; CHECK: DEBUG_VALUE: test_debug_value:globalptr_arg <- %SGPR0_SGPR1 +; CHECK: s_load_dwordx2 s[4:5] +; CHECK: DEBUG_VALUE: test_debug_value:globalptr_arg <- %SGPR4_SGPR5 ; CHECK: buffer_store_dword ; CHECK: s_endpgm define void @test_debug_value(i32 addrspace(1)* nocapture %globalptr_arg) #0 !dbg !4 { diff --git a/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll b/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll new file mode 100644 index 000000000000..f2a7256e812d --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll @@ -0,0 +1,184 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=SI-NOHSA -check-prefix=GCN-NOHSA -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-NOHSA -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + + +; FUNC-LABEL: {{^}}local_size_x: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], KC0[1].Z + +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x6 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x18 +; CI-HSA: s_load_dword [[XY:s[0-9]+]], s[4:5], 0x1 +; VI-HSA: s_load_dword [[XY:s[0-9]+]], s[4:5], 0x4 + +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN: buffer_store_dword [[VVAL]] +define void @local_size_x(i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.r600.read.local.size.x() #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}local_size_y: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], KC0[1].W + +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x7 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1c +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN: buffer_store_dword [[VVAL]] +define void @local_size_y(i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.r600.read.local.size.y() #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}local_size_z: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], KC0[2].X + +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x20 +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN: buffer_store_dword [[VVAL]] +define void @local_size_z(i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.r600.read.local.size.z() #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}local_size_xy: +; SI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x6 +; SI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x7 +; VI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x18 +; VI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x1c +; GCN-DAG: v_mov_b32_e32 [[VY:v[0-9]+]], [[Y]] +; GCN: v_mul_u32_u24_e32 [[VAL:v[0-9]+]], [[X]], [[VY]] +; GCN: buffer_store_dword [[VAL]] +define void @local_size_xy(i32 addrspace(1)* %out) { +entry: + %x = call i32 @llvm.r600.read.local.size.x() #0 + %y = call i32 
@llvm.r600.read.local.size.y() #0 + %val = mul i32 %x, %y + store i32 %val, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}local_size_xz: + +; SI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x6 +; SI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x8 +; VI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x18 +; VI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x20 +; HSA-DAG: s_and_b32 [[X:s[0-9]+]], [[XY]], 0xffff +; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]] +; GCN: v_mul_u32_u24_e32 [[VAL:v[0-9]+]], [[X]], [[VZ]] +; GCN: buffer_store_dword [[VAL]] +define void @local_size_xz(i32 addrspace(1)* %out) { +entry: + %x = call i32 @llvm.r600.read.local.size.x() #0 + %z = call i32 @llvm.r600.read.local.size.z() #0 + %val = mul i32 %x, %z + store i32 %val, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}local_size_yz: +; HSA: enable_sgpr_private_segment_buffer = 1 +; HSA: enable_sgpr_dispatch_ptr = 1 + +; SI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x7 +; SI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x8 +; VI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x1c +; VI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x20 +; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]] +; GCN: v_mul_u32_u24_e32 [[VAL:v[0-9]+]], [[Y]], [[VZ]] +; GCN: buffer_store_dword [[VAL]] +define void @local_size_yz(i32 addrspace(1)* %out) { +entry: + %y = call i32 @llvm.r600.read.local.size.y() #0 + %z = call i32 @llvm.r600.read.local.size.z() #0 + %val = mul i32 %y, %z + store i32 %val, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}local_size_xyz: +; HSA: enable_sgpr_private_segment_buffer = 1 +; HSA: enable_sgpr_dispatch_ptr = 1 + +; SI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x6 +; SI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x7 +; SI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x8 +; VI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x18 +; VI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x1c +; VI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x20 +; GCN-DAG: v_mov_b32_e32 [[VY:v[0-9]+]], [[Y]] +; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]] +; GCN: v_mad_u32_u24 [[VAL:v[0-9]+]], [[X]], [[VY]], [[VZ]] +; GCN: buffer_store_dword [[VAL]] +define void @local_size_xyz(i32 addrspace(1)* %out) { +entry: + %x = call i32 @llvm.r600.read.local.size.x() #0 + %y = call i32 @llvm.r600.read.local.size.y() #0 + %z = call i32 @llvm.r600.read.local.size.z() #0 + %xy = mul i32 %x, %y + %xyz = add i32 %xy, %z + store i32 %xyz, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}local_size_x_known_bits: +; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x6 +; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x18 +; GCN-NOT: 0xffff +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN-NEXT: buffer_store_dword [[VVAL]] +define void @local_size_x_known_bits(i32 addrspace(1)* %out) { +entry: + %size = call i32 @llvm.r600.read.local.size.x() #0 + %shl = shl i32 %size, 16 + %shr = lshr i32 %shl, 16 + store i32 %shr, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}local_size_y_known_bits: +; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x7 +; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1c +; GCN-NOT: 0xffff +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN-NEXT: buffer_store_dword [[VVAL]] +define void @local_size_y_known_bits(i32 addrspace(1)* %out) { +entry: + %size = call i32 @llvm.r600.read.local.size.y() #0 + %shl = shl i32 %size, 16 + %shr = lshr i32 %shl, 16 + store i32 %shr, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}local_size_z_known_bits: +; 
SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8 +; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x20 +; GCN-NOT: 0xffff +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN-NEXT: buffer_store_dword [[VVAL]] +define void @local_size_z_known_bits(i32 addrspace(1)* %out) { +entry: + %size = call i32 @llvm.r600.read.local.size.z() #0 + %shl = shl i32 %size, 16 + %shr = lshr i32 %shl, 16 + store i32 %shr, i32 addrspace(1)* %out + ret void +} + +declare i32 @llvm.r600.read.local.size.x() #0 +declare i32 @llvm.r600.read.local.size.y() #0 +declare i32 @llvm.r600.read.local.size.z() #0 + +attributes #0 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/local-memory-two-objects.ll b/test/CodeGen/AMDGPU/local-memory-two-objects.ll index 7f31ef45b628..6b52b80ba082 100644 --- a/test/CodeGen/AMDGPU/local-memory-two-objects.ll +++ b/test/CodeGen/AMDGPU/local-memory-two-objects.ll @@ -10,7 +10,7 @@ ; EG: .long 166120 ; EG-NEXT: .long 8 ; GCN: .long 47180 -; GCN-NEXT: .long 38792 +; GCN-NEXT: .long 32900 ; EG: {{^}}local_memory_two_objects: diff --git a/test/CodeGen/AMDGPU/local-memory.ll b/test/CodeGen/AMDGPU/local-memory.ll index 9494ed75bd0c..9ffb59e70920 100644 --- a/test/CodeGen/AMDGPU/local-memory.ll +++ b/test/CodeGen/AMDGPU/local-memory.ll @@ -9,9 +9,9 @@ ; EG: .long 166120 ; EG-NEXT: .long 128 ; SI: .long 47180 -; SI-NEXT: .long 71560 +; SI-NEXT: .long 65668 ; CI: .long 47180 -; CI-NEXT: .long 38792 +; CI-NEXT: .long 32900 ; FUNC-LABEL: {{^}}local_memory: diff --git a/test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll b/test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll index 0e15bc878650..27a8e70aae13 100644 --- a/test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll +++ b/test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll @@ -3,7 +3,7 @@ ; register operands in the correct order when modifying the opcode of an ; instruction to V_ADD_I32_e32. -; CHECK: %19 = V_ADD_I32_e32 %13, %12, implicit-def %vcc, implicit %exec +; CHECK: %{{[0-9]+}} = V_ADD_I32_e32 %{{[0-9]+}}, %{{[0-9]+}}, implicit-def %vcc, implicit %exec define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { entry: diff --git a/test/CodeGen/AMDGPU/si-sgpr-spill.ll b/test/CodeGen/AMDGPU/si-sgpr-spill.ll index 84652701f773..d7b35fc631eb 100644 --- a/test/CodeGen/AMDGPU/si-sgpr-spill.ll +++ b/test/CodeGen/AMDGPU/si-sgpr-spill.ll @@ -6,6 +6,16 @@ ; CHECK-LABEL: {{^}}main: ; CHECK: s_wqm + +; Make sure we do not emit the unused scratch resource descriptor setup +; CHECK-NOT: s_mov_b32 +; CHECK-NOT: s_mov_b32 +; CHECK-NOT: s_mov_b32 +; CHECK-NOT: s_mov_b32 + +; CHECK: s_mov_b32 m0 + + ; Writing to M0 from an SMRD instruction will hang the GPU.
; CHECK-NOT: s_buffer_load_dword m0 ; CHECK: s_endpgm diff --git a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll new file mode 100644 index 000000000000..cd7c78f408dd --- /dev/null +++ b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll @@ -0,0 +1,585 @@ +; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; XUN: llc -march=amdgcn -mcpu=hawaii -mtriple=amdgcn-unknown-amdhsa -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CIHSA %s +; XUN: llc -march=amdgcn -mcpu=fiji -mtriple=amdgcn-unknown-amdhsa -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VIHSA %s + +; This ends up using all 256 registers and requires register +; scavenging which will fail to find an unused register. + +; Check the ScratchSize to avoid regressions from spilling +; intermediate register class copies. + +; FIXME: The same register is initialized to 0 for every spill. + +declare i32 @llvm.r600.read.tgid.x() #1 +declare i32 @llvm.r600.read.tgid.y() #1 +declare i32 @llvm.r600.read.tgid.z() #1 + +; GCN-LABEL: {{^}}spill_vgpr_compute: + +; GCN: s_mov_b32 s16, s3 +; GCN: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN-NEXT: s_mov_b32 s14, -1 +; SI-NEXT: s_mov_b32 s15, 0x80f000 +; VI-NEXT: s_mov_b32 s15, 0x800000 + + +; GCN: buffer_store_dword {{v[0-9]+}}, s[12:15], s16 offset:{{[0-9]+}} ; 4-byte Folded Spill + +; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}} +; GCN: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}} + +; GCN: NumVgprs: 256 +; GCN: ScratchSize: 1024 + +; s[0:3] input user SGPRs. s4,s5,s6 = workgroup IDs. s8 scratch offset.
+define void @spill_vgpr_compute(<4 x float> %arg6, float addrspace(1)* %arg, i32 %arg1, i32 %arg2, float %arg3, float %arg4, float %arg5) #0 { +bb: + %tmp = add i32 %arg1, %arg2 + %tmp7 = extractelement <4 x float> %arg6, i32 0 + %tmp8 = extractelement <4 x float> %arg6, i32 1 + %tmp9 = extractelement <4 x float> %arg6, i32 2 + %tmp10 = extractelement <4 x float> %arg6, i32 3 + %tmp11 = bitcast float %arg5 to i32 + br label %bb12 + +bb12: ; preds = %bb145, %bb + %tmp13 = phi float [ 0.000000e+00, %bb ], [ %tmp338, %bb145 ] + %tmp14 = phi float [ 0.000000e+00, %bb ], [ %tmp337, %bb145 ] + %tmp15 = phi float [ 0.000000e+00, %bb ], [ %tmp336, %bb145 ] + %tmp16 = phi float [ 0.000000e+00, %bb ], [ %tmp339, %bb145 ] + %tmp17 = phi float [ 0.000000e+00, %bb ], [ %tmp335, %bb145 ] + %tmp18 = phi float [ 0.000000e+00, %bb ], [ %tmp334, %bb145 ] + %tmp19 = phi float [ 0.000000e+00, %bb ], [ %tmp333, %bb145 ] + %tmp20 = phi float [ 0.000000e+00, %bb ], [ %tmp340, %bb145 ] + %tmp21 = phi float [ 0.000000e+00, %bb ], [ %tmp332, %bb145 ] + %tmp22 = phi float [ 0.000000e+00, %bb ], [ %tmp331, %bb145 ] + %tmp23 = phi float [ 0.000000e+00, %bb ], [ %tmp330, %bb145 ] + %tmp24 = phi float [ 0.000000e+00, %bb ], [ %tmp341, %bb145 ] + %tmp25 = phi float [ 0.000000e+00, %bb ], [ %tmp329, %bb145 ] + %tmp26 = phi float [ 0.000000e+00, %bb ], [ %tmp328, %bb145 ] + %tmp27 = phi float [ 0.000000e+00, %bb ], [ %tmp327, %bb145 ] + %tmp28 = phi float [ 0.000000e+00, %bb ], [ %tmp342, %bb145 ] + %tmp29 = phi float [ 0.000000e+00, %bb ], [ %tmp326, %bb145 ] + %tmp30 = phi float [ 0.000000e+00, %bb ], [ %tmp325, %bb145 ] + %tmp31 = phi float [ 0.000000e+00, %bb ], [ %tmp324, %bb145 ] + %tmp32 = phi float [ 0.000000e+00, %bb ], [ %tmp343, %bb145 ] + %tmp33 = phi float [ 0.000000e+00, %bb ], [ %tmp323, %bb145 ] + %tmp34 = phi float [ 0.000000e+00, %bb ], [ %tmp322, %bb145 ] + %tmp35 = phi float [ 0.000000e+00, %bb ], [ %tmp321, %bb145 ] + %tmp36 = phi float [ 0.000000e+00, %bb ], [ %tmp344, %bb145 ] + %tmp37 = phi float [ 0.000000e+00, %bb ], [ %tmp320, %bb145 ] + %tmp38 = phi float [ 0.000000e+00, %bb ], [ %tmp319, %bb145 ] + %tmp39 = phi float [ 0.000000e+00, %bb ], [ %tmp318, %bb145 ] + %tmp40 = phi float [ 0.000000e+00, %bb ], [ %tmp345, %bb145 ] + %tmp41 = phi float [ 0.000000e+00, %bb ], [ %tmp317, %bb145 ] + %tmp42 = phi float [ 0.000000e+00, %bb ], [ %tmp316, %bb145 ] + %tmp43 = phi float [ 0.000000e+00, %bb ], [ %tmp315, %bb145 ] + %tmp44 = phi float [ 0.000000e+00, %bb ], [ %tmp346, %bb145 ] + %tmp45 = phi float [ 0.000000e+00, %bb ], [ %tmp314, %bb145 ] + %tmp46 = phi float [ 0.000000e+00, %bb ], [ %tmp313, %bb145 ] + %tmp47 = phi float [ 0.000000e+00, %bb ], [ %tmp312, %bb145 ] + %tmp48 = phi float [ 0.000000e+00, %bb ], [ %tmp347, %bb145 ] + %tmp49 = phi float [ 0.000000e+00, %bb ], [ %tmp311, %bb145 ] + %tmp50 = phi float [ 0.000000e+00, %bb ], [ %tmp310, %bb145 ] + %tmp51 = phi float [ 0.000000e+00, %bb ], [ %tmp309, %bb145 ] + %tmp52 = phi float [ 0.000000e+00, %bb ], [ %tmp348, %bb145 ] + %tmp53 = phi float [ 0.000000e+00, %bb ], [ %tmp308, %bb145 ] + %tmp54 = phi float [ 0.000000e+00, %bb ], [ %tmp307, %bb145 ] + %tmp55 = phi float [ 0.000000e+00, %bb ], [ %tmp306, %bb145 ] + %tmp56 = phi float [ 0.000000e+00, %bb ], [ %tmp349, %bb145 ] + %tmp57 = phi float [ 0.000000e+00, %bb ], [ %tmp305, %bb145 ] + %tmp58 = phi float [ 0.000000e+00, %bb ], [ %tmp304, %bb145 ] + %tmp59 = phi float [ 0.000000e+00, %bb ], [ %tmp303, %bb145 ] + %tmp60 = phi float [ 0.000000e+00, %bb ], [ %tmp350, %bb145 ] + %tmp61 = phi 
float [ 0.000000e+00, %bb ], [ %tmp302, %bb145 ] + %tmp62 = phi float [ 0.000000e+00, %bb ], [ %tmp301, %bb145 ] + %tmp63 = phi float [ 0.000000e+00, %bb ], [ %tmp300, %bb145 ] + %tmp64 = phi float [ 0.000000e+00, %bb ], [ %tmp351, %bb145 ] + %tmp65 = phi float [ 0.000000e+00, %bb ], [ %tmp299, %bb145 ] + %tmp66 = phi float [ 0.000000e+00, %bb ], [ %tmp298, %bb145 ] + %tmp67 = phi float [ 0.000000e+00, %bb ], [ %tmp297, %bb145 ] + %tmp68 = phi float [ 0.000000e+00, %bb ], [ %tmp352, %bb145 ] + %tmp69 = phi float [ 0.000000e+00, %bb ], [ %tmp296, %bb145 ] + %tmp70 = phi float [ 0.000000e+00, %bb ], [ %tmp295, %bb145 ] + %tmp71 = phi float [ 0.000000e+00, %bb ], [ %tmp294, %bb145 ] + %tmp72 = phi float [ 0.000000e+00, %bb ], [ %tmp353, %bb145 ] + %tmp73 = phi float [ 0.000000e+00, %bb ], [ %tmp293, %bb145 ] + %tmp74 = phi float [ 0.000000e+00, %bb ], [ %tmp292, %bb145 ] + %tmp75 = phi float [ 0.000000e+00, %bb ], [ %tmp291, %bb145 ] + %tmp76 = phi float [ 0.000000e+00, %bb ], [ %tmp354, %bb145 ] + %tmp77 = phi float [ 0.000000e+00, %bb ], [ %tmp290, %bb145 ] + %tmp78 = phi float [ 0.000000e+00, %bb ], [ %tmp289, %bb145 ] + %tmp79 = phi float [ 0.000000e+00, %bb ], [ %tmp288, %bb145 ] + %tmp80 = phi float [ 0.000000e+00, %bb ], [ %tmp355, %bb145 ] + %tmp81 = phi float [ 0.000000e+00, %bb ], [ %tmp287, %bb145 ] + %tmp82 = phi float [ 0.000000e+00, %bb ], [ %tmp286, %bb145 ] + %tmp83 = phi float [ 0.000000e+00, %bb ], [ %tmp285, %bb145 ] + %tmp84 = phi float [ 0.000000e+00, %bb ], [ %tmp356, %bb145 ] + %tmp85 = phi float [ 0.000000e+00, %bb ], [ %tmp284, %bb145 ] + %tmp86 = phi float [ 0.000000e+00, %bb ], [ %tmp283, %bb145 ] + %tmp87 = phi float [ 0.000000e+00, %bb ], [ %tmp282, %bb145 ] + %tmp88 = phi float [ 0.000000e+00, %bb ], [ %tmp357, %bb145 ] + %tmp89 = phi float [ 0.000000e+00, %bb ], [ %tmp281, %bb145 ] + %tmp90 = phi float [ 0.000000e+00, %bb ], [ %tmp280, %bb145 ] + %tmp91 = phi float [ 0.000000e+00, %bb ], [ %tmp279, %bb145 ] + %tmp92 = phi float [ 0.000000e+00, %bb ], [ %tmp358, %bb145 ] + %tmp93 = phi float [ 0.000000e+00, %bb ], [ %tmp359, %bb145 ] + %tmp94 = phi float [ 0.000000e+00, %bb ], [ %tmp360, %bb145 ] + %tmp95 = phi float [ 0.000000e+00, %bb ], [ %tmp409, %bb145 ] + %tmp96 = phi float [ 0.000000e+00, %bb ], [ %tmp361, %bb145 ] + %tmp97 = phi float [ 0.000000e+00, %bb ], [ %tmp362, %bb145 ] + %tmp98 = phi float [ 0.000000e+00, %bb ], [ %tmp363, %bb145 ] + %tmp99 = phi float [ 0.000000e+00, %bb ], [ %tmp364, %bb145 ] + %tmp100 = phi float [ 0.000000e+00, %bb ], [ %tmp365, %bb145 ] + %tmp101 = phi float [ 0.000000e+00, %bb ], [ %tmp366, %bb145 ] + %tmp102 = phi float [ 0.000000e+00, %bb ], [ %tmp367, %bb145 ] + %tmp103 = phi float [ 0.000000e+00, %bb ], [ %tmp368, %bb145 ] + %tmp104 = phi float [ 0.000000e+00, %bb ], [ %tmp369, %bb145 ] + %tmp105 = phi float [ 0.000000e+00, %bb ], [ %tmp370, %bb145 ] + %tmp106 = phi float [ 0.000000e+00, %bb ], [ %tmp371, %bb145 ] + %tmp107 = phi float [ 0.000000e+00, %bb ], [ %tmp372, %bb145 ] + %tmp108 = phi float [ 0.000000e+00, %bb ], [ %tmp373, %bb145 ] + %tmp109 = phi float [ 0.000000e+00, %bb ], [ %tmp374, %bb145 ] + %tmp110 = phi float [ 0.000000e+00, %bb ], [ %tmp375, %bb145 ] + %tmp111 = phi float [ 0.000000e+00, %bb ], [ %tmp376, %bb145 ] + %tmp112 = phi float [ 0.000000e+00, %bb ], [ %tmp377, %bb145 ] + %tmp113 = phi float [ 0.000000e+00, %bb ], [ %tmp378, %bb145 ] + %tmp114 = phi float [ 0.000000e+00, %bb ], [ %tmp379, %bb145 ] + %tmp115 = phi float [ 0.000000e+00, %bb ], [ %tmp380, %bb145 ] + %tmp116 = phi float [ 
0.000000e+00, %bb ], [ %tmp381, %bb145 ] + %tmp117 = phi float [ 0.000000e+00, %bb ], [ %tmp382, %bb145 ] + %tmp118 = phi float [ 0.000000e+00, %bb ], [ %tmp383, %bb145 ] + %tmp119 = phi float [ 0.000000e+00, %bb ], [ %tmp384, %bb145 ] + %tmp120 = phi float [ 0.000000e+00, %bb ], [ %tmp385, %bb145 ] + %tmp121 = phi float [ 0.000000e+00, %bb ], [ %tmp386, %bb145 ] + %tmp122 = phi float [ 0.000000e+00, %bb ], [ %tmp387, %bb145 ] + %tmp123 = phi float [ 0.000000e+00, %bb ], [ %tmp388, %bb145 ] + %tmp124 = phi float [ 0.000000e+00, %bb ], [ %tmp389, %bb145 ] + %tmp125 = phi float [ 0.000000e+00, %bb ], [ %tmp390, %bb145 ] + %tmp126 = phi float [ 0.000000e+00, %bb ], [ %tmp391, %bb145 ] + %tmp127 = phi float [ 0.000000e+00, %bb ], [ %tmp392, %bb145 ] + %tmp128 = phi float [ 0.000000e+00, %bb ], [ %tmp393, %bb145 ] + %tmp129 = phi float [ 0.000000e+00, %bb ], [ %tmp394, %bb145 ] + %tmp130 = phi float [ 0.000000e+00, %bb ], [ %tmp395, %bb145 ] + %tmp131 = phi float [ 0.000000e+00, %bb ], [ %tmp396, %bb145 ] + %tmp132 = phi float [ 0.000000e+00, %bb ], [ %tmp397, %bb145 ] + %tmp133 = phi float [ 0.000000e+00, %bb ], [ %tmp398, %bb145 ] + %tmp134 = phi float [ 0.000000e+00, %bb ], [ %tmp399, %bb145 ] + %tmp135 = phi float [ 0.000000e+00, %bb ], [ %tmp400, %bb145 ] + %tmp136 = phi float [ 0.000000e+00, %bb ], [ %tmp401, %bb145 ] + %tmp137 = phi float [ 0.000000e+00, %bb ], [ %tmp402, %bb145 ] + %tmp138 = phi float [ 0.000000e+00, %bb ], [ %tmp403, %bb145 ] + %tmp139 = phi float [ 0.000000e+00, %bb ], [ %tmp404, %bb145 ] + %tmp140 = phi float [ 0.000000e+00, %bb ], [ %tmp405, %bb145 ] + %tmp141 = phi float [ 0.000000e+00, %bb ], [ %tmp406, %bb145 ] + %tmp142 = bitcast float %tmp95 to i32 + %tmp143 = icmp sgt i32 %tmp142, 125 + br i1 %tmp143, label %bb144, label %bb145 + +bb144: ; preds = %bb12 + store volatile float %arg3, float addrspace(1)* %arg + store volatile float %tmp91, float addrspace(1)* %arg + store volatile float %tmp90, float addrspace(1)* %arg + store volatile float %tmp89, float addrspace(1)* %arg + store volatile float %tmp87, float addrspace(1)* %arg + store volatile float %tmp86, float addrspace(1)* %arg + store volatile float %tmp85, float addrspace(1)* %arg + store volatile float %tmp83, float addrspace(1)* %arg + store volatile float %tmp82, float addrspace(1)* %arg + store volatile float %tmp81, float addrspace(1)* %arg + store volatile float %tmp79, float addrspace(1)* %arg + store volatile float %tmp78, float addrspace(1)* %arg + store volatile float %tmp77, float addrspace(1)* %arg + store volatile float %tmp75, float addrspace(1)* %arg + store volatile float %tmp74, float addrspace(1)* %arg + store volatile float %tmp73, float addrspace(1)* %arg + store volatile float %tmp71, float addrspace(1)* %arg + store volatile float %tmp70, float addrspace(1)* %arg + store volatile float %tmp69, float addrspace(1)* %arg + store volatile float %tmp67, float addrspace(1)* %arg + store volatile float %tmp66, float addrspace(1)* %arg + store volatile float %tmp65, float addrspace(1)* %arg + store volatile float %tmp63, float addrspace(1)* %arg + store volatile float %tmp62, float addrspace(1)* %arg + store volatile float %tmp61, float addrspace(1)* %arg + store volatile float %tmp59, float addrspace(1)* %arg + store volatile float %tmp58, float addrspace(1)* %arg + store volatile float %tmp57, float addrspace(1)* %arg + store volatile float %tmp55, float addrspace(1)* %arg + store volatile float %tmp54, float addrspace(1)* %arg + store volatile float %tmp53, float addrspace(1)* %arg + 
store volatile float %tmp51, float addrspace(1)* %arg + store volatile float %tmp50, float addrspace(1)* %arg + store volatile float %tmp49, float addrspace(1)* %arg + store volatile float %tmp47, float addrspace(1)* %arg + store volatile float %tmp46, float addrspace(1)* %arg + store volatile float %tmp45, float addrspace(1)* %arg + store volatile float %tmp43, float addrspace(1)* %arg + store volatile float %tmp42, float addrspace(1)* %arg + store volatile float %tmp41, float addrspace(1)* %arg + store volatile float %tmp39, float addrspace(1)* %arg + store volatile float %tmp38, float addrspace(1)* %arg + store volatile float %tmp37, float addrspace(1)* %arg + store volatile float %tmp35, float addrspace(1)* %arg + store volatile float %tmp34, float addrspace(1)* %arg + store volatile float %tmp33, float addrspace(1)* %arg + store volatile float %tmp31, float addrspace(1)* %arg + store volatile float %tmp30, float addrspace(1)* %arg + store volatile float %tmp29, float addrspace(1)* %arg + store volatile float %tmp27, float addrspace(1)* %arg + store volatile float %tmp26, float addrspace(1)* %arg + store volatile float %tmp25, float addrspace(1)* %arg + store volatile float %tmp23, float addrspace(1)* %arg + store volatile float %tmp22, float addrspace(1)* %arg + store volatile float %tmp21, float addrspace(1)* %arg + store volatile float %tmp19, float addrspace(1)* %arg + store volatile float %tmp18, float addrspace(1)* %arg + store volatile float %tmp17, float addrspace(1)* %arg + store volatile float %tmp15, float addrspace(1)* %arg + store volatile float %tmp14, float addrspace(1)* %arg + store volatile float %tmp13, float addrspace(1)* %arg + store volatile float %tmp16, float addrspace(1)* %arg + store volatile float %tmp20, float addrspace(1)* %arg + store volatile float %tmp24, float addrspace(1)* %arg + store volatile float %tmp28, float addrspace(1)* %arg + store volatile float %tmp32, float addrspace(1)* %arg + store volatile float %tmp36, float addrspace(1)* %arg + store volatile float %tmp40, float addrspace(1)* %arg + store volatile float %tmp44, float addrspace(1)* %arg + store volatile float %tmp48, float addrspace(1)* %arg + store volatile float %tmp52, float addrspace(1)* %arg + store volatile float %tmp56, float addrspace(1)* %arg + store volatile float %tmp60, float addrspace(1)* %arg + store volatile float %tmp64, float addrspace(1)* %arg + store volatile float %tmp68, float addrspace(1)* %arg + store volatile float %tmp72, float addrspace(1)* %arg + store volatile float %tmp76, float addrspace(1)* %arg + store volatile float %tmp80, float addrspace(1)* %arg + store volatile float %tmp84, float addrspace(1)* %arg + store volatile float %tmp88, float addrspace(1)* %arg + store volatile float %tmp92, float addrspace(1)* %arg + store volatile float %tmp93, float addrspace(1)* %arg + store volatile float %tmp94, float addrspace(1)* %arg + store volatile float %tmp96, float addrspace(1)* %arg + store volatile float %tmp97, float addrspace(1)* %arg + store volatile float %tmp98, float addrspace(1)* %arg + store volatile float %tmp99, float addrspace(1)* %arg + store volatile float %tmp100, float addrspace(1)* %arg + store volatile float %tmp101, float addrspace(1)* %arg + store volatile float %tmp102, float addrspace(1)* %arg + store volatile float %tmp103, float addrspace(1)* %arg + store volatile float %tmp104, float addrspace(1)* %arg + store volatile float %tmp105, float addrspace(1)* %arg + store volatile float %tmp106, float addrspace(1)* %arg + store volatile 
float %tmp107, float addrspace(1)* %arg + store volatile float %tmp108, float addrspace(1)* %arg + store volatile float %tmp109, float addrspace(1)* %arg + store volatile float %tmp110, float addrspace(1)* %arg + store volatile float %tmp111, float addrspace(1)* %arg + store volatile float %tmp112, float addrspace(1)* %arg + store volatile float %tmp113, float addrspace(1)* %arg + store volatile float %tmp114, float addrspace(1)* %arg + store volatile float %tmp115, float addrspace(1)* %arg + store volatile float %tmp116, float addrspace(1)* %arg + store volatile float %tmp117, float addrspace(1)* %arg + store volatile float %tmp118, float addrspace(1)* %arg + store volatile float %tmp119, float addrspace(1)* %arg + store volatile float %tmp120, float addrspace(1)* %arg + store volatile float %tmp121, float addrspace(1)* %arg + store volatile float %tmp122, float addrspace(1)* %arg + store volatile float %tmp123, float addrspace(1)* %arg + store volatile float %tmp124, float addrspace(1)* %arg + store volatile float %tmp125, float addrspace(1)* %arg + store volatile float %tmp126, float addrspace(1)* %arg + store volatile float %tmp127, float addrspace(1)* %arg + store volatile float %tmp128, float addrspace(1)* %arg + store volatile float %tmp129, float addrspace(1)* %arg + store volatile float %tmp130, float addrspace(1)* %arg + store volatile float %tmp131, float addrspace(1)* %arg + store volatile float %tmp132, float addrspace(1)* %arg + store volatile float %tmp133, float addrspace(1)* %arg + store volatile float %tmp134, float addrspace(1)* %arg + store volatile float %tmp135, float addrspace(1)* %arg + store volatile float %tmp136, float addrspace(1)* %arg + store volatile float %tmp137, float addrspace(1)* %arg + store volatile float %tmp138, float addrspace(1)* %arg + store volatile float %tmp139, float addrspace(1)* %arg + store volatile float %arg4, float addrspace(1)* %arg + store volatile float %tmp7, float addrspace(1)* %arg + store volatile float %tmp8, float addrspace(1)* %arg + store volatile float %tmp9, float addrspace(1)* %arg + store volatile float %tmp10, float addrspace(1)* %arg + ret void + +bb145: ; preds = %bb12 + %tmp146 = bitcast float %tmp95 to i32 + %tmp147 = bitcast float %tmp95 to i32 + %tmp148 = add i32 %tmp11, %tmp147 + %tmp149 = bitcast i32 %tmp148 to float + %tmp150 = insertelement <128 x float> undef, float %tmp91, i32 0 + %tmp151 = insertelement <128 x float> %tmp150, float %tmp90, i32 1 + %tmp152 = insertelement <128 x float> %tmp151, float %tmp89, i32 2 + %tmp153 = insertelement <128 x float> %tmp152, float %tmp87, i32 3 + %tmp154 = insertelement <128 x float> %tmp153, float %tmp86, i32 4 + %tmp155 = insertelement <128 x float> %tmp154, float %tmp85, i32 5 + %tmp156 = insertelement <128 x float> %tmp155, float %tmp83, i32 6 + %tmp157 = insertelement <128 x float> %tmp156, float %tmp82, i32 7 + %tmp158 = insertelement <128 x float> %tmp157, float %tmp81, i32 8 + %tmp159 = insertelement <128 x float> %tmp158, float %tmp79, i32 9 + %tmp160 = insertelement <128 x float> %tmp159, float %tmp78, i32 10 + %tmp161 = insertelement <128 x float> %tmp160, float %tmp77, i32 11 + %tmp162 = insertelement <128 x float> %tmp161, float %tmp75, i32 12 + %tmp163 = insertelement <128 x float> %tmp162, float %tmp74, i32 13 + %tmp164 = insertelement <128 x float> %tmp163, float %tmp73, i32 14 + %tmp165 = insertelement <128 x float> %tmp164, float %tmp71, i32 15 + %tmp166 = insertelement <128 x float> %tmp165, float %tmp70, i32 16 + %tmp167 = insertelement <128 x float> 
%tmp166, float %tmp69, i32 17 + %tmp168 = insertelement <128 x float> %tmp167, float %tmp67, i32 18 + %tmp169 = insertelement <128 x float> %tmp168, float %tmp66, i32 19 + %tmp170 = insertelement <128 x float> %tmp169, float %tmp65, i32 20 + %tmp171 = insertelement <128 x float> %tmp170, float %tmp63, i32 21 + %tmp172 = insertelement <128 x float> %tmp171, float %tmp62, i32 22 + %tmp173 = insertelement <128 x float> %tmp172, float %tmp61, i32 23 + %tmp174 = insertelement <128 x float> %tmp173, float %tmp59, i32 24 + %tmp175 = insertelement <128 x float> %tmp174, float %tmp58, i32 25 + %tmp176 = insertelement <128 x float> %tmp175, float %tmp57, i32 26 + %tmp177 = insertelement <128 x float> %tmp176, float %tmp55, i32 27 + %tmp178 = insertelement <128 x float> %tmp177, float %tmp54, i32 28 + %tmp179 = insertelement <128 x float> %tmp178, float %tmp53, i32 29 + %tmp180 = insertelement <128 x float> %tmp179, float %tmp51, i32 30 + %tmp181 = insertelement <128 x float> %tmp180, float %tmp50, i32 31 + %tmp182 = insertelement <128 x float> %tmp181, float %tmp49, i32 32 + %tmp183 = insertelement <128 x float> %tmp182, float %tmp47, i32 33 + %tmp184 = insertelement <128 x float> %tmp183, float %tmp46, i32 34 + %tmp185 = insertelement <128 x float> %tmp184, float %tmp45, i32 35 + %tmp186 = insertelement <128 x float> %tmp185, float %tmp43, i32 36 + %tmp187 = insertelement <128 x float> %tmp186, float %tmp42, i32 37 + %tmp188 = insertelement <128 x float> %tmp187, float %tmp41, i32 38 + %tmp189 = insertelement <128 x float> %tmp188, float %tmp39, i32 39 + %tmp190 = insertelement <128 x float> %tmp189, float %tmp38, i32 40 + %tmp191 = insertelement <128 x float> %tmp190, float %tmp37, i32 41 + %tmp192 = insertelement <128 x float> %tmp191, float %tmp35, i32 42 + %tmp193 = insertelement <128 x float> %tmp192, float %tmp34, i32 43 + %tmp194 = insertelement <128 x float> %tmp193, float %tmp33, i32 44 + %tmp195 = insertelement <128 x float> %tmp194, float %tmp31, i32 45 + %tmp196 = insertelement <128 x float> %tmp195, float %tmp30, i32 46 + %tmp197 = insertelement <128 x float> %tmp196, float %tmp29, i32 47 + %tmp198 = insertelement <128 x float> %tmp197, float %tmp27, i32 48 + %tmp199 = insertelement <128 x float> %tmp198, float %tmp26, i32 49 + %tmp200 = insertelement <128 x float> %tmp199, float %tmp25, i32 50 + %tmp201 = insertelement <128 x float> %tmp200, float %tmp23, i32 51 + %tmp202 = insertelement <128 x float> %tmp201, float %tmp22, i32 52 + %tmp203 = insertelement <128 x float> %tmp202, float %tmp21, i32 53 + %tmp204 = insertelement <128 x float> %tmp203, float %tmp19, i32 54 + %tmp205 = insertelement <128 x float> %tmp204, float %tmp18, i32 55 + %tmp206 = insertelement <128 x float> %tmp205, float %tmp17, i32 56 + %tmp207 = insertelement <128 x float> %tmp206, float %tmp15, i32 57 + %tmp208 = insertelement <128 x float> %tmp207, float %tmp14, i32 58 + %tmp209 = insertelement <128 x float> %tmp208, float %tmp13, i32 59 + %tmp210 = insertelement <128 x float> %tmp209, float %tmp16, i32 60 + %tmp211 = insertelement <128 x float> %tmp210, float %tmp20, i32 61 + %tmp212 = insertelement <128 x float> %tmp211, float %tmp24, i32 62 + %tmp213 = insertelement <128 x float> %tmp212, float %tmp28, i32 63 + %tmp214 = insertelement <128 x float> %tmp213, float %tmp32, i32 64 + %tmp215 = insertelement <128 x float> %tmp214, float %tmp36, i32 65 + %tmp216 = insertelement <128 x float> %tmp215, float %tmp40, i32 66 + %tmp217 = insertelement <128 x float> %tmp216, float %tmp44, i32 67 + %tmp218 = 
insertelement <128 x float> %tmp217, float %tmp48, i32 68 + %tmp219 = insertelement <128 x float> %tmp218, float %tmp52, i32 69 + %tmp220 = insertelement <128 x float> %tmp219, float %tmp56, i32 70 + %tmp221 = insertelement <128 x float> %tmp220, float %tmp60, i32 71 + %tmp222 = insertelement <128 x float> %tmp221, float %tmp64, i32 72 + %tmp223 = insertelement <128 x float> %tmp222, float %tmp68, i32 73 + %tmp224 = insertelement <128 x float> %tmp223, float %tmp72, i32 74 + %tmp225 = insertelement <128 x float> %tmp224, float %tmp76, i32 75 + %tmp226 = insertelement <128 x float> %tmp225, float %tmp80, i32 76 + %tmp227 = insertelement <128 x float> %tmp226, float %tmp84, i32 77 + %tmp228 = insertelement <128 x float> %tmp227, float %tmp88, i32 78 + %tmp229 = insertelement <128 x float> %tmp228, float %tmp92, i32 79 + %tmp230 = insertelement <128 x float> %tmp229, float %tmp93, i32 80 + %tmp231 = insertelement <128 x float> %tmp230, float %tmp94, i32 81 + %tmp232 = insertelement <128 x float> %tmp231, float %tmp96, i32 82 + %tmp233 = insertelement <128 x float> %tmp232, float %tmp97, i32 83 + %tmp234 = insertelement <128 x float> %tmp233, float %tmp98, i32 84 + %tmp235 = insertelement <128 x float> %tmp234, float %tmp99, i32 85 + %tmp236 = insertelement <128 x float> %tmp235, float %tmp100, i32 86 + %tmp237 = insertelement <128 x float> %tmp236, float %tmp101, i32 87 + %tmp238 = insertelement <128 x float> %tmp237, float %tmp102, i32 88 + %tmp239 = insertelement <128 x float> %tmp238, float %tmp103, i32 89 + %tmp240 = insertelement <128 x float> %tmp239, float %tmp104, i32 90 + %tmp241 = insertelement <128 x float> %tmp240, float %tmp105, i32 91 + %tmp242 = insertelement <128 x float> %tmp241, float %tmp106, i32 92 + %tmp243 = insertelement <128 x float> %tmp242, float %tmp107, i32 93 + %tmp244 = insertelement <128 x float> %tmp243, float %tmp108, i32 94 + %tmp245 = insertelement <128 x float> %tmp244, float %tmp109, i32 95 + %tmp246 = insertelement <128 x float> %tmp245, float %tmp110, i32 96 + %tmp247 = insertelement <128 x float> %tmp246, float %tmp111, i32 97 + %tmp248 = insertelement <128 x float> %tmp247, float %tmp112, i32 98 + %tmp249 = insertelement <128 x float> %tmp248, float %tmp113, i32 99 + %tmp250 = insertelement <128 x float> %tmp249, float %tmp114, i32 100 + %tmp251 = insertelement <128 x float> %tmp250, float %tmp115, i32 101 + %tmp252 = insertelement <128 x float> %tmp251, float %tmp116, i32 102 + %tmp253 = insertelement <128 x float> %tmp252, float %tmp117, i32 103 + %tmp254 = insertelement <128 x float> %tmp253, float %tmp118, i32 104 + %tmp255 = insertelement <128 x float> %tmp254, float %tmp119, i32 105 + %tmp256 = insertelement <128 x float> %tmp255, float %tmp120, i32 106 + %tmp257 = insertelement <128 x float> %tmp256, float %tmp121, i32 107 + %tmp258 = insertelement <128 x float> %tmp257, float %tmp122, i32 108 + %tmp259 = insertelement <128 x float> %tmp258, float %tmp123, i32 109 + %tmp260 = insertelement <128 x float> %tmp259, float %tmp124, i32 110 + %tmp261 = insertelement <128 x float> %tmp260, float %tmp125, i32 111 + %tmp262 = insertelement <128 x float> %tmp261, float %tmp126, i32 112 + %tmp263 = insertelement <128 x float> %tmp262, float %tmp127, i32 113 + %tmp264 = insertelement <128 x float> %tmp263, float %tmp128, i32 114 + %tmp265 = insertelement <128 x float> %tmp264, float %tmp129, i32 115 + %tmp266 = insertelement <128 x float> %tmp265, float %tmp130, i32 116 + %tmp267 = insertelement <128 x float> %tmp266, float %tmp131, i32 117 + %tmp268 = 
insertelement <128 x float> %tmp267, float %tmp132, i32 118 + %tmp269 = insertelement <128 x float> %tmp268, float %tmp133, i32 119 + %tmp270 = insertelement <128 x float> %tmp269, float %tmp134, i32 120 + %tmp271 = insertelement <128 x float> %tmp270, float %tmp135, i32 121 + %tmp272 = insertelement <128 x float> %tmp271, float %tmp136, i32 122 + %tmp273 = insertelement <128 x float> %tmp272, float %tmp137, i32 123 + %tmp274 = insertelement <128 x float> %tmp273, float %tmp138, i32 124 + %tmp275 = insertelement <128 x float> %tmp274, float %tmp139, i32 125 + %tmp276 = insertelement <128 x float> %tmp275, float %tmp140, i32 126 + %tmp277 = insertelement <128 x float> %tmp276, float %tmp141, i32 127 + %tmp278 = insertelement <128 x float> %tmp277, float %tmp149, i32 %tmp146 + %tmp279 = extractelement <128 x float> %tmp278, i32 0 + %tmp280 = extractelement <128 x float> %tmp278, i32 1 + %tmp281 = extractelement <128 x float> %tmp278, i32 2 + %tmp282 = extractelement <128 x float> %tmp278, i32 3 + %tmp283 = extractelement <128 x float> %tmp278, i32 4 + %tmp284 = extractelement <128 x float> %tmp278, i32 5 + %tmp285 = extractelement <128 x float> %tmp278, i32 6 + %tmp286 = extractelement <128 x float> %tmp278, i32 7 + %tmp287 = extractelement <128 x float> %tmp278, i32 8 + %tmp288 = extractelement <128 x float> %tmp278, i32 9 + %tmp289 = extractelement <128 x float> %tmp278, i32 10 + %tmp290 = extractelement <128 x float> %tmp278, i32 11 + %tmp291 = extractelement <128 x float> %tmp278, i32 12 + %tmp292 = extractelement <128 x float> %tmp278, i32 13 + %tmp293 = extractelement <128 x float> %tmp278, i32 14 + %tmp294 = extractelement <128 x float> %tmp278, i32 15 + %tmp295 = extractelement <128 x float> %tmp278, i32 16 + %tmp296 = extractelement <128 x float> %tmp278, i32 17 + %tmp297 = extractelement <128 x float> %tmp278, i32 18 + %tmp298 = extractelement <128 x float> %tmp278, i32 19 + %tmp299 = extractelement <128 x float> %tmp278, i32 20 + %tmp300 = extractelement <128 x float> %tmp278, i32 21 + %tmp301 = extractelement <128 x float> %tmp278, i32 22 + %tmp302 = extractelement <128 x float> %tmp278, i32 23 + %tmp303 = extractelement <128 x float> %tmp278, i32 24 + %tmp304 = extractelement <128 x float> %tmp278, i32 25 + %tmp305 = extractelement <128 x float> %tmp278, i32 26 + %tmp306 = extractelement <128 x float> %tmp278, i32 27 + %tmp307 = extractelement <128 x float> %tmp278, i32 28 + %tmp308 = extractelement <128 x float> %tmp278, i32 29 + %tmp309 = extractelement <128 x float> %tmp278, i32 30 + %tmp310 = extractelement <128 x float> %tmp278, i32 31 + %tmp311 = extractelement <128 x float> %tmp278, i32 32 + %tmp312 = extractelement <128 x float> %tmp278, i32 33 + %tmp313 = extractelement <128 x float> %tmp278, i32 34 + %tmp314 = extractelement <128 x float> %tmp278, i32 35 + %tmp315 = extractelement <128 x float> %tmp278, i32 36 + %tmp316 = extractelement <128 x float> %tmp278, i32 37 + %tmp317 = extractelement <128 x float> %tmp278, i32 38 + %tmp318 = extractelement <128 x float> %tmp278, i32 39 + %tmp319 = extractelement <128 x float> %tmp278, i32 40 + %tmp320 = extractelement <128 x float> %tmp278, i32 41 + %tmp321 = extractelement <128 x float> %tmp278, i32 42 + %tmp322 = extractelement <128 x float> %tmp278, i32 43 + %tmp323 = extractelement <128 x float> %tmp278, i32 44 + %tmp324 = extractelement <128 x float> %tmp278, i32 45 + %tmp325 = extractelement <128 x float> %tmp278, i32 46 + %tmp326 = extractelement <128 x float> %tmp278, i32 47 + %tmp327 = extractelement <128 x float> 
%tmp278, i32 48 + %tmp328 = extractelement <128 x float> %tmp278, i32 49 + %tmp329 = extractelement <128 x float> %tmp278, i32 50 + %tmp330 = extractelement <128 x float> %tmp278, i32 51 + %tmp331 = extractelement <128 x float> %tmp278, i32 52 + %tmp332 = extractelement <128 x float> %tmp278, i32 53 + %tmp333 = extractelement <128 x float> %tmp278, i32 54 + %tmp334 = extractelement <128 x float> %tmp278, i32 55 + %tmp335 = extractelement <128 x float> %tmp278, i32 56 + %tmp336 = extractelement <128 x float> %tmp278, i32 57 + %tmp337 = extractelement <128 x float> %tmp278, i32 58 + %tmp338 = extractelement <128 x float> %tmp278, i32 59 + %tmp339 = extractelement <128 x float> %tmp278, i32 60 + %tmp340 = extractelement <128 x float> %tmp278, i32 61 + %tmp341 = extractelement <128 x float> %tmp278, i32 62 + %tmp342 = extractelement <128 x float> %tmp278, i32 63 + %tmp343 = extractelement <128 x float> %tmp278, i32 64 + %tmp344 = extractelement <128 x float> %tmp278, i32 65 + %tmp345 = extractelement <128 x float> %tmp278, i32 66 + %tmp346 = extractelement <128 x float> %tmp278, i32 67 + %tmp347 = extractelement <128 x float> %tmp278, i32 68 + %tmp348 = extractelement <128 x float> %tmp278, i32 69 + %tmp349 = extractelement <128 x float> %tmp278, i32 70 + %tmp350 = extractelement <128 x float> %tmp278, i32 71 + %tmp351 = extractelement <128 x float> %tmp278, i32 72 + %tmp352 = extractelement <128 x float> %tmp278, i32 73 + %tmp353 = extractelement <128 x float> %tmp278, i32 74 + %tmp354 = extractelement <128 x float> %tmp278, i32 75 + %tmp355 = extractelement <128 x float> %tmp278, i32 76 + %tmp356 = extractelement <128 x float> %tmp278, i32 77 + %tmp357 = extractelement <128 x float> %tmp278, i32 78 + %tmp358 = extractelement <128 x float> %tmp278, i32 79 + %tmp359 = extractelement <128 x float> %tmp278, i32 80 + %tmp360 = extractelement <128 x float> %tmp278, i32 81 + %tmp361 = extractelement <128 x float> %tmp278, i32 82 + %tmp362 = extractelement <128 x float> %tmp278, i32 83 + %tmp363 = extractelement <128 x float> %tmp278, i32 84 + %tmp364 = extractelement <128 x float> %tmp278, i32 85 + %tmp365 = extractelement <128 x float> %tmp278, i32 86 + %tmp366 = extractelement <128 x float> %tmp278, i32 87 + %tmp367 = extractelement <128 x float> %tmp278, i32 88 + %tmp368 = extractelement <128 x float> %tmp278, i32 89 + %tmp369 = extractelement <128 x float> %tmp278, i32 90 + %tmp370 = extractelement <128 x float> %tmp278, i32 91 + %tmp371 = extractelement <128 x float> %tmp278, i32 92 + %tmp372 = extractelement <128 x float> %tmp278, i32 93 + %tmp373 = extractelement <128 x float> %tmp278, i32 94 + %tmp374 = extractelement <128 x float> %tmp278, i32 95 + %tmp375 = extractelement <128 x float> %tmp278, i32 96 + %tmp376 = extractelement <128 x float> %tmp278, i32 97 + %tmp377 = extractelement <128 x float> %tmp278, i32 98 + %tmp378 = extractelement <128 x float> %tmp278, i32 99 + %tmp379 = extractelement <128 x float> %tmp278, i32 100 + %tmp380 = extractelement <128 x float> %tmp278, i32 101 + %tmp381 = extractelement <128 x float> %tmp278, i32 102 + %tmp382 = extractelement <128 x float> %tmp278, i32 103 + %tmp383 = extractelement <128 x float> %tmp278, i32 104 + %tmp384 = extractelement <128 x float> %tmp278, i32 105 + %tmp385 = extractelement <128 x float> %tmp278, i32 106 + %tmp386 = extractelement <128 x float> %tmp278, i32 107 + %tmp387 = extractelement <128 x float> %tmp278, i32 108 + %tmp388 = extractelement <128 x float> %tmp278, i32 109 + %tmp389 = extractelement <128 x float> %tmp278, 
i32 110 + %tmp390 = extractelement <128 x float> %tmp278, i32 111 + %tmp391 = extractelement <128 x float> %tmp278, i32 112 + %tmp392 = extractelement <128 x float> %tmp278, i32 113 + %tmp393 = extractelement <128 x float> %tmp278, i32 114 + %tmp394 = extractelement <128 x float> %tmp278, i32 115 + %tmp395 = extractelement <128 x float> %tmp278, i32 116 + %tmp396 = extractelement <128 x float> %tmp278, i32 117 + %tmp397 = extractelement <128 x float> %tmp278, i32 118 + %tmp398 = extractelement <128 x float> %tmp278, i32 119 + %tmp399 = extractelement <128 x float> %tmp278, i32 120 + %tmp400 = extractelement <128 x float> %tmp278, i32 121 + %tmp401 = extractelement <128 x float> %tmp278, i32 122 + %tmp402 = extractelement <128 x float> %tmp278, i32 123 + %tmp403 = extractelement <128 x float> %tmp278, i32 124 + %tmp404 = extractelement <128 x float> %tmp278, i32 125 + %tmp405 = extractelement <128 x float> %tmp278, i32 126 + %tmp406 = extractelement <128 x float> %tmp278, i32 127 + %tmp407 = bitcast float %tmp95 to i32 + %tmp408 = add i32 %tmp407, 1 + %tmp409 = bitcast i32 %tmp408 to float + br label %bb12 +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll index 5ce65371b01c..16abb89bb0b8 100644 --- a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll +++ b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll @@ -1,7 +1,5 @@ -; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=+vgpr-spilling < %s | FileCheck %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+vgpr-spilling < %s | FileCheck %s - -; FIXME: Enable -verify-instructions +; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; This ends up using all 255 registers and requires register ; scavenging which will fail to find an unsued register. @@ -11,9 +9,19 @@ ; FIXME: The same register is initialized to 0 for every spill. 
-; CHECK-LABEL: {{^}}main: -; CHECK: NumVgprs: 256 -; CHECK: ScratchSize: 1024 +; GCN-LABEL: {{^}}main: + +; GCN: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN-NEXT: s_mov_b32 s14, -1 +; SI-NEXT: s_mov_b32 s15, 0x80f000 +; VI-NEXT: s_mov_b32 s15, 0x800000 + +; s12 is offset user SGPR +; GCN: buffer_store_dword {{v[0-9]+}}, s[12:15], s11 offset:{{[0-9]+}} ; 4-byte Folded Spill + +; GCN: NumVgprs: 256 +; GCN: ScratchSize: 1024 define void @main([9 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <4 x i32>] addrspace(2)* byval %arg2, [34 x <8 x i32>] addrspace(2)* byval %arg3, [16 x <16 x i8>] addrspace(2)* byval %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10) #0 { bb: diff --git a/test/CodeGen/AMDGPU/work-item-intrinsics.ll b/test/CodeGen/AMDGPU/work-item-intrinsics.ll index 4328e964c1bf..a704a23b0f92 100644 --- a/test/CodeGen/AMDGPU/work-item-intrinsics.ll +++ b/test/CodeGen/AMDGPU/work-item-intrinsics.ll @@ -1,5 +1,7 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=SI-NOHSA -check-prefix=GCN-NOHSA -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-NOHSA -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=HSA -check-prefix=CI-HSA -check-prefix=FUNC %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=carrizo -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=HSA -check-prefix=VI-HSA -check-prefix=FUNC %s ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s @@ -7,9 +9,26 @@ ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV [[VAL]], KC0[0].X -; GCN: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0 -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] +; HSA: .amd_kernel_code_t + +; HSA: enable_sgpr_private_segment_buffer = 1 +; HSA: enable_sgpr_dispatch_ptr = 0 +; HSA: enable_sgpr_queue_ptr = 0 +; HSA: enable_sgpr_kernarg_segment_ptr = 1 +; HSA: enable_sgpr_dispatch_id = 0 +; HSA: enable_sgpr_flat_scratch_init = 0 +; HSA: enable_sgpr_private_segment_size = 0 +; HSA: enable_sgpr_grid_workgroup_count_x = 0 +; HSA: enable_sgpr_grid_workgroup_count_y = 0 +; HSA: enable_sgpr_grid_workgroup_count_z = 0 + +; HSA: .end_amd_kernel_code_t + + +; GCN-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0 +; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN-NOHSA: buffer_store_dword [[VVAL]] + define void @ngroups_x (i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.ngroups.x() #0 @@ -21,10 +40,10 @@ entry: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV [[VAL]], KC0[0].Y -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4 -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4 +; GCN-NOHSA: v_mov_b32_e32 
[[VVAL:v[0-9]+]], [[VAL]] +; GCN-NOHSA: buffer_store_dword [[VVAL]] define void @ngroups_y (i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.ngroups.y() #0 @@ -36,10 +55,10 @@ entry: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV [[VAL]], KC0[0].Z -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8 -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8 +; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN-NOHSA: buffer_store_dword [[VVAL]] define void @ngroups_z (i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.ngroups.z() #0 @@ -51,10 +70,10 @@ entry: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV [[VAL]], KC0[0].W -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x3 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xc -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x3 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xc +; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN-NOHSA: buffer_store_dword [[VVAL]] define void @global_size_x (i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.global.size.x() #0 @@ -66,10 +85,10 @@ entry: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV [[VAL]], KC0[1].X -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x10 -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x10 +; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN-NOHSA: buffer_store_dword [[VVAL]] define void @global_size_y (i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.global.size.y() #0 @@ -81,10 +100,10 @@ entry: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV [[VAL]], KC0[1].Y -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x5 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x14 -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x5 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x14 +; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN-NOHSA: buffer_store_dword [[VVAL]] define void @global_size_z (i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.global.size.z() #0 @@ -92,74 +111,33 @@ entry: ret void } -; FUNC-LABEL: {{^}}local_size_x: -; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[1].Z - -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x6 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x18 -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] -define void @local_size_x (i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.r600.read.local.size.x() #0 - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}local_size_y: -; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[1].W - -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x7 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1c -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] -define void @local_size_y (i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.r600.read.local.size.y() #0 - store i32 %0, i32 addrspace(1)* %out - 
ret void -} - -; FUNC-LABEL: {{^}}local_size_z: -; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[2].X - -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x20 -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] -define void @local_size_z (i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.r600.read.local.size.z() #0 - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}get_work_dim: -; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[2].Z - -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xb -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2c -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] -define void @get_work_dim (i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.AMDGPU.read.workdim() #0 - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; The tgid values are stored in sgprs offset by the number of user sgprs. -; Currently we always use exactly 2 user sgprs for the pointer to the -; kernel arguments, but this may change in the future. +; The tgid values are stored in sgprs offset by the number of user +; sgprs. ; FUNC-LABEL: {{^}}tgid_x: -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], s4 +; HSA: .amd_kernel_code_t +; HSA: compute_pgm_rsrc2_user_sgpr = 6 +; HSA: compute_pgm_rsrc2_tgid_x_en = 1 +; HSA: compute_pgm_rsrc2_tgid_y_en = 0 +; HSA: compute_pgm_rsrc2_tgid_z_en = 0 +; HSA: compute_pgm_rsrc2_tg_size_en = 0 +; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 0 +; HSA: enable_sgpr_grid_workgroup_count_x = 0 +; HSA: enable_sgpr_grid_workgroup_count_y = 0 +; HSA: enable_sgpr_grid_workgroup_count_z = 0 +; HSA: .end_amd_kernel_code_t + +; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s2{{$}} +; HSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s6{{$}} ; GCN: buffer_store_dword [[VVAL]] -define void @tgid_x (i32 addrspace(1)* %out) { + +; HSA: COMPUTE_PGM_RSRC2:USER_SGPR: 6 +; GCN-NOHSA: COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; GCN: COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; GCN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; GCN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; GCN: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 +define void @tgid_x(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tgid.x() #0 store i32 %0, i32 addrspace(1)* %out @@ -167,9 +145,25 @@ entry: } ; FUNC-LABEL: {{^}}tgid_y: -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], s5 +; HSA: compute_pgm_rsrc2_user_sgpr = 6 +; HSA: compute_pgm_rsrc2_tgid_x_en = 1 +; HSA: compute_pgm_rsrc2_tgid_y_en = 1 +; HSA: compute_pgm_rsrc2_tgid_z_en = 0 +; HSA: compute_pgm_rsrc2_tg_size_en = 0 +; HSA: enable_sgpr_grid_workgroup_count_x = 0 +; HSA: enable_sgpr_grid_workgroup_count_y = 0 +; HSA: enable_sgpr_grid_workgroup_count_z = 0 +; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s3 +; GCN-HSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s7 ; GCN: buffer_store_dword [[VVAL]] -define void @tgid_y (i32 addrspace(1)* %out) { + +; HSA: COMPUTE_PGM_RSRC2:USER_SGPR: 6 +; GCN-NOHSA: COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; GCN: COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; GCN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 1 +; GCN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; GCN: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 +define void @tgid_y(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tgid.y() #0 store i32 %0, i32 addrspace(1)* %out @@ -177,36 +171,77 @@ entry: } ; FUNC-LABEL: {{^}}tgid_z: -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], s6 +; HSA: compute_pgm_rsrc2_user_sgpr = 6 +; HSA: compute_pgm_rsrc2_tgid_x_en = 1 +; HSA: compute_pgm_rsrc2_tgid_y_en = 0 +; HSA: 
compute_pgm_rsrc2_tgid_z_en = 1 +; HSA: compute_pgm_rsrc2_tg_size_en = 0 +; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 0 +; HSA: enable_sgpr_private_segment_buffer = 1 +; HSA: enable_sgpr_dispatch_ptr = 0 +; HSA: enable_sgpr_queue_ptr = 0 +; HSA: enable_sgpr_kernarg_segment_ptr = 1 +; HSA: enable_sgpr_dispatch_id = 0 +; HSA: enable_sgpr_flat_scratch_init = 0 +; HSA: enable_sgpr_private_segment_size = 0 +; HSA: enable_sgpr_grid_workgroup_count_x = 0 +; HSA: enable_sgpr_grid_workgroup_count_y = 0 +; HSA: enable_sgpr_grid_workgroup_count_z = 0 + +; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s3{{$}} +; HSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s7{{$}} ; GCN: buffer_store_dword [[VVAL]] -define void @tgid_z (i32 addrspace(1)* %out) { + +; HSA: COMPUTE_PGM_RSRC2:USER_SGPR: 6 +; GCN-NOHSA: COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; GCN: COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; GCN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; GCN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 1 +; GCN: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 +define void @tgid_z(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tgid.z() #0 store i32 %0, i32 addrspace(1)* %out ret void } +; GCN-NOHSA: .section .AMDGPU.config +; GCN-NOHSA: .long 47180 +; GCN-NOHSA-NEXT: .long 132{{$}} + ; FUNC-LABEL: {{^}}tidig_x: +; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 0 ; GCN: buffer_store_dword v0 -define void @tidig_x (i32 addrspace(1)* %out) { +define void @tidig_x(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tidig.x() #0 store i32 %0, i32 addrspace(1)* %out ret void } +; GCN-NOHSA: .section .AMDGPU.config +; GCN-NOHSA: .long 47180 +; GCN-NOHSA-NEXT: .long 2180{{$}} + ; FUNC-LABEL: {{^}}tidig_y: + +; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 1 ; GCN: buffer_store_dword v1 -define void @tidig_y (i32 addrspace(1)* %out) { +define void @tidig_y(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tidig.y() #0 store i32 %0, i32 addrspace(1)* %out ret void } +; GCN-NOHSA: .section .AMDGPU.config +; GCN-NOHSA: .long 47180 +; GCN-NOHSA-NEXT: .long 4228{{$}} + ; FUNC-LABEL: {{^}}tidig_z: +; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 2 ; GCN: buffer_store_dword v2 -define void @tidig_z (i32 addrspace(1)* %out) { +define void @tidig_z(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tidig.z() #0 store i32 %0, i32 addrspace(1)* %out @@ -221,10 +256,6 @@ declare i32 @llvm.r600.read.global.size.x() #0 declare i32 @llvm.r600.read.global.size.y() #0 declare i32 @llvm.r600.read.global.size.z() #0 -declare i32 @llvm.r600.read.local.size.x() #0 -declare i32 @llvm.r600.read.local.size.y() #0 -declare i32 @llvm.r600.read.local.size.z() #0 - declare i32 @llvm.r600.read.tgid.x() #0 declare i32 @llvm.r600.read.tgid.y() #0 declare i32 @llvm.r600.read.tgid.z() #0 diff --git a/test/CodeGen/ARM/atomic-64bit.ll b/test/CodeGen/ARM/atomic-64bit.ll index 7510d6ccdc33..573cd45c0825 100644 --- a/test/CodeGen/ARM/atomic-64bit.ll +++ b/test/CodeGen/ARM/atomic-64bit.ll @@ -208,10 +208,16 @@ define i64 @test7(i64* %ptr, i64 %val1, i64 %val2) { define i64 @test8(i64* %ptr) { ; CHECK-LABEL: test8: ; CHECK: ldrexd [[REG1:(r[0-9]?[02468])]], [[REG2:(r[0-9]?[13579])]] +; CHECK-NOT: strexd +; CHECK: clrex +; CHECK-NOT: strexd ; CHECK: dmb {{ish$}} ; CHECK-THUMB-LABEL: test8: ; CHECK-THUMB: ldrexd [[REG1:[a-z0-9]+]], [[REG2:[a-z0-9]+]] +; CHECK-THUMB-NOT: strexd +; CHECK-THUMB: clrex +; CHECK-THUMB-NOT: strexd ; CHECK-THUMB: dmb {{ish$}} %r = load atomic i64, i64* %ptr seq_cst, align 8 diff --git a/test/CodeGen/ARM/build-attributes.ll b/test/CodeGen/ARM/build-attributes.ll index 
11aa1950b879..b80191d76012 100644 --- a/test/CodeGen/ARM/build-attributes.ll +++ b/test/CodeGen/ARM/build-attributes.ll @@ -96,6 +96,9 @@ ; RUN: llc < %s -mtriple=armv7r-linux-gnueabi -mcpu=cortex-r7 | FileCheck %s --check-prefix=CORTEX-R7 ; RUN: llc < %s -mtriple=armv7r-linux-gnueabi -mcpu=cortex-r7 -enable-unsafe-fp-math -disable-fp-elim -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-R7-FAST ; RUN: llc < %s -mtriple=armv7r-linux-gnueabi -mcpu=cortex-r7 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING +; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a35 | FileCheck %s --check-prefix=CORTEX-A35 +; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a35 -enable-unsafe-fp-math -disable-fp-elim -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A35-FAST +; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a35 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING ; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a53 | FileCheck %s --check-prefix=CORTEX-A53 ; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a53 -enable-unsafe-fp-math -disable-fp-elim -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A53-FAST ; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a53 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING @@ -129,6 +132,8 @@ ; RUN: llc < %s -mtriple=armv8.1a-none-linux-gnueabi -mattr=+strict-align | FileCheck %s --check-prefix=STRICT-ALIGN ; RUN: llc < %s -mtriple=armv8.1a-none-linux-gnueabi | FileCheck %s --check-prefix=NO-STRICT-ALIGN ; ARMv8a (AArch32) +; RUN: llc < %s -mtriple=armv8-none-linux-gnueabi -mcpu=cortex-a35 | FileCheck %s --check-prefix=NO-STRICT-ALIGN +; RUN: llc < %s -mtriple=armv8-none-linux-gnueabi -mcpu=cortex-a35 -mattr=+strict-align | FileCheck %s --check-prefix=STRICT-ALIGN ; RUN: llc < %s -mtriple=armv8-none-linux-gnueabi -mcpu=cortex-a57 | FileCheck %s --check-prefix=NO-STRICT-ALIGN ; RUN: llc < %s -mtriple=armv8-none-linux-gnueabi -mcpu=cortex-a57 -mattr=+strict-align | FileCheck %s --check-prefix=STRICT-ALIGN ; RUN: llc < %s -mtriple=armv8-none-linux-gnueabi -mcpu=cortex-a72 | FileCheck %s --check-prefix=NO-STRICT-ALIGN @@ -1113,6 +1118,36 @@ ; CORTEX-R7-FAST-NOT: .eabi_attribute 22 ; CORTEX-R7-FAST: .eabi_attribute 23, 1 +; CORTEX-A35: .cpu cortex-a35 +; CORTEX-A35: .eabi_attribute 6, 14 +; CORTEX-A35: .eabi_attribute 7, 65 +; CORTEX-A35: .eabi_attribute 8, 1 +; CORTEX-A35: .eabi_attribute 9, 2 +; CORTEX-A35: .fpu crypto-neon-fp-armv8 +; CORTEX-A35: .eabi_attribute 12, 3 +; CORTEX-A35-NOT: .eabi_attribute 19 +;; We default to IEEE 754 compliance +; CORTEX-A35: .eabi_attribute 20, 1 +; CORTEX-A35: .eabi_attribute 21, 1 +; CORTEX-A35-NOT: .eabi_attribute 22 +; CORTEX-A35: .eabi_attribute 23, 3 +; CORTEX-A35: .eabi_attribute 24, 1 +; CORTEX-A35: .eabi_attribute 25, 1 +; CORTEX-A35-NOT: .eabi_attribute 27 +; CORTEX-A35-NOT: .eabi_attribute 28 +; CORTEX-A35: .eabi_attribute 36, 1 +; CORTEX-A35: .eabi_attribute 38, 1 +; CORTEX-A35: .eabi_attribute 42, 1 +; CORTEX-A35-NOT: .eabi_attribute 44 +; CORTEX-A35: .eabi_attribute 68, 3 + +; CORTEX-A35-FAST-NOT: .eabi_attribute 19 +;; The A35 has the ARMv8 FP unit, which always flushes preserving sign. 
+; CORTEX-A35-FAST: .eabi_attribute 20, 2
+; CORTEX-A35-FAST-NOT: .eabi_attribute 21
+; CORTEX-A35-FAST-NOT: .eabi_attribute 22
+; CORTEX-A35-FAST: .eabi_attribute 23, 1
+
 ; CORTEX-A53: .cpu cortex-a53
 ; CORTEX-A53: .eabi_attribute 6, 14
 ; CORTEX-A53: .eabi_attribute 7, 65
diff --git a/test/CodeGen/ARM/ifcvt-branch-weight-bug.ll b/test/CodeGen/ARM/ifcvt-branch-weight-bug.ll
index e17da7a97205..a44c9721d6c1 100644
--- a/test/CodeGen/ARM/ifcvt-branch-weight-bug.ll
+++ b/test/CodeGen/ARM/ifcvt-branch-weight-bug.ll
@@ -14,15 +14,15 @@ entry:
 br i1 undef, label %for.end, label %for.body
 
 ; Before if conversion, we have
-; for.body -> lor.lhs.false.i (62)
-;          -> for.cond.backedge (62)
-; lor.lhs.false.i -> for.cond.backedge (1048575)
-;                 -> cond.false.i (1)
+; for.body -> lor.lhs.false.i (50%)
+;          -> for.cond.backedge (50%)
+; lor.lhs.false.i -> for.cond.backedge (100%)
+;                 -> cond.false.i (0%)
 ; Afer if conversion, we have
-; for.body -> for.cond.backedge (130023362)
-;          -> cond.false.i (62)
+; for.body -> for.cond.backedge (100%)
+;          -> cond.false.i (0%)
 
 ; CHECK: BB#1: derived from LLVM BB %for.body
-; CHECK: Successors according to CFG: BB#2(4294967291) BB#4(2048)
+; CHECK: Successors according to CFG: BB#2(0x7ffffc00 / 0x80000000 = 100.00%) BB#4(0x00000400 / 0x80000000 = 0.00%)
 for.body:
 br i1 undef, label %for.cond.backedge, label %lor.lhs.false.i, !prof !1
diff --git a/test/CodeGen/ARM/ifcvt-branch-weight.ll b/test/CodeGen/ARM/ifcvt-branch-weight.ll
index f2a1229d0d8a..0de039cde23c 100644
--- a/test/CodeGen/ARM/ifcvt-branch-weight.ll
+++ b/test/CodeGen/ARM/ifcvt-branch-weight.ll
@@ -19,7 +19,7 @@ bb:
 br i1 %9, label %return, label %bb2
 
 ; CHECK: BB#2: derived from LLVM BB %bb2
-; CHECK: Successors according to CFG: BB#3(4294967289) BB#4(4294967287)
+; CHECK: Successors according to CFG: BB#3({{[0-9a-fx/= ]+}}50.00%) BB#4({{[0-9a-fx/= ]+}}50.00%)
 
 bb2:
 %v10 = icmp eq i32 %3, 16
diff --git a/test/CodeGen/ARM/ifcvt-iter-indbr.ll b/test/CodeGen/ARM/ifcvt-iter-indbr.ll
index 6ce9bcb56ef4..a96b6e8a1e83 100644
--- a/test/CodeGen/ARM/ifcvt-iter-indbr.ll
+++ b/test/CodeGen/ARM/ifcvt-iter-indbr.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -mtriple thumbv7s-apple-darwin -asm-verbose=false | FileCheck %s
-; RUN: llc < %s -mtriple thumbv7s-apple-darwin -asm-verbose=false -print-machineinstrs=if-converter 2>&1 | FileCheck --check-prefix=CHECK-WEIGHT %s
+; RUN: llc < %s -mtriple thumbv7s-apple-darwin -asm-verbose=false -print-machineinstrs=if-converter 2>&1 | FileCheck --check-prefix=CHECK-PROB %s
 
 declare i32 @foo(i32)
 declare i8* @bar(i32, i8*, i8*)
@@ -29,10 +29,10 @@ declare i8* @bar(i32, i8*, i8*)
 ; CHECK-NEXT: [[FOOCALL]]:
 ; CHECK-NEXT: blx _foo
 ;
-; CHECK-WEIGHT: BB#0:
-; CHECK-WEIGHT: Successors according to CFG: BB#1(1073741824) BB#2(536870912) BB#4(536870912)
-; CHECK-WEIGHT: BB#1:
-; CHECK-WEIGHT: Successors according to CFG: BB#2(1610612736) BB#4(536870912)
+; CHECK-PROB: BB#0:
+; CHECK-PROB: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}50.00%) BB#2({{[0-9a-fx/= ]+}}25.00%) BB#4({{[0-9a-fx/= ]+}}25.00%)
+; CHECK-PROB: BB#1:
+; CHECK-PROB: Successors according to CFG: BB#2({{[0-9a-fx/= ]+}}75.00%) BB#4({{[0-9a-fx/= ]+}}25.00%)
 
 define i32 @test(i32 %a, i32 %a2, i32* %p, i32* %p2) {
 entry:
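Throughout these if-conversion tests the CHECK strings change from raw 32-bit edge weights to the new successor-list format: a fixed-point probability over a 0x80000000 (2^31) denominator followed by a rounded percentage. As a minimal, standalone C++ illustration of that arithmetic (this is not LLVM code; the two numerators are simply the values quoted in the updated ifcvt-branch-weight-bug.ll CHECK line above):

  #include <cstdint>
  #include <cstdio>

  int main() {
    const double Denom = 0x80000000u;        // fixed 2^31 denominator
    const uint32_t Taken    = 0x7ffffc00u;   // for.body -> for.cond.backedge
    const uint32_t NotTaken = 0x00000400u;   // for.body -> cond.false.i
    std::printf("0x%08x / 0x80000000 = %.2f%%\n",
                (unsigned)Taken, 100.0 * Taken / Denom);       // 100.00%
    std::printf("0x%08x / 0x80000000 = %.2f%%\n",
                (unsigned)NotTaken, 100.0 * NotTaken / Denom); // 0.00%
    return 0;
  }

The two numerators still sum to 0x80000000, so the printed percentages of a block's out-edges add up to 100%.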
diff --git a/test/CodeGen/ARM/tail-merge-branch-weight.ll b/test/CodeGen/ARM/tail-merge-branch-weight.ll
index 95b0a202e7ff..f83f28815793 100644
--- a/test/CodeGen/ARM/tail-merge-branch-weight.ll
+++ b/test/CodeGen/ARM/tail-merge-branch-weight.ll
@@ -9,7 +9,7 @@
 ; = 0.2 * 0.4 + 0.8 * 0.7 = 0.64
 ; CHECK: # Machine code for function test0:
-; CHECK: Successors according to CFG: BB#{{[0-9]+}}(13) BB#{{[0-9]+}}(24)
+; CHECK: Successors according to CFG: BB#{{[0-9]+}}({{[0-9a-fx/= ]+}}20.00%) BB#{{[0-9]+}}({{[0-9a-fx/= ]+}}80.00%)
 ; CHECK: BB#{{[0-9]+}}:
 ; CHECK: BB#{{[0-9]+}}:
 ; CHECK: # End machine code for function test0.
 
diff --git a/test/CodeGen/ARM/taildup-branch-weight.ll b/test/CodeGen/ARM/taildup-branch-weight.ll
index 576c120b444e..799ef62416e6 100644
--- a/test/CodeGen/ARM/taildup-branch-weight.ll
+++ b/test/CodeGen/ARM/taildup-branch-weight.ll
@@ -3,7 +3,7 @@
 ; RUN: | FileCheck %s
 
 ; CHECK: Machine code for function test0:
-; CHECK: Successors according to CFG: BB#1(67108864) BB#2(2080374784)
+; CHECK: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}3.12%) BB#2({{[0-9a-fx/= ]+}}96.88%)
 
 define void @test0(i32 %a, i32 %b, i32* %c, i32* %d) {
 entry:
@@ -30,7 +30,7 @@ B4:
 
 !0 = !{!"branch_weights", i32 4, i32 124}
 
 ; CHECK: Machine code for function test1:
-; CHECK: Successors according to CFG: BB#1(67108864) BB#2(2080374784)
+; CHECK: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}3.12%) BB#2({{[0-9a-fx/= ]+}}96.88%)
 
 @g0 = common global i32 0, align 4
diff --git a/test/CodeGen/ARM/thumb1_return_sequence.ll b/test/CodeGen/ARM/thumb1_return_sequence.ll
index 5b9c19ab5eb2..67d1cad2cf68 100644
--- a/test/CodeGen/ARM/thumb1_return_sequence.ll
+++ b/test/CodeGen/ARM/thumb1_return_sequence.ll
@@ -23,9 +23,22 @@ entry:
 ; --------
 ; CHECK-V4T: add sp,
 ; CHECK-V4T-NEXT: pop {[[SAVED]]}
-; We do not have any SP update to insert so we can just optimize
-; the pop sequence.
-; CHECK-V4T-NEXT: pop {pc}
+; The ISA for v4 does not support pop pc, so make sure we do not emit
+; one even when we do not need to update SP.
+; CHECK-V4T-NOT: pop {pc}
+; We may only use a lo register to pop, but in that case, all the scratch
+; ones are used.
+; r12 is the only register we are allowed to clobber for AAPCS.
+; Use it to save a lo register.
+; CHECK-V4T-NEXT: mov [[TEMP_REG:r12]], [[POP_REG:r[0-7]]]
+; Pop the value of LR.
+; CHECK-V4T-NEXT: pop {[[POP_REG]]}
+; Copy the value of LR into the right register.
+; CHECK-V4T-NEXT: mov lr, [[POP_REG]]
+; Restore the value that was in the register we used to pop the value of LR.
+; CHECK-V4T-NEXT: mov [[POP_REG]], [[TEMP_REG]]
+; Return.
+; CHECK-V4T-NEXT: bx lr
 ; CHECK-V5T: pop {[[SAVED]], pc}
 }
 
@@ -93,7 +106,13 @@ entry:
 ; Epilogue
 ; --------
 ; CHECK-V4T: pop {[[SAVED]]}
-; CHECK-V4T: pop {pc}
+; The ISA for v4 does not support pop pc, so make sure we do not emit
+; one even when we do not need to update SP.
+; CHECK-V4T-NOT: pop {pc}
+; Pop the value of LR into a scratch lo register other than r0 (it is
+; used for the return value).
+; CHECK-V4T-NEXT: pop {[[POP_REG:r[1-3]]]}
+; CHECK-V4T-NEXT: bx [[POP_REG]]
 ; CHECK-V5T: pop {[[SAVED]], pc}
 }
 
diff --git a/test/CodeGen/Generic/MachineBranchProb.ll b/test/CodeGen/Generic/MachineBranchProb.ll
index 5a4a4672f7eb..ae3c8da21471 100644
--- a/test/CodeGen/Generic/MachineBranchProb.ll
+++ b/test/CodeGen/Generic/MachineBranchProb.ll
@@ -16,11 +16,11 @@ entry:
   i64 5, label %sw.bb1
 ], !prof !0
 ; CHECK: BB#0: derived from LLVM BB %entry
-; CHECK: Successors according to CFG: BB#2(1616928864) BB#4(530554784)
+; CHECK: Successors according to CFG: BB#2({{[0-9a-fx/= ]+}}75.29%) BB#4({{[0-9a-fx/= ]+}}24.71%)
 ; CHECK: BB#4: derived from LLVM BB %entry
-; CHECK: Successors according to CFG: BB#1(252645135) BB#5(277909649)
+; CHECK: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}47.62%) BB#5({{[0-9a-fx/= ]+}}52.38%)
 ; CHECK: BB#5: derived from LLVM BB %entry
-; CHECK: Successors according to CFG: BB#1(101058054) BB#3(176851595)
+; CHECK: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}36.36%) BB#3({{[0-9a-fx/= ]+}}63.64%)
 
 sw.bb:
   br label %return
@@ -62,7 +62,7 @@ return:
   ret void
 ; CHECK-LABEL: Machine code for function left_leaning_weight_balanced_tree:
 ; CHECK: BB#0: derived from LLVM BB %entry
 ; CHECK-NOT: Successors
-; CHECK: Successors according to CFG: BB#8(852677332) BB#9(1294806318)
+; CHECK: Successors according to CFG: BB#8({{[0-9a-fx/= ]+}}39.71%) BB#9({{[0-9a-fx/= ]+}}60.29%)
 }
 !1 = !{!"branch_weights",
diff --git a/test/CodeGen/Hexagon/ifcvt-edge-weight.ll b/test/CodeGen/Hexagon/ifcvt-edge-weight.ll
index f84fd95e4fbd..341567e1d02f 100644
--- a/test/CodeGen/Hexagon/ifcvt-edge-weight.ll
+++ b/test/CodeGen/Hexagon/ifcvt-edge-weight.ll
@@ -2,7 +2,7 @@
 ; Check that the edge weights are updated correctly after if-conversion.
 
 ; CHECK: BB#3:
-; CHECK: Successors according to CFG: BB#2(214748365) BB#1(1932735283)
+; CHECK: Successors according to CFG: BB#2({{[0-9a-fx/= ]+}}10.00%) BB#1({{[0-9a-fx/= ]+}}90.00%)
 
 @a = external global i32
 @d = external global i32
diff --git a/test/CodeGen/MIR/X86/newline-handling.mir b/test/CodeGen/MIR/X86/newline-handling.mir
index b5ed3b7f27e1..bce06d540114 100644
--- a/test/CodeGen/MIR/X86/newline-handling.mir
+++ b/test/CodeGen/MIR/X86/newline-handling.mir
@@ -35,7 +35,7 @@ liveins:
 # CHECK-LABEL: name: foo
 # CHECK: body: |
 # CHECK-NEXT: bb.0.entry:
-# CHECK-NEXT: successors: %bb.1.less(0), %bb.2.exit(0)
+# CHECK-NEXT: successors: %bb.1.less(0x40000000 / 0x80000000 = 50.00%), %bb.2.exit(0x40000000 / 0x80000000 = 50.00%)
 # CHECK-NEXT: liveins: %edi
 # CHECK: CMP32ri8 %edi, 10, implicit-def %eflags
 # CHECK-NEXT: JG_1 %bb.2.exit, implicit killed %eflags
@@ -79,7 +79,7 @@ liveins:
 # CHECK-LABEL: name: bar
 # CHECK: body: |
 # CHECK-NEXT: bb.0.entry:
-# CHECK-NEXT: successors: %bb.1.less(0), %bb.2.exit(0)
+# CHECK-NEXT: successors: %bb.1.less(0x40000000 / 0x80000000 = 50.00%), %bb.2.exit(0x40000000 / 0x80000000 = 50.00%)
 # CHECK-NEXT: liveins: %edi
 # CHECK: CMP32ri8 %edi, 10, implicit-def %eflags
 # CHECK-NEXT: JG_1 %bb.2.exit, implicit killed %eflags
diff --git a/test/CodeGen/MIR/X86/successor-basic-blocks-weights.mir b/test/CodeGen/MIR/X86/successor-basic-blocks-weights.mir
index fc5e5d640f7f..64af6121189a 100644
--- a/test/CodeGen/MIR/X86/successor-basic-blocks-weights.mir
+++ b/test/CodeGen/MIR/X86/successor-basic-blocks-weights.mir
@@ -1,6 +1,6 @@
 # RUN: llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s | FileCheck %s
 
 # This test ensures that the MIR parser parses basic block successors and
-# weights correctly.
+# probabilities correctly.
 
 --- |
@@ -21,10 +21,10 @@
 name: foo
 body: |
   ; CHECK-LABEL: bb.0.entry:
-  ; CHECK: successors: %bb.1.less(16), %bb.2.exit(32)
+  ; CHECK: successors: %bb.1.less({{[0-9a-fx/= ]+}}33.00%), %bb.2.exit({{[0-9a-fx/= ]+}}67.00%)
   ; CHECK-LABEL: bb.1.less:
   bb.0.entry:
-    successors: %bb.1.less (16), %bb.2.exit(32)
+    successors: %bb.1.less (33), %bb.2.exit(67)
     liveins: %edi
 
     CMP32ri8 %edi, 10, implicit-def %eflags
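The input change in successor-basic-blocks-weights.mir above (weights 16/32 rewritten as 33/67) is easiest to read as a normalization choice: the listed weights are turned into probabilities relative to their sum, and 33 + 67 = 100 gives exact two-decimal percentages where 16/48 and 32/48 would not. A small standalone C++ sketch of that normalization (illustrative only; the pairs are the old and new weights from the test):

  #include <cstdio>

  // Print two successor weights as percentages of their sum.
  static void printAsPercentages(unsigned A, unsigned B) {
    const double Sum = static_cast<double>(A) + B;
    std::printf("%.2f%% / %.2f%%\n", 100.0 * A / Sum, 100.0 * B / Sum);
  }

  int main() {
    printAsPercentages(16, 32); // 33.33% / 66.67%, the old weights do not print as round values
    printAsPercentages(33, 67); // 33.00% / 67.00%, matches the updated CHECK line
    return 0;
  }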
diff --git a/test/CodeGen/MIR/X86/successor-basic-blocks.mir b/test/CodeGen/MIR/X86/successor-basic-blocks.mir
index aa80fe9fbeef..a6c14f70bc7c 100644
--- a/test/CodeGen/MIR/X86/successor-basic-blocks.mir
+++ b/test/CodeGen/MIR/X86/successor-basic-blocks.mir
@@ -32,7 +32,7 @@
 name: foo
 body: |
   ; CHECK-LABEL: bb.0.entry:
-  ; CHECK: successors: %bb.1.less(0), %bb.2.exit(0)
+  ; CHECK: successors: %bb.1.less(0x40000000 / 0x80000000 = 50.00%), %bb.2.exit(0x40000000 / 0x80000000 = 50.00%)
   ; CHECK-LABEL: bb.1.less:
   bb.0.entry:
     successors: %bb.1.less, %bb.2.exit
@@ -58,7 +58,7 @@ body: |
   ; Verify that we can have multiple lists of successors that will be merged
   ; into one.
   ; CHECK-LABEL: bb.0.entry:
-  ; CHECK: successors: %bb.1(0), %bb.2(0)
+  ; CHECK: successors: %bb.1(0x80000000 / 0x80000000 = 100.00%), %bb.2(0x00000000 / 0x80000000 = 0.00%)
   bb.0.entry:
     liveins: %edi
     successors: %bb.1
diff --git a/test/CodeGen/Mips/emergency-spill-slot-near-fp.ll b/test/CodeGen/Mips/emergency-spill-slot-near-fp.ll
index 58dd16c9f9c8..54092b4e3ebe 100644
--- a/test/CodeGen/Mips/emergency-spill-slot-near-fp.ll
+++ b/test/CodeGen/Mips/emergency-spill-slot-near-fp.ll
@@ -1,5 +1,5 @@
 ; Check that register scavenging spill slot is close to $fp.
-; RUN: llc -march=mipsel -O0 -fast-isel=false < %s | FileCheck %s
+; RUN: llc -march=mipsel -O0 < %s | FileCheck %s
 
 ; CHECK: sw ${{.*}}, 8($sp)
 ; CHECK: lw ${{.*}}, 8($sp)
@@ -31,4 +31,4 @@ entry:
   ret i32 0
 }
 
-attributes #0 = { noinline optnone "no-frame-pointer-elim"="true" }
+attributes #0 = { noinline "no-frame-pointer-elim"="true" }
diff --git a/test/CodeGen/PowerPC/aantidep-inline-asm-use.ll b/test/CodeGen/PowerPC/aantidep-inline-asm-use.ll
new file mode 100644
index 000000000000..f0c0deacf4dd
--- /dev/null
+++ b/test/CodeGen/PowerPC/aantidep-inline-asm-use.ll
@@ -0,0 +1,305 @@
+; RUN: llc -O2 < %s | FileCheck %s
+; ModuleID = 'bugpoint-reduced-simplified.bc'
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-grtev4-linux-gnu"
+
+; Function Attrs: nounwind
+define void @_ZN10SubProcess19ScrubbedForkAndExecEiPiS0_PNS_7ResultsE() #0 align 2 {
+; CHECK: lis 3, 1234
+; CHECK-NOT: li 3
+; CHECK-NOT: ori 3
+; CHECK-NOT: addi 3
+; CHECK-NOT: addis 3
+; CHECK-NOT: lis 3
+; CHECK: sc
+  br i1 undef, label %1, label %2
+
+;