diff --git a/.clang-tidy b/.clang-tidy index 3186da43d43d..97fbe23333bd 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -1 +1,13 @@ -Checks: '-*,clang-diagnostic-*,llvm-*,misc-*' +Checks: '-*,clang-diagnostic-*,llvm-*,misc-*,readability-identifier-naming' +CheckOptions: + - key: readability-identifier-naming.ClassCase + value: CamelCase + - key: readability-identifier-naming.EnumCase + value: CamelCase + - key: readability-identifier-naming.FunctionCase + value: lowerCase + - key: readability-identifier-naming.UnionCase + value: CamelCase + - key: readability-identifier-naming.VariableCase + value: CamelCase + diff --git a/CMakeLists.txt b/CMakeLists.txt index fc46413640c5..c4ff8f3cd28b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -301,6 +301,9 @@ endif( LLVM_USE_INTEL_JITEVENTS ) option(LLVM_USE_OPROFILE "Use opagent JIT interface to inform OProfile about JIT code" OFF) +option(LLVM_EXTERNALIZE_DEBUGINFO + "Generate dSYM files and strip executables and libraries (Darwin Only)" OFF) + # If enabled, verify we are on a platform that supports oprofile. if( LLVM_USE_OPROFILE ) if( NOT CMAKE_SYSTEM_NAME MATCHES "Linux" ) diff --git a/CODE_OWNERS.TXT b/CODE_OWNERS.TXT index 0a6f057341e8..cdd0f8dc933c 100644 --- a/CODE_OWNERS.TXT +++ b/CODE_OWNERS.TXT @@ -53,10 +53,6 @@ N: Quentin Colombet E: qcolombet@apple.com D: Register allocators -N: Anshuman Dasgupta -E: adasgupt@codeaurora.org -D: Hexagon Backend - N: Duncan P. N. Exon Smith E: dexonsmith@apple.com D: Branch weights and BlockFrequencyInfo @@ -138,6 +134,10 @@ N: Richard Osborne E: richard@xmos.com D: XCore Backend +N: Krzysztof Parzyszek +E: kparzysz@codeaurora.org +D: Hexagon Backend + N: Chad Rosier E: mcrosier@codeaurora.org D: Fast-Isel diff --git a/autoconf/configure.ac b/autoconf/configure.ac index 16535dfba4ab..02ab161e3b03 100644 --- a/autoconf/configure.ac +++ b/autoconf/configure.ac @@ -1343,7 +1343,7 @@ AC_DEFINE_UNQUOTED(DEFAULT_SYSROOT,"$withval", AC_ARG_WITH(clang-default-openmp-runtime, AS_HELP_STRING([--with-clang-default-openmp-runtime], [The default OpenMP runtime for Clang.]),, - withval="libgomp") + withval="libomp") AC_DEFINE_UNQUOTED(CLANG_DEFAULT_OPENMP_RUNTIME,"$withval", [Default OpenMP runtime used by -fopenmp.]) diff --git a/cmake/modules/AddLLVM.cmake b/cmake/modules/AddLLVM.cmake index b5517d0b893d..23559d604bca 100644 --- a/cmake/modules/AddLLVM.cmake +++ b/cmake/modules/AddLLVM.cmake @@ -512,6 +512,10 @@ function(llvm_add_library name) add_dependencies(${objlib} ${LLVM_COMMON_DEPENDS}) endforeach() endif() + + if(ARG_SHARED OR ARG_MODULE) + llvm_externalize_debuginfo(${name}) + endif() endfunction() macro(add_llvm_library name) @@ -595,7 +599,7 @@ endmacro(add_llvm_loadable_module name) macro(add_llvm_executable name) - cmake_parse_arguments(ARG "DISABLE_LLVM_LINK_LLVM_DYLIB" "" "" ${ARGN}) + cmake_parse_arguments(ARG "DISABLE_LLVM_LINK_LLVM_DYLIB;IGNORE_EXTERNALIZE_DEBUGINFO" "" "" ${ARGN}) llvm_process_sources( ALL_FILES ${ARG_UNPARSED_ARGUMENTS} ) # Generate objlib @@ -655,14 +659,18 @@ macro(add_llvm_executable name) if( LLVM_COMMON_DEPENDS ) add_dependencies( ${name} ${LLVM_COMMON_DEPENDS} ) endif( LLVM_COMMON_DEPENDS ) + + if(NOT ARG_IGNORE_EXTERNALIZE_DEBUGINFO) + llvm_externalize_debuginfo(${name}) + endif() endmacro(add_llvm_executable name) function(export_executable_symbols target) if (NOT MSVC) # MSVC's linker doesn't support exporting all symbols. 
set_target_properties(${target} PROPERTIES ENABLE_EXPORTS 1) if (APPLE) - set_target_properties(${target} PROPERTIES - LINK_FLAGS "-rdynamic") + set_property(TARGET ${target} APPEND_STRING PROPERTY + LINK_FLAGS " -rdynamic") endif() endif() endfunction() @@ -870,7 +878,7 @@ function(add_unittest test_suite test_name) set(LLVM_REQUIRES_RTTI OFF) - add_llvm_executable(${test_name} ${ARGN}) + add_llvm_executable(${test_name} IGNORE_EXTERNALIZE_DEBUGINFO ${ARGN}) set(outdir ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_CFG_INTDIR}) set_output_directory(${test_name} BINARY_DIR ${outdir} LIBRARY_DIR ${outdir}) target_link_libraries(${test_name} @@ -1168,3 +1176,24 @@ function(add_llvm_tool_symlink name dest) endif() endif() endfunction() + +function(llvm_externalize_debuginfo name) + if(NOT LLVM_EXTERNALIZE_DEBUGINFO) + return() + endif() + + if(APPLE) + if(CMAKE_CXX_FLAGS MATCHES "-flto" + OR CMAKE_CXX_FLAGS_${uppercase_CMAKE_BUILD_TYPE} MATCHES "-flto") + + set(lto_object ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_CFG_INTDIR}/${name}-lto.o) + set_property(TARGET ${name} APPEND_STRING PROPERTY + LINK_FLAGS " -Wl,-object_path_lto,${lto_object}") + endif() + add_custom_command(TARGET ${name} POST_BUILD + COMMAND xcrun dsymutil $<TARGET_FILE:${name}> + COMMAND xcrun strip -Sl $<TARGET_FILE:${name}>) + else() + message(FATAL_ERROR "LLVM_EXTERNALIZE_DEBUGINFO isn't implemented for non-darwin platforms!") + endif() +endfunction() diff --git a/cmake/modules/HandleLLVMOptions.cmake b/cmake/modules/HandleLLVMOptions.cmake index 9e4c4c5078af..f4ac6d7a73fd 100644 --- a/cmake/modules/HandleLLVMOptions.cmake +++ b/cmake/modules/HandleLLVMOptions.cmake @@ -317,6 +317,9 @@ if( MSVC ) -wd4204 # Suppress 'nonstandard extension used : non-constant aggregate initializer' -wd4577 # Suppress 'noexcept used with no exception handling mode specified; termination on exception is not guaranteed' -wd4091 # Suppress 'typedef: ignored on left of '' when no variable is declared' + # C4592 is disabled because of false positives in Visual Studio 2015 + # Update 1. Re-evaluate the usefulness of this diagnostic with Update 2. + -wd4592 # Suppress ''var': symbol will be dynamically initialized (implementation limitation) # Ideally, we'd like this warning to be enabled, but MSVC 2013 doesn't # support the 'aligned' attribute in the way that clang sources requires (for @@ -584,6 +587,14 @@ if(LLVM_ENABLE_EH AND NOT LLVM_ENABLE_RTTI) message(FATAL_ERROR "Exception handling requires RTTI. You must set LLVM_ENABLE_RTTI to ON") endif() +option(LLVM_BUILD_INSTRUMENTED "Build LLVM and tools with PGO instrumentation (experimental)" Off) +mark_as_advanced(LLVM_BUILD_INSTRUMENTED) +append_if(LLVM_BUILD_INSTRUMENTED "-fprofile-instr-generate" + CMAKE_CXX_FLAGS + CMAKE_C_FLAGS + CMAKE_EXE_LINKER_FLAGS + CMAKE_SHARED_LINKER_FLAGS) + # Plugin support # FIXME: Make this configurable.
if(WIN32 OR CYGWIN) diff --git a/cmake/modules/LLVMExternalProjectUtils.cmake b/cmake/modules/LLVMExternalProjectUtils.cmake index 5d8fb71e08b3..c2d9f530c200 100644 --- a/cmake/modules/LLVMExternalProjectUtils.cmake +++ b/cmake/modules/LLVMExternalProjectUtils.cmake @@ -95,7 +95,7 @@ function(llvm_ExternalProject_Add name source_dir) if(variableName MATCHES "^${nameCanon}") string(REPLACE ";" "\;" value "${${variableName}}") list(APPEND PASSTHROUGH_VARIABLES - -D${variableName}=${${value}}) + -D${variableName}=${value}) endif() endforeach() diff --git a/configure b/configure index 1b3d08b3ace2..33438c60365e 100755 --- a/configure +++ b/configure @@ -5954,7 +5954,7 @@ _ACEOF if test "${with_clang_default_openmp_runtime+set}" = set; then withval=$with_clang_default_openmp_runtime; else - withval="libgomp" + withval="libomp" fi diff --git a/docs/BitCodeFormat.rst b/docs/BitCodeFormat.rst index 62d66f85d557..d6e3099bdb63 100644 --- a/docs/BitCodeFormat.rst +++ b/docs/BitCodeFormat.rst @@ -756,6 +756,7 @@ function. The operand fields are: * ``anyregcc``: code 13 * ``preserve_mostcc``: code 14 * ``preserve_allcc``: code 15 + * ``cxx_fast_tlscc``: code 17 * ``x86_stdcallcc``: code 64 * ``x86_fastcallcc``: code 65 * ``arm_apcscc``: code 66 diff --git a/docs/CompilerWriterInfo.rst b/docs/CompilerWriterInfo.rst index 900ba24e230f..6c3ff4b10f1e 100644 --- a/docs/CompilerWriterInfo.rst +++ b/docs/CompilerWriterInfo.rst @@ -22,14 +22,16 @@ ARM * `ABI Addenda and Errata `_ -* `ARM C Language Extensions `_ +* `ARM C Language Extensions `_ AArch64 ------- +* `ARMv8 Architecture Reference Manual `_ + * `ARMv8 Instruction Set Overview `_ -* `ARM C Language Extensions `_ +* `ARM C Language Extensions `_ Itanium (ia64) -------------- diff --git a/docs/DeveloperPolicy.rst b/docs/DeveloperPolicy.rst index ea498dde459a..17baf2d27b13 100644 --- a/docs/DeveloperPolicy.rst +++ b/docs/DeveloperPolicy.rst @@ -525,6 +525,33 @@ for llvm users and not imposing a big burden on llvm developers: it is to drop it. That is not very user friendly and a bit more effort is expected, but no promises are made. +C API Changes +---------------- + +* Stability Guarantees: The C API is, in general, a "best effort" for stability. + This means that we make every attempt to keep the C API stable, but that + stability will be limited by the abstractness of the interface and the + stability of the C++ API that it wraps. In practice, this means that things + like "create debug info" or "create this type of instruction" are likely to be + less stable than "take this IR file and JIT it for my current machine". + +* Release stability: We won't break the C API on the release branch with patches + that go on that branch, with the exception that we will fix an unintentional + C API break that will keep the release consistent with both the previous and + next release. + +* Testing: Patches to the C API are expected to come with tests just like any + other patch. + +* Including new things into the API: If an LLVM subcomponent has a C API already + included, then expanding that C API is acceptable. Adding C API for + subcomponents that don't currently have one needs to be discussed on the + mailing list for design and maintainability feedback prior to implementation. + +* Documentation: Any changes to the C API are required to be documented in the + release notes so that it's clear to external users who do not follow the + project how the C API is changing and evolving. + .. 
_copyright-license-patents: Copyright, License, and Patents diff --git a/docs/GettingStarted.rst b/docs/GettingStarted.rst index 0d3115513453..c0741f1400c5 100644 --- a/docs/GettingStarted.rst +++ b/docs/GettingStarted.rst @@ -55,6 +55,12 @@ Here's the short story for getting up and running quickly with LLVM: * ``cd llvm/projects`` * ``svn co http://llvm.org/svn/llvm-project/compiler-rt/trunk compiler-rt`` +#. Checkout Libomp (required for OpenMP support): + + * ``cd where-you-want-llvm-to-live`` + * ``cd llvm/projects`` + * ``svn co http://llvm.org/svn/llvm-project/openmp/trunk openmp`` + #. Checkout libcxx and libcxxabi **[Optional]**: * ``cd where-you-want-llvm-to-live`` @@ -535,6 +541,13 @@ If you want to check out compiler-rt (required to build the sanitizers), run: % cd llvm/projects % git clone http://llvm.org/git/compiler-rt.git +If you want to check out libomp (required for OpenMP support), run: + +.. code-block:: console + + % cd llvm/projects + % git clone http://llvm.org/git/openmp.git + If you want to check out libcxx and libcxxabi (optional), run: .. code-block:: console @@ -634,7 +647,7 @@ To set up clone from which you can submit code using ``git-svn``, run: % git config svn-remote.svn.fetch :refs/remotes/origin/master % git svn rebase -l -Likewise for compiler-rt and test-suite. +Likewise for compiler-rt, libomp and test-suite. To update this clone without generating git-svn tags that conflict with the upstream Git repo, run: @@ -648,7 +661,7 @@ upstream Git repo, run: git checkout master && git svn rebase -l) -Likewise for compiler-rt and test-suite. +Likewise for compiler-rt, libomp and test-suite. This leaves your working directories on their master branches, so you'll need to ``checkout`` each working branch individually and ``rebase`` it on top of its @@ -853,7 +866,7 @@ with the latest Xcode: .. code-block:: console - % cmake -G "Ninja" -DCMAKE_OSX_ARCHITECTURES=“armv7;armv7s;arm64" + % cmake -G "Ninja" -DCMAKE_OSX_ARCHITECTURES="armv7;armv7s;arm64" -DCMAKE_TOOLCHAIN_FILE=/cmake/platforms/iOS.cmake -DCMAKE_BUILD_TYPE=Release -DLLVM_BUILD_RUNTIME=Off -DLLVM_INCLUDE_TESTS=Off -DLLVM_INCLUDE_EXAMPLES=Off -DLLVM_ENABLE_BACKTRACES=Off [options] diff --git a/docs/LangRef.rst b/docs/LangRef.rst index cf1ceab1f1c6..7f1a97428ee2 100644 --- a/docs/LangRef.rst +++ b/docs/LangRef.rst @@ -406,6 +406,26 @@ added in the future: This calling convention, like the `PreserveMost` calling convention, will be used by a future version of the ObjectiveC runtime and should be considered experimental at this time. +"``cxx_fast_tlscc``" - The `CXX_FAST_TLS` calling convention for access functions + Clang generates an access function to access C++-style TLS. The access + function generally has an entry block, an exit block and an initialization + block that is run the first time. The entry and exit blocks can access + a few TLS IR variables; each access will be lowered to a platform-specific + sequence. + + This calling convention aims to minimize overhead in the caller by + preserving as many registers as possible (all the registers that are + preserved on the fast path, composed of the entry and exit blocks). + + This calling convention behaves identically to the `C` calling convention on + how arguments and return values are passed, but it uses a different set of + caller/callee-saved registers. + + Given that each platform has its own lowering sequence, and hence its own set + of preserved registers, we can't use the existing `PreserveMost`.
+ + - On X86-64 the callee preserves all general purpose registers, except for RDI and RAX. "``cc ``" - Numbered convention Any calling convention may be specified by number, allowing target-specific calling conventions to be used. Target specific @@ -3731,9 +3751,9 @@ DICompileUnit """"""""""""" ``DICompileUnit`` nodes represent a compile unit. The ``enums:``, -``retainedTypes:``, ``subprograms:``, ``globals:`` and ``imports:`` fields are -tuples containing the debug info to be emitted along with the compile unit, -regardless of code optimizations (some nodes are only emitted if there are +``retainedTypes:``, ``subprograms:``, ``globals:``, ``imports:`` and ``macros:`` +fields are tuples containing the debug info to be emitted along with the compile +unit, regardless of code optimizations (some nodes are only emitted if there are references to them from instructions). .. code-block:: llvm @@ -3742,7 +3762,7 @@ references to them from instructions). isOptimized: true, flags: "-O2", runtimeVersion: 2, splitDebugFilename: "abc.debug", emissionKind: 1, enums: !2, retainedTypes: !3, subprograms: !4, - globals: !5, imports: !6) + globals: !5, imports: !6, macros: !7, dwoId: 0x0abcd) Compile unit descriptors provide the root scope for objects declared in a specific compilation unit. File descriptors are defined using this scope. @@ -4108,6 +4128,32 @@ compile unit. !2 = !DIImportedEntity(tag: DW_TAG_imported_module, name: "foo", scope: !0, entity: !1, line: 7) +DIMacro +""""""" + +``DIMacro`` nodes represent the definition or undefinition of a macro identifier. +The ``name:`` field is the macro identifier, followed by macro parameters when +defining a function-like macro, and the ``value`` field is the token-string +used to expand the macro identifier. + +.. code-block:: llvm + + !2 = !DIMacro(macinfo: DW_MACINFO_define, line: 7, name: "foo(x)", + value: "((x) + 1)") + !3 = !DIMacro(macinfo: DW_MACINFO_undef, line: 30, name: "foo") + +DIMacroFile +""""""""""" + +``DIMacroFile`` nodes represent inclusion of source files. +The ``nodes:`` field is a list of ``DIMacro`` and ``DIMacroFile`` nodes that +appear in the included source file. + +.. code-block:: llvm + + !2 = !DIMacroFile(macinfo: DW_MACINFO_start_file, line: 7, file: !2, + nodes: !3) + '``tbaa``' Metadata ^^^^^^^^^^^^^^^^^^^ diff --git a/docs/LibFuzzer.rst b/docs/LibFuzzer.rst index eb79b2e490c4..e14ea67d652c 100644 --- a/docs/LibFuzzer.rst +++ b/docs/LibFuzzer.rst @@ -483,6 +483,8 @@ Trophies * `Python `_ +* OpenSSL/BoringSSL: `[1] `_ + * `Libxml2 `_ diff --git a/docs/ReleaseProcess.rst b/docs/ReleaseProcess.rst index c4bbc91c63ce..d7f703126019 100644 --- a/docs/ReleaseProcess.rst +++ b/docs/ReleaseProcess.rst @@ -53,7 +53,7 @@ test-release.sh --------------- This script will check-out, configure and compile LLVM+Clang (+ most add-ons, like ``compiler-rt``, -``libcxx`` and ``clang-extra-tools``) in three stages, and will test the final stage. +``libcxx``, ``libomp`` and ``clang-extra-tools``) in three stages, and will test the final stage. It'll have installed the final binaries on the Phase3/Releasei(+Asserts) directory, and that's the one you should use for the test-suite and other external tests.
diff --git a/examples/Kaleidoscope/Orc/fully_lazy/toy.cpp b/examples/Kaleidoscope/Orc/fully_lazy/toy.cpp index 8ba76e86ee07..78184f5d32cd 100644 --- a/examples/Kaleidoscope/Orc/fully_lazy/toy.cpp +++ b/examples/Kaleidoscope/Orc/fully_lazy/toy.cpp @@ -1308,7 +1308,7 @@ class KaleidoscopeJIT { std::map> FunctionDefs; - JITCompileCallbackManager CompileCallbacks; + LocalJITCompileCallbackManager CompileCallbacks; }; static void HandleDefinition(SessionContext &S, KaleidoscopeJIT &J) { diff --git a/include/llvm/ADT/SetVector.h b/include/llvm/ADT/SetVector.h index 3ab202d8d71c..bc563570c203 100644 --- a/include/llvm/ADT/SetVector.h +++ b/include/llvm/ADT/SetVector.h @@ -58,6 +58,8 @@ class SetVector { insert(Start, End); } + ArrayRef getArrayRef() const { return vector_; } + /// \brief Determine if the SetVector is empty or not. bool empty() const { return vector_.empty(); diff --git a/include/llvm/Analysis/LazyCallGraph.h b/include/llvm/Analysis/LazyCallGraph.h index 7cbc40f768eb..270a32621be7 100644 --- a/include/llvm/Analysis/LazyCallGraph.h +++ b/include/llvm/Analysis/LazyCallGraph.h @@ -235,7 +235,7 @@ class LazyCallGraph { parent_iterator parent_end() const { return ParentSCCs.end(); } iterator_range parents() const { - return iterator_range(parent_begin(), parent_end()); + return make_range(parent_begin(), parent_end()); } /// \brief Test if this SCC is a parent of \a C. @@ -410,8 +410,7 @@ class LazyCallGraph { } iterator_range postorder_sccs() { - return iterator_range(postorder_scc_begin(), - postorder_scc_end()); + return make_range(postorder_scc_begin(), postorder_scc_end()); } /// \brief Lookup a function in the graph which has already been scanned and diff --git a/include/llvm/Analysis/LoopAccessAnalysis.h b/include/llvm/Analysis/LoopAccessAnalysis.h index 77d412a4f927..871d35e99b74 100644 --- a/include/llvm/Analysis/LoopAccessAnalysis.h +++ b/include/llvm/Analysis/LoopAccessAnalysis.h @@ -193,11 +193,10 @@ class MemoryDepChecker { const SmallVectorImpl &Instrs) const; }; - MemoryDepChecker(ScalarEvolution *Se, const Loop *L, - SCEVUnionPredicate &Preds) - : SE(Se), InnermostLoop(L), AccessIdx(0), + MemoryDepChecker(PredicatedScalarEvolution &PSE, const Loop *L) + : PSE(PSE), InnermostLoop(L), AccessIdx(0), ShouldRetryWithRuntimeCheck(false), SafeForVectorization(true), - RecordDependences(true), Preds(Preds) {} + RecordDependences(true) {} /// \brief Register the location (instructions are given increasing numbers) /// of a write access. @@ -266,7 +265,13 @@ class MemoryDepChecker { bool isWrite) const; private: - ScalarEvolution *SE; + /// A wrapper around ScalarEvolution, used to add runtime SCEV checks, and + /// applies dynamic knowledge to simplify SCEV expressions and convert them + /// to a more usable form. We need this in case assumptions about SCEV + /// expressions need to be made in order to avoid unknown dependences. For + /// example we might assume a unit stride for a pointer in order to prove + /// that a memory access is strided and doesn't wrap. + PredicatedScalarEvolution &PSE; const Loop *InnermostLoop; /// \brief Maps access locations (ptr, read/write) to program order. @@ -317,15 +322,6 @@ class MemoryDepChecker { /// \brief Check whether the data dependence could prevent store-load /// forwarding. bool couldPreventStoreLoadForward(unsigned Distance, unsigned TypeByteSize); - - /// The SCEV predicate containing all the SCEV-related assumptions. 
- /// The dependence checker needs this in order to convert SCEVs of pointers - /// to more accurate expressions in the context of existing assumptions. - /// We also need this in case assumptions about SCEV expressions need to - /// be made in order to avoid unknown dependences. For example we might - /// assume a unit stride for a pointer in order to prove that a memory access - /// is strided and doesn't wrap. - SCEVUnionPredicate &Preds; }; /// \brief Holds information about the memory runtime legality checks to verify @@ -373,7 +369,7 @@ class RuntimePointerChecking { /// and change \p Preds. void insert(Loop *Lp, Value *Ptr, bool WritePtr, unsigned DepSetId, unsigned ASId, const ValueToValueMap &Strides, - SCEVUnionPredicate &Preds); + PredicatedScalarEvolution &PSE); /// \brief No run-time memory checking is necessary. bool empty() const { return Pointers.empty(); } @@ -508,8 +504,8 @@ class RuntimePointerChecking { /// ScalarEvolution, we will generate run-time checks by emitting a /// SCEVUnionPredicate. /// -/// Checks for both memory dependences and SCEV predicates must be emitted in -/// order for the results of this analysis to be valid. +/// Checks for both memory dependences and the SCEV predicates contained in the +/// PSE must be emitted in order for the results of this analysis to be valid. class LoopAccessInfo { public: LoopAccessInfo(Loop *L, ScalarEvolution *SE, const DataLayout &DL, @@ -591,14 +587,12 @@ class LoopAccessInfo { return StoreToLoopInvariantAddress; } - /// The SCEV predicate contains all the SCEV-related assumptions. - /// The is used to keep track of the minimal set of assumptions on SCEV - /// expressions that the analysis needs to make in order to return a - /// meaningful result. All SCEV expressions during the analysis should be - /// re-written (and therefore simplified) according to Preds. + /// Used to add runtime SCEV checks. Simplifies SCEV expressions and converts + /// them to a more usable form. All SCEV expressions during the analysis + /// should be re-written (and therefore simplified) according to PSE. /// A user of LoopAccessAnalysis will need to emit the runtime checks /// associated with this predicate. - SCEVUnionPredicate Preds; + PredicatedScalarEvolution PSE; private: /// \brief Analyze the loop. Substitute symbolic strides using Strides. @@ -619,7 +613,6 @@ class LoopAccessInfo { MemoryDepChecker DepChecker; Loop *TheLoop; - ScalarEvolution *SE; const DataLayout &DL; const TargetLibraryInfo *TLI; AliasAnalysis *AA; @@ -654,18 +647,17 @@ Value *stripIntegerCast(Value *V); /// If \p OrigPtr is not null, use it to look up the stride value instead of \p /// Ptr. \p PtrToStride provides the mapping between the pointer value and its /// stride as collected by LoopVectorizationLegality::collectStridedAccess. -const SCEV *replaceSymbolicStrideSCEV(ScalarEvolution *SE, +const SCEV *replaceSymbolicStrideSCEV(PredicatedScalarEvolution &PSE, const ValueToValueMap &PtrToStride, - SCEVUnionPredicate &Preds, Value *Ptr, - Value *OrigPtr = nullptr); + Value *Ptr, Value *OrigPtr = nullptr); /// \brief Check the stride of the pointer and ensure that it does not wrap in /// the address space, assuming \p Preds is true. /// /// If necessary this method will version the stride of the pointer according /// to \p PtrToStride and therefore add a new predicate to \p Preds. 
-int isStridedPtr(ScalarEvolution *SE, Value *Ptr, const Loop *Lp, - const ValueToValueMap &StridesMap, SCEVUnionPredicate &Preds); +int isStridedPtr(PredicatedScalarEvolution &PSE, Value *Ptr, const Loop *Lp, + const ValueToValueMap &StridesMap); /// \brief This analysis provides dependence information for the memory accesses /// of a loop. diff --git a/include/llvm/Analysis/LoopInfo.h b/include/llvm/Analysis/LoopInfo.h index ac0a4b02f445..57695b46d640 100644 --- a/include/llvm/Analysis/LoopInfo.h +++ b/include/llvm/Analysis/LoopInfo.h @@ -37,6 +37,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" #include "llvm/Pass.h" #include @@ -141,7 +142,7 @@ class LoopBase { block_iterator block_begin() const { return Blocks.begin(); } block_iterator block_end() const { return Blocks.end(); } inline iterator_range blocks() const { - return iterator_range(block_begin(), block_end()); + return make_range(block_begin(), block_end()); } /// getNumBlocks - Get the number of blocks in this loop in constant time. @@ -401,6 +402,9 @@ class Loop : public LoopBase { /// isLCSSAForm - Return true if the Loop is in LCSSA form bool isLCSSAForm(DominatorTree &DT) const; + /// \brief Return true if this Loop and all inner subloops are in LCSSA form. + bool isRecursivelyLCSSAForm(DominatorTree &DT) const; + /// isLoopSimplifyForm - Return true if the Loop is in the form that /// the LoopSimplify form transforms loops to, which is sometimes called /// normal form. @@ -681,6 +685,78 @@ class LoopInfo : public LoopInfoBase { // it as a replacement will not break LCSSA form. return ToLoop->contains(getLoopFor(From->getParent())); } + + /// \brief Checks if moving a specific instruction can break LCSSA in any + /// loop. + /// + /// Return true if moving \p Inst to before \p NewLoc will break LCSSA, + /// assuming that the function containing \p Inst and \p NewLoc is currently + /// in LCSSA form. + bool movementPreservesLCSSAForm(Instruction *Inst, Instruction *NewLoc) { + assert(Inst->getFunction() == NewLoc->getFunction() && + "Can't reason about IPO!"); + + auto *OldBB = Inst->getParent(); + auto *NewBB = NewLoc->getParent(); + + // Movement within the same loop does not break LCSSA (the equality check is + // to avoid doing a hashtable lookup in case of intra-block movement). + if (OldBB == NewBB) + return true; + + auto *OldLoop = getLoopFor(OldBB); + auto *NewLoop = getLoopFor(NewBB); + + if (OldLoop == NewLoop) + return true; + + // Check if Outer contains Inner; with the null loop counting as the + // "outermost" loop. + auto Contains = [](const Loop *Outer, const Loop *Inner) { + return !Outer || Outer->contains(Inner); + }; + + // To check that the movement of Inst to before NewLoc does not break LCSSA, + // we need to check two sets of uses for possible LCSSA violations at + // NewLoc: the users of NewInst, and the operands of NewInst. + + // If we know we're hoisting Inst out of an inner loop to an outer loop, + // then the uses *of* Inst don't need to be checked. + + if (!Contains(NewLoop, OldLoop)) { + for (Use &U : Inst->uses()) { + auto *UI = cast(U.getUser()); + auto *UBB = isa(UI) ? cast(UI)->getIncomingBlock(U) + : UI->getParent(); + if (UBB != NewBB && getLoopFor(UBB) != NewLoop) + return false; + } + } + + // If we know we're sinking Inst from an outer loop into an inner loop, then + // the *operands* of Inst don't need to be checked. 
+ + if (!Contains(OldLoop, NewLoop)) { + // See below on why we can't handle phi nodes here. + if (isa(Inst)) + return false; + + for (Use &U : Inst->operands()) { + auto *DefI = dyn_cast(U.get()); + if (!DefI) + return false; + + // This would need adjustment if we allow Inst to be a phi node -- the + // new use block won't simply be NewBB. + + auto *DefBlock = DefI->getParent(); + if (DefBlock != NewBB && getLoopFor(DefBlock) != NewLoop) + return false; + } + } + + return true; + } }; // Allow clients to walk the list of nested loops... diff --git a/include/llvm/Analysis/ScalarEvolution.h b/include/llvm/Analysis/ScalarEvolution.h index f674cc7ee56f..15565daf6d3b 100644 --- a/include/llvm/Analysis/ScalarEvolution.h +++ b/include/llvm/Analysis/ScalarEvolution.h @@ -1324,6 +1324,59 @@ namespace llvm { void print(raw_ostream &OS, const Module * = nullptr) const override; void verifyAnalysis() const override; }; + + /// An interface layer with SCEV used to manage how we see SCEV expressions + /// for values in the context of existing predicates. We can add new + /// predicates, but we cannot remove them. + /// + /// This layer has multiple purposes: + /// - provides a simple interface for SCEV versioning. + /// - guarantees that the order of transformations applied on a SCEV + /// expression for a single Value is consistent across two different + /// getSCEV calls. This means that, for example, once we've obtained + /// an AddRec expression for a certain value through expression + /// rewriting, we will continue to get an AddRec expression for that + /// Value. + /// - lowers the number of expression rewrites. + class PredicatedScalarEvolution { + public: + PredicatedScalarEvolution(ScalarEvolution &SE); + const SCEVUnionPredicate &getUnionPredicate() const; + /// \brief Returns the SCEV expression of V, in the context of the current + /// SCEV predicate. + /// The order of transformations applied on the expression of V returned + /// by ScalarEvolution is guaranteed to be preserved, even when adding new + /// predicates. + const SCEV *getSCEV(Value *V); + /// \brief Adds a new predicate. + void addPredicate(const SCEVPredicate &Pred); + /// \brief Returns the ScalarEvolution analysis used. + ScalarEvolution *getSE() const { return &SE; } + + private: + /// \brief Increments the version number of the predicate. + /// This needs to be called every time the SCEV predicate changes. + void updateGeneration(); + /// Holds a SCEV and the version number of the SCEV predicate used to + /// perform the rewrite of the expression. + typedef std::pair RewriteEntry; + /// Maps a SCEV to the rewrite result of that SCEV at a certain version + /// number. If this number doesn't match the current Generation, we will + /// need to do a rewrite. To preserve the transformation order of previous + /// rewrites, we will rewrite the previous result instead of the original + /// SCEV. + DenseMap RewriteMap; + /// The ScalarEvolution analysis. + ScalarEvolution &SE; + /// The SCEVPredicate that forms our context. We will rewrite all + /// expressions assuming that this predicate true. + SCEVUnionPredicate Preds; + /// Marks the version of the SCEV predicate used. When rewriting a SCEV + /// expression we mark it with the version of the predicate. We use this to + /// figure out if the predicate has changed from the last rewrite of the + /// SCEV. If so, we need to perform a new rewrite. 
+ unsigned Generation; + }; } #endif diff --git a/include/llvm/Analysis/TargetLibraryInfo.def b/include/llvm/Analysis/TargetLibraryInfo.def index 393e9d6695f8..7798e3c88248 100644 --- a/include/llvm/Analysis/TargetLibraryInfo.def +++ b/include/llvm/Analysis/TargetLibraryInfo.def @@ -27,6 +27,86 @@ #define TLI_DEFINE_STRING_INTERNAL(string_repr) string_repr, #endif +/// void *new(unsigned int); +TLI_DEFINE_ENUM_INTERNAL(msvc_new_int) +TLI_DEFINE_STRING_INTERNAL("??2@YAPAXI@Z") + +/// void *new(unsigned int, nothrow); +TLI_DEFINE_ENUM_INTERNAL(msvc_new_int_nothrow) +TLI_DEFINE_STRING_INTERNAL("??2@YAPAXIABUnothrow_t@std@@@Z") + +/// void *new(unsigned long long); +TLI_DEFINE_ENUM_INTERNAL(msvc_new_longlong) +TLI_DEFINE_STRING_INTERNAL("??2@YAPEAX_K@Z") + +/// void *new(unsigned long long, nothrow); +TLI_DEFINE_ENUM_INTERNAL(msvc_new_longlong_nothrow) +TLI_DEFINE_STRING_INTERNAL("??2@YAPEAX_KAEBUnothrow_t@std@@@Z") + +/// void operator delete(void*); +TLI_DEFINE_ENUM_INTERNAL(msvc_delete_ptr32) +TLI_DEFINE_STRING_INTERNAL("??3@YAXPAX@Z") + +/// void operator delete(void*, nothrow); +TLI_DEFINE_ENUM_INTERNAL(msvc_delete_ptr32_nothrow) +TLI_DEFINE_STRING_INTERNAL("??3@YAXPAXABUnothrow_t@std@@@Z") + +/// void operator delete(void*, unsigned int); +TLI_DEFINE_ENUM_INTERNAL(msvc_delete_ptr32_int) +TLI_DEFINE_STRING_INTERNAL("??3@YAXPAXI@Z") + +/// void operator delete(void*); +TLI_DEFINE_ENUM_INTERNAL(msvc_delete_ptr64) +TLI_DEFINE_STRING_INTERNAL("??3@YAXPEAX@Z") + +/// void operator delete(void*, nothrow); +TLI_DEFINE_ENUM_INTERNAL(msvc_delete_ptr64_nothrow) +TLI_DEFINE_STRING_INTERNAL("??3@YAXPEAXAEBUnothrow_t@std@@@Z") + +/// void operator delete(void*, unsigned long long); +TLI_DEFINE_ENUM_INTERNAL(msvc_delete_ptr64_longlong) +TLI_DEFINE_STRING_INTERNAL("??3@YAXPEAX_K@Z") + +/// void *new[](unsigned int); +TLI_DEFINE_ENUM_INTERNAL(msvc_new_array_int) +TLI_DEFINE_STRING_INTERNAL("??_U@YAPAXI@Z") + +/// void *new[](unsigned int, nothrow); +TLI_DEFINE_ENUM_INTERNAL(msvc_new_array_int_nothrow) +TLI_DEFINE_STRING_INTERNAL("??_U@YAPAXIABUnothrow_t@std@@@Z") + +/// void *new[](unsigned long long); +TLI_DEFINE_ENUM_INTERNAL(msvc_new_array_longlong) +TLI_DEFINE_STRING_INTERNAL("??_U@YAPEAX_K@Z") + +/// void *new[](unsigned long long, nothrow); +TLI_DEFINE_ENUM_INTERNAL(msvc_new_array_longlong_nothrow) +TLI_DEFINE_STRING_INTERNAL("??_U@YAPEAX_KAEBUnothrow_t@std@@@Z") + +/// void operator delete[](void*); +TLI_DEFINE_ENUM_INTERNAL(msvc_delete_array_ptr32) +TLI_DEFINE_STRING_INTERNAL("??_V@YAXPAX@Z") + +/// void operator delete[](void*, nothrow); +TLI_DEFINE_ENUM_INTERNAL(msvc_delete_array_ptr32_nothrow) +TLI_DEFINE_STRING_INTERNAL("??_V@YAXPAXABUnothrow_t@std@@@Z") + +/// void operator delete[](void*, unsigned int); +TLI_DEFINE_ENUM_INTERNAL(msvc_delete_array_ptr32_int) +TLI_DEFINE_STRING_INTERNAL("??_V@YAXPAXI@Z") + +/// void operator delete[](void*); +TLI_DEFINE_ENUM_INTERNAL(msvc_delete_array_ptr64) +TLI_DEFINE_STRING_INTERNAL("??_V@YAXPEAX@Z") + +/// void operator delete[](void*, nothrow); +TLI_DEFINE_ENUM_INTERNAL(msvc_delete_array_ptr64_nothrow) +TLI_DEFINE_STRING_INTERNAL("??_V@YAXPEAXAEBUnothrow_t@std@@@Z") + +/// void operator delete[](void*, unsigned long long); +TLI_DEFINE_ENUM_INTERNAL(msvc_delete_array_ptr64_longlong) +TLI_DEFINE_STRING_INTERNAL("??_V@YAXPEAX_K@Z") + /// int _IO_getc(_IO_FILE * __fp); TLI_DEFINE_ENUM_INTERNAL(under_IO_getc) TLI_DEFINE_STRING_INTERNAL("_IO_getc") @@ -673,6 +753,7 @@ TLI_DEFINE_STRING_INTERNAL("modff") /// long double modfl(long double value, long double 
*iptr); TLI_DEFINE_ENUM_INTERNAL(modfl) TLI_DEFINE_STRING_INTERNAL("modfl") + /// double nearbyint(double x); TLI_DEFINE_ENUM_INTERNAL(nearbyint) TLI_DEFINE_STRING_INTERNAL("nearbyint") diff --git a/include/llvm/Analysis/TargetTransformInfo.h b/include/llvm/Analysis/TargetTransformInfo.h index 98458f1c3f3b..35c756b362d6 100644 --- a/include/llvm/Analysis/TargetTransformInfo.h +++ b/include/llvm/Analysis/TargetTransformInfo.h @@ -42,11 +42,13 @@ class Value; /// \brief Information about a load/store intrinsic defined by the target. struct MemIntrinsicInfo { MemIntrinsicInfo() - : ReadMem(false), WriteMem(false), Vol(false), MatchingId(0), + : ReadMem(false), WriteMem(false), IsSimple(false), MatchingId(0), NumMemRefs(0), PtrVal(nullptr) {} bool ReadMem; bool WriteMem; - bool Vol; + /// True only if this memory operation is non-volatile, non-atomic, and + /// unordered. (See LoadInst/StoreInst for details on each) + bool IsSimple; // Same Id is set by the target for corresponding load/store intrinsics. unsigned short MatchingId; int NumMemRefs; diff --git a/include/llvm/Analysis/ValueTracking.h b/include/llvm/Analysis/ValueTracking.h index b34d6bac1f34..8e0291068472 100644 --- a/include/llvm/Analysis/ValueTracking.h +++ b/include/llvm/Analysis/ValueTracking.h @@ -286,7 +286,7 @@ namespace llvm { /// Returns true if the result or effects of the given instructions \p I /// depend on or influence global memory. - /// Memory dependence arises for example if the the instruction reads from + /// Memory dependence arises for example if the instruction reads from /// memory or may produce effects or undefined behaviour. Memory dependent /// instructions generally cannot be reorderd with respect to other memory /// dependent instructions or moved into non-dominated basic blocks. @@ -412,6 +412,11 @@ namespace llvm { bool Ordered; /// When implementing this min/max pattern as /// fcmp; select, does the fcmp have to be /// ordered? + + /// \brief Return true if \p SPF is a min or a max pattern. + static bool isMinOrMax(SelectPatternFlavor SPF) { + return !(SPF == SPF_UNKNOWN || SPF == SPF_ABS || SPF == SPF_NABS); + } }; /// Pattern match integer [SU]MIN, [SU]MAX and ABS idioms, returning the kind /// and providing the out parameter results if we successfully match. diff --git a/include/llvm/Bitcode/LLVMBitCodes.h b/include/llvm/Bitcode/LLVMBitCodes.h index 7cb0d7edff48..55fe05938e61 100644 --- a/include/llvm/Bitcode/LLVMBitCodes.h +++ b/include/llvm/Bitcode/LLVMBitCodes.h @@ -220,7 +220,9 @@ enum { BITCODE_CURRENT_EPOCH = 0 }; METADATA_EXPRESSION = 29, // [distinct, n x element] METADATA_OBJC_PROPERTY = 30, // [distinct, name, file, line, ...] METADATA_IMPORTED_ENTITY=31, // [distinct, tag, scope, entity, line, name] - METADATA_MODULE=32, // [distinct, scope, name, ...] + METADATA_MODULE = 32, // [distinct, scope, name, ...] + METADATA_MACRO = 33, // [distinct, macinfo, line, name, value] + METADATA_MACRO_FILE = 34, // [distinct, macinfo, line, file, ...] }; // The constants block (CONSTANTS_BLOCK_ID) describes emission for each diff --git a/include/llvm/CodeGen/BasicTTIImpl.h b/include/llvm/CodeGen/BasicTTIImpl.h index e2245e9984b8..ec311a093869 100644 --- a/include/llvm/CodeGen/BasicTTIImpl.h +++ b/include/llvm/CodeGen/BasicTTIImpl.h @@ -302,12 +302,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { if (TLI->isOperationLegalOrPromote(ISD, LT.second)) { // The operation is legal. Assume it costs 1. 
- // If the type is split to multiple registers, assume that there is some - // overhead to this. // TODO: Once we have extract/insert subvector cost we need to use them. - if (LT.first > 1) - return LT.first * 2 * OpCost; - return LT.first * 1 * OpCost; + return LT.first * OpCost; } if (!TLI->isOperationExpand(ISD, LT.second)) { diff --git a/include/llvm/CodeGen/CallingConvLower.h b/include/llvm/CodeGen/CallingConvLower.h index 9df41dd0257c..415abb90da57 100644 --- a/include/llvm/CodeGen/CallingConvLower.h +++ b/include/llvm/CodeGen/CallingConvLower.h @@ -369,7 +369,7 @@ class CCState { /// AllocateRegBlock - Attempt to allocate a block of RegsRequired consecutive /// registers. If this is not possible, return zero. Otherwise, return the first /// register of the block that were allocated, marking the entire block as allocated. - unsigned AllocateRegBlock(ArrayRef Regs, unsigned RegsRequired) { + unsigned AllocateRegBlock(ArrayRef Regs, unsigned RegsRequired) { if (RegsRequired > Regs.size()) return 0; diff --git a/include/llvm/CodeGen/MachineBasicBlock.h b/include/llvm/CodeGen/MachineBasicBlock.h index ac87f4f901f5..57bd24ddddfe 100644 --- a/include/llvm/CodeGen/MachineBasicBlock.h +++ b/include/llvm/CodeGen/MachineBasicBlock.h @@ -272,10 +272,10 @@ class MachineBasicBlock } inline iterator_range terminators() { - return iterator_range(getFirstTerminator(), end()); + return make_range(getFirstTerminator(), end()); } inline iterator_range terminators() const { - return iterator_range(getFirstTerminator(), end()); + return make_range(getFirstTerminator(), end()); } // Machine-CFG iterators @@ -325,16 +325,16 @@ class MachineBasicBlock bool succ_empty() const { return Successors.empty(); } inline iterator_range predecessors() { - return iterator_range(pred_begin(), pred_end()); + return make_range(pred_begin(), pred_end()); } inline iterator_range predecessors() const { - return iterator_range(pred_begin(), pred_end()); + return make_range(pred_begin(), pred_end()); } inline iterator_range successors() { - return iterator_range(succ_begin(), succ_end()); + return make_range(succ_begin(), succ_end()); } inline iterator_range successors() const { - return iterator_range(succ_begin(), succ_end()); + return make_range(succ_begin(), succ_end()); } // LiveIn management methods. 
diff --git a/include/llvm/CodeGen/MachineInstr.h b/include/llvm/CodeGen/MachineInstr.h index 607e2781960f..07b1133b2a4a 100644 --- a/include/llvm/CodeGen/MachineInstr.h +++ b/include/llvm/CodeGen/MachineInstr.h @@ -296,48 +296,46 @@ class MachineInstr const_mop_iterator operands_end() const { return Operands + NumOperands; } iterator_range operands() { - return iterator_range(operands_begin(), operands_end()); + return make_range(operands_begin(), operands_end()); } iterator_range operands() const { - return iterator_range(operands_begin(), operands_end()); + return make_range(operands_begin(), operands_end()); } iterator_range explicit_operands() { - return iterator_range( - operands_begin(), operands_begin() + getNumExplicitOperands()); + return make_range(operands_begin(), + operands_begin() + getNumExplicitOperands()); } iterator_range explicit_operands() const { - return iterator_range( - operands_begin(), operands_begin() + getNumExplicitOperands()); + return make_range(operands_begin(), + operands_begin() + getNumExplicitOperands()); } iterator_range implicit_operands() { - return iterator_range(explicit_operands().end(), - operands_end()); + return make_range(explicit_operands().end(), operands_end()); } iterator_range implicit_operands() const { - return iterator_range(explicit_operands().end(), - operands_end()); + return make_range(explicit_operands().end(), operands_end()); } /// Returns a range over all explicit operands that are register definitions. /// Implicit definition are not included! iterator_range defs() { - return iterator_range( - operands_begin(), operands_begin() + getDesc().getNumDefs()); + return make_range(operands_begin(), + operands_begin() + getDesc().getNumDefs()); } /// \copydoc defs() iterator_range defs() const { - return iterator_range( - operands_begin(), operands_begin() + getDesc().getNumDefs()); + return make_range(operands_begin(), + operands_begin() + getDesc().getNumDefs()); } /// Returns a range that includes all operands that are register uses. /// This may include unrelated operands which are not register uses. iterator_range uses() { - return iterator_range( - operands_begin() + getDesc().getNumDefs(), operands_end()); + return make_range(operands_begin() + getDesc().getNumDefs(), + operands_end()); } /// \copydoc uses() iterator_range uses() const { - return iterator_range( - operands_begin() + getDesc().getNumDefs(), operands_end()); + return make_range(operands_begin() + getDesc().getNumDefs(), + operands_end()); } /// Returns the number of the operand iterator \p I points to. @@ -351,10 +349,10 @@ class MachineInstr bool memoperands_empty() const { return NumMemRefs == 0; } iterator_range memoperands() { - return iterator_range(memoperands_begin(), memoperands_end()); + return make_range(memoperands_begin(), memoperands_end()); } iterator_range memoperands() const { - return iterator_range(memoperands_begin(), memoperands_end()); + return make_range(memoperands_begin(), memoperands_end()); } /// Return true if this instruction has exactly one MachineMemOperand. diff --git a/include/llvm/CodeGen/MachineInstrBundle.h b/include/llvm/CodeGen/MachineInstrBundle.h index 0ce42dda55bd..4ec3c189ae03 100644 --- a/include/llvm/CodeGen/MachineInstrBundle.h +++ b/include/llvm/CodeGen/MachineInstrBundle.h @@ -174,7 +174,7 @@ class MachineOperandIteratorBase { /// Defines - Reg or a super-register is defined. bool Defines; - /// Reads - Read or a super-register is read. + /// Reads - Reg or a super-register is read. 
bool Reads; /// ReadsOverlap - Reg or an overlapping register is read. diff --git a/include/llvm/CodeGen/MachineModuleInfo.h b/include/llvm/CodeGen/MachineModuleInfo.h index 43b9f5203c50..acd6440eb358 100644 --- a/include/llvm/CodeGen/MachineModuleInfo.h +++ b/include/llvm/CodeGen/MachineModuleInfo.h @@ -161,6 +161,12 @@ class MachineModuleInfo : public ImmutablePass { bool CallsUnwindInit; bool HasEHFunclets; + // TODO: Ideally, what we'd like is to have a switch that allows emitting + // synchronous (precise at call-sites only) CFA into .eh_frame. However, + // even under this switch, we'd like .debug_frame to be precise when using. + // -g. At this moment, there's no way to specify that some CFI directives + // go into .eh_frame only, while others go into .debug_frame only. + /// DbgInfoAvailable - True if debugging information is available /// in this module. bool DbgInfoAvailable; @@ -235,11 +241,6 @@ class MachineModuleInfo : public ImmutablePass { bool hasDebugInfo() const { return DbgInfoAvailable; } void setDebugInfoAvailability(bool avail) { DbgInfoAvailable = avail; } - // Returns true if we need to generate precise CFI. Currently - // this is equivalent to hasDebugInfo(), but if we ever implement - // async EH, it will require precise CFI as well. - bool usePreciseUnwindInfo() const { return hasDebugInfo(); } - bool callsEHReturn() const { return CallsEHReturn; } void setCallsEHReturn(bool b) { CallsEHReturn = b; } diff --git a/include/llvm/CodeGen/MachineRegisterInfo.h b/include/llvm/CodeGen/MachineRegisterInfo.h index 0a1f62006327..04191bc1b74f 100644 --- a/include/llvm/CodeGen/MachineRegisterInfo.h +++ b/include/llvm/CodeGen/MachineRegisterInfo.h @@ -234,7 +234,7 @@ class MachineRegisterInfo { static reg_iterator reg_end() { return reg_iterator(nullptr); } inline iterator_range reg_operands(unsigned Reg) const { - return iterator_range(reg_begin(Reg), reg_end()); + return make_range(reg_begin(Reg), reg_end()); } /// reg_instr_iterator/reg_instr_begin/reg_instr_end - Walk all defs and uses @@ -250,8 +250,7 @@ class MachineRegisterInfo { inline iterator_range reg_instructions(unsigned Reg) const { - return iterator_range(reg_instr_begin(Reg), - reg_instr_end()); + return make_range(reg_instr_begin(Reg), reg_instr_end()); } /// reg_bundle_iterator/reg_bundle_begin/reg_bundle_end - Walk all defs and uses @@ -266,8 +265,7 @@ class MachineRegisterInfo { } inline iterator_range reg_bundles(unsigned Reg) const { - return iterator_range(reg_bundle_begin(Reg), - reg_bundle_end()); + return make_range(reg_bundle_begin(Reg), reg_bundle_end()); } /// reg_empty - Return true if there are no instructions using or defining the @@ -287,8 +285,7 @@ class MachineRegisterInfo { inline iterator_range reg_nodbg_operands(unsigned Reg) const { - return iterator_range(reg_nodbg_begin(Reg), - reg_nodbg_end()); + return make_range(reg_nodbg_begin(Reg), reg_nodbg_end()); } /// reg_instr_nodbg_iterator/reg_instr_nodbg_begin/reg_instr_nodbg_end - Walk @@ -305,8 +302,7 @@ class MachineRegisterInfo { inline iterator_range reg_nodbg_instructions(unsigned Reg) const { - return iterator_range(reg_instr_nodbg_begin(Reg), - reg_instr_nodbg_end()); + return make_range(reg_instr_nodbg_begin(Reg), reg_instr_nodbg_end()); } /// reg_bundle_nodbg_iterator/reg_bundle_nodbg_begin/reg_bundle_nodbg_end - Walk @@ -323,8 +319,7 @@ class MachineRegisterInfo { inline iterator_range reg_nodbg_bundles(unsigned Reg) const { - return iterator_range(reg_bundle_nodbg_begin(Reg), - reg_bundle_nodbg_end()); + return 
make_range(reg_bundle_nodbg_begin(Reg), reg_bundle_nodbg_end()); } /// reg_nodbg_empty - Return true if the only instructions using or defining @@ -342,7 +337,7 @@ class MachineRegisterInfo { static def_iterator def_end() { return def_iterator(nullptr); } inline iterator_range def_operands(unsigned Reg) const { - return iterator_range(def_begin(Reg), def_end()); + return make_range(def_begin(Reg), def_end()); } /// def_instr_iterator/def_instr_begin/def_instr_end - Walk all defs of the @@ -358,8 +353,7 @@ class MachineRegisterInfo { inline iterator_range def_instructions(unsigned Reg) const { - return iterator_range(def_instr_begin(Reg), - def_instr_end()); + return make_range(def_instr_begin(Reg), def_instr_end()); } /// def_bundle_iterator/def_bundle_begin/def_bundle_end - Walk all defs of the @@ -374,8 +368,7 @@ class MachineRegisterInfo { } inline iterator_range def_bundles(unsigned Reg) const { - return iterator_range(def_bundle_begin(Reg), - def_bundle_end()); + return make_range(def_bundle_begin(Reg), def_bundle_end()); } /// def_empty - Return true if there are no instructions defining the @@ -400,7 +393,7 @@ class MachineRegisterInfo { static use_iterator use_end() { return use_iterator(nullptr); } inline iterator_range use_operands(unsigned Reg) const { - return iterator_range(use_begin(Reg), use_end()); + return make_range(use_begin(Reg), use_end()); } /// use_instr_iterator/use_instr_begin/use_instr_end - Walk all uses of the @@ -416,8 +409,7 @@ class MachineRegisterInfo { inline iterator_range use_instructions(unsigned Reg) const { - return iterator_range(use_instr_begin(Reg), - use_instr_end()); + return make_range(use_instr_begin(Reg), use_instr_end()); } /// use_bundle_iterator/use_bundle_begin/use_bundle_end - Walk all uses of the @@ -432,8 +424,7 @@ class MachineRegisterInfo { } inline iterator_range use_bundles(unsigned Reg) const { - return iterator_range(use_bundle_begin(Reg), - use_bundle_end()); + return make_range(use_bundle_begin(Reg), use_bundle_end()); } /// use_empty - Return true if there are no instructions using the specified @@ -462,8 +453,7 @@ class MachineRegisterInfo { inline iterator_range use_nodbg_operands(unsigned Reg) const { - return iterator_range(use_nodbg_begin(Reg), - use_nodbg_end()); + return make_range(use_nodbg_begin(Reg), use_nodbg_end()); } /// use_instr_nodbg_iterator/use_instr_nodbg_begin/use_instr_nodbg_end - Walk @@ -480,8 +470,7 @@ class MachineRegisterInfo { inline iterator_range use_nodbg_instructions(unsigned Reg) const { - return iterator_range(use_instr_nodbg_begin(Reg), - use_instr_nodbg_end()); + return make_range(use_instr_nodbg_begin(Reg), use_instr_nodbg_end()); } /// use_bundle_nodbg_iterator/use_bundle_nodbg_begin/use_bundle_nodbg_end - Walk @@ -498,8 +487,7 @@ class MachineRegisterInfo { inline iterator_range use_nodbg_bundles(unsigned Reg) const { - return iterator_range(use_bundle_nodbg_begin(Reg), - use_bundle_nodbg_end()); + return make_range(use_bundle_nodbg_begin(Reg), use_bundle_nodbg_end()); } /// use_nodbg_empty - Return true if there are no non-Debug instructions diff --git a/include/llvm/CodeGen/MachineScheduler.h b/include/llvm/CodeGen/MachineScheduler.h index 3a510084f65a..358fd5a3732a 100644 --- a/include/llvm/CodeGen/MachineScheduler.h +++ b/include/llvm/CodeGen/MachineScheduler.h @@ -228,6 +228,7 @@ class ScheduleDAGMutation { class ScheduleDAGMI : public ScheduleDAGInstrs { protected: AliasAnalysis *AA; + LiveIntervals *LIS; std::unique_ptr SchedImpl; /// Topo - A topological ordering for SUnits which 
permits fast IsReachable @@ -255,9 +256,10 @@ class ScheduleDAGMI : public ScheduleDAGInstrs { public: ScheduleDAGMI(MachineSchedContext *C, std::unique_ptr S, bool RemoveKillFlags) - : ScheduleDAGInstrs(*C->MF, C->MLI, C->LIS, RemoveKillFlags), - AA(C->AA), SchedImpl(std::move(S)), Topo(SUnits, &ExitSU), CurrentTop(), - CurrentBottom(), NextClusterPred(nullptr), NextClusterSucc(nullptr) { + : ScheduleDAGInstrs(*C->MF, C->MLI, RemoveKillFlags), AA(C->AA), + LIS(C->LIS), SchedImpl(std::move(S)), Topo(SUnits, &ExitSU), + CurrentTop(), CurrentBottom(), NextClusterPred(nullptr), + NextClusterSucc(nullptr) { #ifndef NDEBUG NumInstrsScheduled = 0; #endif @@ -266,6 +268,9 @@ class ScheduleDAGMI : public ScheduleDAGInstrs { // Provide a vtable anchor ~ScheduleDAGMI() override; + // Returns LiveIntervals instance for use in DAG mutators and such. + LiveIntervals *getLIS() const { return LIS; } + /// Return true if this DAG supports VReg liveness and RegPressure. virtual bool hasVRegLiveness() const { return false; } diff --git a/include/llvm/CodeGen/RegisterPressure.h b/include/llvm/CodeGen/RegisterPressure.h index e296701d8e8c..987634fb36c3 100644 --- a/include/llvm/CodeGen/RegisterPressure.h +++ b/include/llvm/CodeGen/RegisterPressure.h @@ -441,8 +441,6 @@ class RegPressureTracker { /// after the current position. SlotIndex getCurrSlot() const; - const LiveRange *getLiveRange(unsigned Reg) const; - void increaseRegPressure(ArrayRef Regs); void decreaseRegPressure(ArrayRef Regs); diff --git a/include/llvm/CodeGen/ScheduleDAGInstrs.h b/include/llvm/CodeGen/ScheduleDAGInstrs.h index 1446f2ac082b..c574df094911 100644 --- a/include/llvm/CodeGen/ScheduleDAGInstrs.h +++ b/include/llvm/CodeGen/ScheduleDAGInstrs.h @@ -26,22 +26,32 @@ namespace llvm { class MachineFrameInfo; class MachineLoopInfo; class MachineDominatorTree; - class LiveIntervals; class RegPressureTracker; class PressureDiffs; /// An individual mapping from virtual register number to SUnit. struct VReg2SUnit { unsigned VirtReg; + LaneBitmask LaneMask; SUnit *SU; - VReg2SUnit(unsigned reg, SUnit *su): VirtReg(reg), SU(su) {} + VReg2SUnit(unsigned VReg, LaneBitmask LaneMask, SUnit *SU) + : VirtReg(VReg), LaneMask(LaneMask), SU(SU) {} unsigned getSparseSetIndex() const { return TargetRegisterInfo::virtReg2Index(VirtReg); } }; + /// Mapping from virtual register to SUnit including an operand index. + struct VReg2SUnitOperIdx : public VReg2SUnit { + unsigned OperandIndex; + + VReg2SUnitOperIdx(unsigned VReg, LaneBitmask LaneMask, + unsigned OperandIndex, SUnit *SU) + : VReg2SUnit(VReg, LaneMask, SU), OperandIndex(OperandIndex) {} + }; + /// Record a physical register access. /// For non-data-dependent uses, OpIdx == -1. struct PhysRegSUOper { @@ -69,7 +79,10 @@ namespace llvm { /// Track local uses of virtual registers. These uses are gathered by the DAG /// builder and may be consulted by the scheduler to avoid iterating an entire /// vreg use list. - typedef SparseMultiSet VReg2UseMap; + typedef SparseMultiSet VReg2SUnitMultiMap; + + typedef SparseMultiSet + VReg2SUnitOperIdxMultiMap; /// ScheduleDAGInstrs - A ScheduleDAG subclass for scheduling lists of /// MachineInstrs. @@ -78,9 +91,6 @@ namespace llvm { const MachineLoopInfo *MLI; const MachineFrameInfo *MFI; - /// Live Intervals provides reaching defs in preRA scheduling. - LiveIntervals *LIS; - /// TargetSchedModel provides an interface to the machine model. 
TargetSchedModel SchedModel; @@ -95,6 +105,9 @@ namespace llvm { /// it has taken responsibility for scheduling the terminator correctly. bool CanHandleTerminators; + /// Whether lane masks should get tracked. + bool TrackLaneMasks; + /// State specific to the current scheduling region. /// ------------------------------------------------ @@ -117,7 +130,7 @@ namespace llvm { /// After calling BuildSchedGraph, each vreg used in the scheduling region /// is mapped to a set of SUnits. These include all local vreg uses, not /// just the uses for a singly defined vreg. - VReg2UseMap VRegUses; + VReg2SUnitMultiMap VRegUses; /// State internal to DAG building. /// ------------------------------- @@ -129,8 +142,12 @@ namespace llvm { Reg2SUnitsMap Defs; Reg2SUnitsMap Uses; - /// Track the last instruction in this region defining each virtual register. - VReg2SUnitMap VRegDefs; + /// Tracks the last instruction(s) in this region defining each virtual + /// register. There may be multiple current definitions for a register with + /// disjunct lanemasks. + VReg2SUnitMultiMap CurrentVRegDefs; + /// Tracks the last instructions in this region using each virtual register. + VReg2SUnitOperIdxMultiMap CurrentVRegUses; /// PendingLoads - Remember where unknown loads are after the most recent /// unknown store, as we iterate. As with Defs and Uses, this is here @@ -151,14 +168,10 @@ namespace llvm { public: explicit ScheduleDAGInstrs(MachineFunction &mf, const MachineLoopInfo *mli, - LiveIntervals *LIS = nullptr, bool RemoveKillFlags = false); ~ScheduleDAGInstrs() override {} - /// \brief Expose LiveIntervals for use in DAG mutators and such. - LiveIntervals *getLIS() const { return LIS; } - /// \brief Get the machine model for instruction scheduling. const TargetSchedModel *getSchedModel() const { return &SchedModel; } @@ -200,7 +213,8 @@ namespace llvm { /// input. void buildSchedGraph(AliasAnalysis *AA, RegPressureTracker *RPTracker = nullptr, - PressureDiffs *PDiffs = nullptr); + PressureDiffs *PDiffs = nullptr, + bool TrackLaneMasks = false); /// addSchedBarrierDeps - Add dependencies from instructions in the current /// list of instructions being scheduled to scheduling barrier. We want to @@ -247,6 +261,12 @@ namespace llvm { /// Other adjustments may be made to the instruction if necessary. Return /// true if the operand has been deleted, false if not. bool toggleKillFlag(MachineInstr *MI, MachineOperand &MO); + + /// Returns a mask for which lanes get read/written by the given (register) + /// machine operand. + LaneBitmask getLaneMaskForMO(const MachineOperand &MO) const; + + void collectVRegUses(SUnit *SU); }; /// newSUnit - Creates a new SUnit and return a ptr to it. diff --git a/include/llvm/CodeGen/SelectionDAG.h b/include/llvm/CodeGen/SelectionDAG.h index 83464a6c9785..a21e9ae881a7 100644 --- a/include/llvm/CodeGen/SelectionDAG.h +++ b/include/llvm/CodeGen/SelectionDAG.h @@ -326,11 +326,10 @@ class SelectionDAG { } iterator_range allnodes() { - return iterator_range(allnodes_begin(), allnodes_end()); + return make_range(allnodes_begin(), allnodes_end()); } iterator_range allnodes() const { - return iterator_range(allnodes_begin(), - allnodes_end()); + return make_range(allnodes_begin(), allnodes_end()); } /// Return the root tag of the SelectionDAG. 
diff --git a/include/llvm/CodeGen/SelectionDAGNodes.h b/include/llvm/CodeGen/SelectionDAGNodes.h index 66ed5703fe11..548549ab1353 100644 --- a/include/llvm/CodeGen/SelectionDAGNodes.h +++ b/include/llvm/CodeGen/SelectionDAGNodes.h @@ -576,10 +576,10 @@ class SDNode : public FoldingSetNode, public ilist_node { static use_iterator use_end() { return use_iterator(nullptr); } inline iterator_range uses() { - return iterator_range(use_begin(), use_end()); + return make_range(use_begin(), use_end()); } inline iterator_range uses() const { - return iterator_range(use_begin(), use_end()); + return make_range(use_begin(), use_end()); } /// Return true if there are exactly NUSES uses of the indicated value. @@ -651,8 +651,8 @@ class SDNode : public FoldingSetNode, public ilist_node { }; iterator_range op_values() const { - return iterator_range(value_op_iterator(op_begin()), - value_op_iterator(op_end())); + return make_range(value_op_iterator(op_begin()), + value_op_iterator(op_end())); } SDVTList getVTList() const { @@ -664,7 +664,7 @@ class SDNode : public FoldingSetNode, public ilist_node { /// to which the glue operand points. Otherwise return NULL. SDNode *getGluedNode() const { if (getNumOperands() != 0 && - getOperand(getNumOperands()-1).getValueType() == MVT::Glue) + getOperand(getNumOperands()-1).getValueType() == MVT::Glue) return getOperand(getNumOperands()-1).getNode(); return nullptr; } diff --git a/include/llvm/CodeGen/SlotIndexes.h b/include/llvm/CodeGen/SlotIndexes.h index bd3a9062fb90..7b621bee259f 100644 --- a/include/llvm/CodeGen/SlotIndexes.h +++ b/include/llvm/CodeGen/SlotIndexes.h @@ -333,6 +333,8 @@ namespace llvm { /// This pass assigns indexes to each instruction. class SlotIndexes : public MachineFunctionPass { private: + // IndexListEntry allocator. + BumpPtrAllocator ileAllocator; typedef ilist IndexList; IndexList indexList; @@ -353,9 +355,6 @@ namespace llvm { /// and MBB id. SmallVector idx2MBBMap; - // IndexListEntry allocator. - BumpPtrAllocator ileAllocator; - IndexListEntry* createEntry(MachineInstr *mi, unsigned index) { IndexListEntry *entry = static_cast( @@ -377,6 +376,11 @@ namespace llvm { initializeSlotIndexesPass(*PassRegistry::getPassRegistry()); } + ~SlotIndexes() { + // The indexList's nodes are all allocated in the BumpPtrAllocator. 
+ indexList.clearAndLeakNodesUnsafely(); + } + void getAnalysisUsage(AnalysisUsage &au) const override; void releaseMemory() override; diff --git a/include/llvm/DebugInfo/DWARF/DWARFFormValue.h b/include/llvm/DebugInfo/DWARF/DWARFFormValue.h index 7ddcc0d81d59..3c32a3e5b794 100644 --- a/include/llvm/DebugInfo/DWARF/DWARFFormValue.h +++ b/include/llvm/DebugInfo/DWARF/DWARFFormValue.h @@ -84,6 +84,9 @@ class DWARFFormValue { const DWARFUnit *u) const; static bool skipValue(uint16_t form, DataExtractor debug_info_data, uint32_t *offset_ptr, const DWARFUnit *u); + static bool skipValue(uint16_t form, DataExtractor debug_info_data, + uint32_t *offset_ptr, uint16_t Version, + uint8_t AddrSize); static ArrayRef getFixedFormSizes(uint8_t AddrSize, uint16_t Version); diff --git a/include/llvm/ExecutionEngine/Interpreter.h b/include/llvm/ExecutionEngine/Interpreter.h index f49d0c487fe9..a14707840ad8 100644 --- a/include/llvm/ExecutionEngine/Interpreter.h +++ b/include/llvm/ExecutionEngine/Interpreter.h @@ -16,22 +16,12 @@ #define LLVM_EXECUTIONENGINE_INTERPRETER_H #include "llvm/ExecutionEngine/ExecutionEngine.h" -#include extern "C" void LLVMLinkInInterpreter(); namespace { struct ForceInterpreterLinking { - ForceInterpreterLinking() { - // We must reference the interpreter in such a way that compilers will not - // delete it all as dead code, even with whole program optimization, - // yet is effectively a NO-OP. As the compiler isn't smart enough - // to know that getenv() never returns -1, this will do the job. - if (std::getenv("bar") != (char*) -1) - return; - - LLVMLinkInInterpreter(); - } + ForceInterpreterLinking() { LLVMLinkInInterpreter(); } } ForceInterpreterLinking; } diff --git a/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h b/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h index 242d2420162f..7dab5d1bc67f 100644 --- a/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h +++ b/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h @@ -38,8 +38,8 @@ namespace orc { /// of the function body from the original module. The extracted body is then /// compiled and executed. template + typename CompileCallbackMgrT = JITCompileCallbackManager, + typename IndirectStubsMgrT = IndirectStubsManager> class CompileOnDemandLayer { private: diff --git a/include/llvm/ExecutionEngine/Orc/IndirectionUtils.h b/include/llvm/ExecutionEngine/Orc/IndirectionUtils.h index cabc95543d81..e490d894390b 100644 --- a/include/llvm/ExecutionEngine/Orc/IndirectionUtils.h +++ b/include/llvm/ExecutionEngine/Orc/IndirectionUtils.h @@ -27,8 +27,8 @@ namespace llvm { namespace orc { -/// @brief Target-independent base class JITCompileCallbackManager. -class JITCompileCallbackManagerBase { +/// @brief Target-independent base class for compile callback management. +class JITCompileCallbackManager { public: typedef std::function CompileFtor; @@ -50,13 +50,13 @@ class JITCompileCallbackManagerBase { CompileFtor &Compile; }; - /// @brief Construct a JITCompileCallbackManagerBase. + /// @brief Construct a JITCompileCallbackManager. /// @param ErrorHandlerAddress The address of an error handler in the target /// process to be used if a compile callback fails. - JITCompileCallbackManagerBase(TargetAddress ErrorHandlerAddress) + JITCompileCallbackManager(TargetAddress ErrorHandlerAddress) : ErrorHandlerAddress(ErrorHandlerAddress) {} - virtual ~JITCompileCallbackManagerBase() {} + virtual ~JITCompileCallbackManager() {} /// @brief Execute the callback for the given trampoline id. 
Called by the JIT /// to compile functions on demand. @@ -84,7 +84,11 @@ class JITCompileCallbackManagerBase { } /// @brief Reserve a compile callback. - virtual CompileCallbackInfo getCompileCallback() = 0; + CompileCallbackInfo getCompileCallback() { + TargetAddress TrampolineAddr = getAvailableTrampolineAddr(); + auto &Compile = this->ActiveTrampolines[TrampolineAddr]; + return CompileCallbackInfo(TrampolineAddr, Compile); + } /// @brief Get a CompileCallbackInfo for an existing callback. CompileCallbackInfo getCompileCallbackInfo(TargetAddress TrampolineAddr) { @@ -113,19 +117,33 @@ class JITCompileCallbackManagerBase { std::vector AvailableTrampolines; private: + + TargetAddress getAvailableTrampolineAddr() { + if (this->AvailableTrampolines.empty()) + grow(); + assert(!this->AvailableTrampolines.empty() && + "Failed to grow available trampolines."); + TargetAddress TrampolineAddr = this->AvailableTrampolines.back(); + this->AvailableTrampolines.pop_back(); + return TrampolineAddr; + } + + // Create new trampolines - to be implemented in subclasses. + virtual void grow() = 0; + virtual void anchor(); }; -/// @brief Manage compile callbacks. +/// @brief Manage compile callbacks for in-process JITs. template -class JITCompileCallbackManager : public JITCompileCallbackManagerBase { +class LocalJITCompileCallbackManager : public JITCompileCallbackManager { public: - /// @brief Construct a JITCompileCallbackManager. + /// @brief Construct a InProcessJITCompileCallbackManager. /// @param ErrorHandlerAddress The address of an error handler in the target /// process to be used if a compile callback fails. - JITCompileCallbackManager(TargetAddress ErrorHandlerAddress) - : JITCompileCallbackManagerBase(ErrorHandlerAddress) { + LocalJITCompileCallbackManager(TargetAddress ErrorHandlerAddress) + : JITCompileCallbackManager(ErrorHandlerAddress) { /// Set up the resolver block. std::error_code EC; @@ -145,13 +163,6 @@ class JITCompileCallbackManager : public JITCompileCallbackManagerBase { assert(!EC && "Failed to mprotect resolver block"); } - /// @brief Get/create a compile callback with the given signature. - CompileCallbackInfo getCompileCallback() final { - TargetAddress TrampolineAddr = getAvailableTrampolineAddr(); - auto &Compile = this->ActiveTrampolines[TrampolineAddr]; - return CompileCallbackInfo(TrampolineAddr, Compile); - } - private: static TargetAddress reenter(void *CCMgr, void *TrampolineId) { @@ -162,17 +173,7 @@ class JITCompileCallbackManager : public JITCompileCallbackManagerBase { reinterpret_cast(TrampolineId))); } - TargetAddress getAvailableTrampolineAddr() { - if (this->AvailableTrampolines.empty()) - grow(); - assert(!this->AvailableTrampolines.empty() && - "Failed to grow available trampolines."); - TargetAddress TrampolineAddr = this->AvailableTrampolines.back(); - this->AvailableTrampolines.pop_back(); - return TrampolineAddr; - } - - void grow() { + void grow() override { assert(this->AvailableTrampolines.empty() && "Growing prematurely?"); std::error_code EC; @@ -201,7 +202,7 @@ class JITCompileCallbackManager : public JITCompileCallbackManagerBase { sys::Memory::MF_READ | sys::Memory::MF_EXEC); assert(!EC && "Failed to mprotect trampoline block"); - + TrampolineBlocks.push_back(std::move(TrampolineBlock)); } @@ -210,13 +211,13 @@ class JITCompileCallbackManager : public JITCompileCallbackManagerBase { }; /// @brief Base class for managing collections of named indirect stubs. 
-class IndirectStubsManagerBase { +class IndirectStubsManager { public: /// @brief Map type for initializing the manager. See init. typedef StringMap> StubInitsMap; - virtual ~IndirectStubsManagerBase() {} + virtual ~IndirectStubsManager() {} /// @brief Create a single stub with the given name, target address and flags. virtual std::error_code createStub(StringRef StubName, TargetAddress StubAddr, @@ -243,7 +244,7 @@ class IndirectStubsManagerBase { /// @brief IndirectStubsManager implementation for a concrete target, e.g. /// OrcX86_64. (See OrcTargetSupport.h). template -class IndirectStubsManager : public IndirectStubsManagerBase { +class LocalIndirectStubsManager : public IndirectStubsManager { public: std::error_code createStub(StringRef StubName, TargetAddress StubAddr, @@ -403,7 +404,7 @@ void moveGlobalVariableInitializer(GlobalVariable &OrigGV, ValueMaterializer *Materializer = nullptr, GlobalVariable *NewGV = nullptr); -/// @brief Clone +/// @brief Clone GlobalAlias* cloneGlobalAliasDecl(Module &Dst, const GlobalAlias &OrigA, ValueToValueMapTy &VMap); diff --git a/include/llvm/ExecutionEngine/Orc/ObjectTransformLayer.h b/include/llvm/ExecutionEngine/Orc/ObjectTransformLayer.h index 7af662085474..f96e83ed5a1a 100644 --- a/include/llvm/ExecutionEngine/Orc/ObjectTransformLayer.h +++ b/include/llvm/ExecutionEngine/Orc/ObjectTransformLayer.h @@ -87,14 +87,6 @@ class ObjectTransformLayer { BaseLayer.mapSectionAddress(H, LocalAddress, TargetAddr); } - // Ownership hack. - // FIXME: Remove this as soon as RuntimeDyldELF can apply relocations without - // referencing the original object. - template - void takeOwnershipOfBuffers(ObjSetHandleT H, OwningMBSet MBs) { - BaseLayer.takeOwnershipOfBuffers(H, std::move(MBs)); - } - /// @brief Access the transform functor directly. TransformFtor &getTransform() { return Transform; } diff --git a/include/llvm/IR/CallSite.h b/include/llvm/IR/CallSite.h index c87f1293330b..7d9902f944ed 100644 --- a/include/llvm/IR/CallSite.h +++ b/include/llvm/IR/CallSite.h @@ -148,17 +148,8 @@ class CallSiteBase { /// arguments at this call site. typedef IterTy arg_iterator; - /// arg_begin/arg_end - Return iterators corresponding to the actual argument - /// list for a call site. - IterTy arg_begin() const { - assert(getInstruction() && "Not a call or invoke instruction!"); - // Skip non-arguments - return (*this)->op_begin(); - } - - IterTy arg_end() const { return (*this)->op_end() - getArgumentEndOffset(); } iterator_range args() const { - return iterator_range(arg_begin(), arg_end()); + return make_range(arg_begin(), arg_end()); } bool arg_empty() const { return arg_end() == arg_begin(); } unsigned arg_size() const { return unsigned(arg_end() - arg_begin()); } @@ -182,7 +173,7 @@ class CallSiteBase { return (*this)->op_end() - (isCall() ? 
1 : 3); } iterator_range data_ops() const { - return iterator_range(data_operands_begin(), data_operands_end()); + return make_range(data_operands_begin(), data_operands_end()); } bool data_operands_empty() const { return data_operands_end() == data_operands_begin(); @@ -387,6 +378,14 @@ class CallSiteBase { CALLSITE_DELEGATE_GETTER(getOperandBundle(ID)); } + IterTy arg_begin() const { + CALLSITE_DELEGATE_GETTER(arg_begin()); + } + + IterTy arg_end() const { + CALLSITE_DELEGATE_GETTER(arg_end()); + } + #undef CALLSITE_DELEGATE_GETTER #undef CALLSITE_DELEGATE_SETTER @@ -460,18 +459,6 @@ class CallSiteBase { } private: - unsigned getArgumentEndOffset() const { - if (isCall()) { - // Skip [ operand bundles ], Callee - auto *CI = cast(getInstruction()); - return 1 + CI->getNumTotalBundleOperands(); - } else { - // Skip [ operand bundles ], BB, BB, Callee - auto *II = cast(getInstruction()); - return 3 + II->getNumTotalBundleOperands(); - } - } - IterTy getCallee() const { if (isCall()) // Skip Callee return cast(getInstruction())->op_end() - 1; diff --git a/include/llvm/IR/CallingConv.h b/include/llvm/IR/CallingConv.h index a02a0d5c689d..c57342655017 100644 --- a/include/llvm/IR/CallingConv.h +++ b/include/llvm/IR/CallingConv.h @@ -72,11 +72,14 @@ namespace CallingConv { // Swift - Calling convention for Swift. Swift = 16, + // CXX_FAST_TLS - Calling convention for access functions. + CXX_FAST_TLS = 17, + // CLR Virtual Dispatch Stub - Calling convention used for CLR virtual dispatch stub calls - CLR_VirtualDispatchStub = 17, + CLR_VirtualDispatchStub = 18, // CLR Secret Parameter - Calling convention used for CLR calls that accept a secret parameter - CLR_SecretParameter = 18, + CLR_SecretParameter = 19, // Target - This is the start of the target-specific calling conventions, // e.g. fastcall and thiscall on X86. 
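The CallSite changes above route arg_begin()/arg_end() through the underlying call or invoke instruction, whose operand list ends with the operand-bundle inputs and the callee (plus the two destination blocks for invokes), so args() covers only the actual arguments. A small usage sketch, assuming an existing CallSite; countPointerArgs is an illustrative helper, not part of the patch:

    #include "llvm/IR/CallSite.h"
    using namespace llvm;

    // Count the pointer-typed actual arguments of a call or invoke; args()
    // stops before any operand-bundle inputs, the callee, and (for invokes)
    // the two destination blocks.
    static unsigned countPointerArgs(CallSite CS) {
      unsigned N = 0;
      for (const Value *V : CS.args())
        if (V->getType()->isPointerTy())
          ++N;
      return N;
    }
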
diff --git a/include/llvm/IR/DebugInfo.h b/include/llvm/IR/DebugInfo.h index 59cabd326d28..4caceacbb58e 100644 --- a/include/llvm/IR/DebugInfo.h +++ b/include/llvm/IR/DebugInfo.h @@ -105,23 +105,23 @@ class DebugInfoFinder { typedef SmallVectorImpl::const_iterator scope_iterator; iterator_range compile_units() const { - return iterator_range(CUs.begin(), CUs.end()); + return make_range(CUs.begin(), CUs.end()); } iterator_range subprograms() const { - return iterator_range(SPs.begin(), SPs.end()); + return make_range(SPs.begin(), SPs.end()); } iterator_range global_variables() const { - return iterator_range(GVs.begin(), GVs.end()); + return make_range(GVs.begin(), GVs.end()); } iterator_range types() const { - return iterator_range(TYs.begin(), TYs.end()); + return make_range(TYs.begin(), TYs.end()); } iterator_range scopes() const { - return iterator_range(Scopes.begin(), Scopes.end()); + return make_range(Scopes.begin(), Scopes.end()); } unsigned compile_unit_count() const { return CUs.size(); } diff --git a/include/llvm/IR/DebugInfoMetadata.h b/include/llvm/IR/DebugInfoMetadata.h index 68a497745ae7..456313a70e83 100644 --- a/include/llvm/IR/DebugInfoMetadata.h +++ b/include/llvm/IR/DebugInfoMetadata.h @@ -949,15 +949,16 @@ class DICompileUnit : public DIScope { unsigned EmissionKind, DICompositeTypeArray EnumTypes, DITypeArray RetainedTypes, DISubprogramArray Subprograms, DIGlobalVariableArray GlobalVariables, - DIImportedEntityArray ImportedEntities, uint64_t DWOId, - StorageType Storage, bool ShouldCreate = true) { + DIImportedEntityArray ImportedEntities, DIMacroNodeArray Macros, + uint64_t DWOId, StorageType Storage, bool ShouldCreate = true) { return getImpl(Context, SourceLanguage, File, getCanonicalMDString(Context, Producer), IsOptimized, getCanonicalMDString(Context, Flags), RuntimeVersion, getCanonicalMDString(Context, SplitDebugFilename), EmissionKind, EnumTypes.get(), RetainedTypes.get(), Subprograms.get(), GlobalVariables.get(), - ImportedEntities.get(), DWOId, Storage, ShouldCreate); + ImportedEntities.get(), Macros.get(), DWOId, Storage, + ShouldCreate); } static DICompileUnit * getImpl(LLVMContext &Context, unsigned SourceLanguage, Metadata *File, @@ -965,15 +966,15 @@ class DICompileUnit : public DIScope { unsigned RuntimeVersion, MDString *SplitDebugFilename, unsigned EmissionKind, Metadata *EnumTypes, Metadata *RetainedTypes, Metadata *Subprograms, Metadata *GlobalVariables, - Metadata *ImportedEntities, uint64_t DWOId, StorageType Storage, - bool ShouldCreate = true); + Metadata *ImportedEntities, Metadata *Macros, uint64_t DWOId, + StorageType Storage, bool ShouldCreate = true); TempDICompileUnit cloneImpl() const { return getTemporary( getContext(), getSourceLanguage(), getFile(), getProducer(), isOptimized(), getFlags(), getRuntimeVersion(), getSplitDebugFilename(), getEmissionKind(), getEnumTypes(), getRetainedTypes(), getSubprograms(), - getGlobalVariables(), getImportedEntities(), DWOId); + getGlobalVariables(), getImportedEntities(), getMacros(), DWOId); } static void get() = delete; @@ -987,20 +988,22 @@ class DICompileUnit : public DIScope { StringRef SplitDebugFilename, unsigned EmissionKind, DICompositeTypeArray EnumTypes, DITypeArray RetainedTypes, DISubprogramArray Subprograms, DIGlobalVariableArray GlobalVariables, - DIImportedEntityArray ImportedEntities, uint64_t DWOId), + DIImportedEntityArray ImportedEntities, DIMacroNodeArray Macros, + uint64_t DWOId), (SourceLanguage, File, Producer, IsOptimized, Flags, RuntimeVersion, SplitDebugFilename, 
EmissionKind, EnumTypes, RetainedTypes, Subprograms, - GlobalVariables, ImportedEntities, DWOId)) + GlobalVariables, ImportedEntities, Macros, DWOId)) DEFINE_MDNODE_GET_DISTINCT_TEMPORARY( DICompileUnit, (unsigned SourceLanguage, Metadata *File, MDString *Producer, bool IsOptimized, MDString *Flags, unsigned RuntimeVersion, MDString *SplitDebugFilename, unsigned EmissionKind, Metadata *EnumTypes, Metadata *RetainedTypes, Metadata *Subprograms, - Metadata *GlobalVariables, Metadata *ImportedEntities, uint64_t DWOId), + Metadata *GlobalVariables, Metadata *ImportedEntities, Metadata *Macros, + uint64_t DWOId), (SourceLanguage, File, Producer, IsOptimized, Flags, RuntimeVersion, SplitDebugFilename, EmissionKind, EnumTypes, RetainedTypes, Subprograms, - GlobalVariables, ImportedEntities, DWOId)) + GlobalVariables, ImportedEntities, Macros, DWOId)) TempDICompileUnit clone() const { return cloneImpl(); } @@ -1026,6 +1029,9 @@ class DICompileUnit : public DIScope { DIImportedEntityArray getImportedEntities() const { return cast_or_null(getRawImportedEntities()); } + DIMacroNodeArray getMacros() const { + return cast_or_null(getRawMacros()); + } uint64_t getDWOId() const { return DWOId; } void setDWOId(uint64_t DwoId) { DWOId = DwoId; } @@ -1039,6 +1045,7 @@ class DICompileUnit : public DIScope { Metadata *getRawSubprograms() const { return getOperand(6); } Metadata *getRawGlobalVariables() const { return getOperand(7); } Metadata *getRawImportedEntities() const { return getOperand(8); } + Metadata *getRawMacros() const { return getOperand(9); } /// \brief Replace arrays. /// @@ -1061,6 +1068,7 @@ class DICompileUnit : public DIScope { void replaceImportedEntities(DIImportedEntityArray N) { replaceOperandWith(8, N.get()); } + void replaceMacros(DIMacroNodeArray N) { replaceOperandWith(9, N.get()); } /// @} static bool classof(const Metadata *MD) { @@ -1118,8 +1126,10 @@ class DILocation : public MDNode { } TempDILocation cloneImpl() const { - return getTemporary(getContext(), getLine(), getColumn(), getScope(), - getInlinedAt()); + // Get the raw scope/inlinedAt since it is possible to invoke this on + // a DILocation containing temporary metadata. + return getTemporary(getContext(), getLine(), getColumn(), getRawScope(), + getRawInlinedAt()); } // Disallow replacing operands. @@ -2197,6 +2207,165 @@ class DIImportedEntity : public DINode { } }; +/// \brief Macro Info DWARF-like metadata node. +/// +/// A metadata node with a DWARF macro info (i.e., a constant named +/// \c DW_MACINFO_*, defined in llvm/Support/Dwarf.h). Called \a DIMacroNode +/// because it's potentially used for non-DWARF output. 
+class DIMacroNode : public MDNode { + friend class LLVMContextImpl; + friend class MDNode; + +protected: + DIMacroNode(LLVMContext &C, unsigned ID, StorageType Storage, unsigned MIType, + ArrayRef Ops1, ArrayRef Ops2 = None) + : MDNode(C, ID, Storage, Ops1, Ops2) { + assert(MIType < 1u << 16); + SubclassData16 = MIType; + } + ~DIMacroNode() = default; + + template Ty *getOperandAs(unsigned I) const { + return cast_or_null(getOperand(I)); + } + + StringRef getStringOperand(unsigned I) const { + if (auto *S = getOperandAs(I)) + return S->getString(); + return StringRef(); + } + + static MDString *getCanonicalMDString(LLVMContext &Context, StringRef S) { + if (S.empty()) + return nullptr; + return MDString::get(Context, S); + } + +public: + unsigned getMacinfoType() const { return SubclassData16; } + + static bool classof(const Metadata *MD) { + switch (MD->getMetadataID()) { + default: + return false; + case DIMacroKind: + case DIMacroFileKind: + return true; + } + } +}; + +class DIMacro : public DIMacroNode { + friend class LLVMContextImpl; + friend class MDNode; + + unsigned Line; + + DIMacro(LLVMContext &C, StorageType Storage, unsigned MIType, unsigned Line, + ArrayRef Ops) + : DIMacroNode(C, DIMacroKind, Storage, MIType, Ops), Line(Line) {} + ~DIMacro() = default; + + static DIMacro *getImpl(LLVMContext &Context, unsigned MIType, unsigned Line, + StringRef Name, StringRef Value, StorageType Storage, + bool ShouldCreate = true) { + return getImpl(Context, MIType, Line, getCanonicalMDString(Context, Name), + getCanonicalMDString(Context, Value), Storage, ShouldCreate); + } + static DIMacro *getImpl(LLVMContext &Context, unsigned MIType, unsigned Line, + MDString *Name, MDString *Value, StorageType Storage, + bool ShouldCreate = true); + + TempDIMacro cloneImpl() const { + return getTemporary(getContext(), getMacinfoType(), getLine(), getName(), + getValue()); + } + +public: + DEFINE_MDNODE_GET(DIMacro, (unsigned MIType, unsigned Line, StringRef Name, + StringRef Value = ""), + (MIType, Line, Name, Value)) + DEFINE_MDNODE_GET(DIMacro, (unsigned MIType, unsigned Line, MDString *Name, + MDString *Value), + (MIType, Line, Name, Value)) + + TempDIMacro clone() const { return cloneImpl(); } + + unsigned getLine() const { return Line; } + + StringRef getName() const { return getStringOperand(0); } + StringRef getValue() const { return getStringOperand(1); } + + MDString *getRawName() const { return getOperandAs(0); } + MDString *getRawValue() const { return getOperandAs(1); } + + static bool classof(const Metadata *MD) { + return MD->getMetadataID() == DIMacroKind; + } +}; + +class DIMacroFile : public DIMacroNode { + friend class LLVMContextImpl; + friend class MDNode; + + unsigned Line; + + DIMacroFile(LLVMContext &C, StorageType Storage, unsigned MIType, + unsigned Line, ArrayRef Ops) + : DIMacroNode(C, DIMacroFileKind, Storage, MIType, Ops), Line(Line) {} + ~DIMacroFile() = default; + + static DIMacroFile *getImpl(LLVMContext &Context, unsigned MIType, + unsigned Line, DIFile *File, + DIMacroNodeArray Elements, StorageType Storage, + bool ShouldCreate = true) { + return getImpl(Context, MIType, Line, static_cast(File), + Elements.get(), Storage, ShouldCreate); + } + + static DIMacroFile *getImpl(LLVMContext &Context, unsigned MIType, + unsigned Line, Metadata *File, Metadata *Elements, + StorageType Storage, bool ShouldCreate = true); + + TempDIMacroFile cloneImpl() const { + return getTemporary(getContext(), getMacinfoType(), getLine(), getFile(), + getElements()); + } + +public: + 
DEFINE_MDNODE_GET(DIMacroFile, (unsigned MIType, unsigned Line, DIFile *File, + DIMacroNodeArray Elements), + (MIType, Line, File, Elements)) + DEFINE_MDNODE_GET(DIMacroFile, (unsigned MIType, unsigned Line, + Metadata *File, Metadata *Elements), + (MIType, Line, File, Elements)) + + TempDIMacroFile clone() const { return cloneImpl(); } + + void replaceElements(DIMacroNodeArray Elements) { +#ifndef NDEBUG + for (DIMacroNode *Op : getElements()) + assert(std::find(Elements->op_begin(), Elements->op_end(), Op) && + "Lost a macro node during macro node list replacement"); +#endif + replaceOperandWith(1, Elements.get()); + } + + unsigned getLine() const { return Line; } + DIFile *getFile() const { return cast_or_null(getRawFile()); } + + DIMacroNodeArray getElements() const { + return cast_or_null(getRawElements()); + } + + Metadata *getRawFile() const { return getOperand(0); } + Metadata *getRawElements() const { return getOperand(1); } + + static bool classof(const Metadata *MD) { + return MD->getMetadataID() == DIMacroFileKind; + } +}; + } // end namespace llvm #undef DEFINE_MDNODE_GET_UNPACK_IMPL diff --git a/include/llvm/IR/DiagnosticInfo.h b/include/llvm/IR/DiagnosticInfo.h index aac7a4509f38..e14387677265 100644 --- a/include/llvm/IR/DiagnosticInfo.h +++ b/include/llvm/IR/DiagnosticInfo.h @@ -60,6 +60,7 @@ enum DiagnosticKind { DK_OptimizationRemarkAnalysisAliasing, DK_OptimizationFailure, DK_MIRParser, + DK_PGOProfile, DK_FirstPluginKind }; @@ -250,6 +251,31 @@ class DiagnosticInfoSampleProfile : public DiagnosticInfo { const Twine &Msg; }; +/// Diagnostic information for the PGO profiler. +class DiagnosticInfoPGOProfile : public DiagnosticInfo { +public: + DiagnosticInfoPGOProfile(const char *FileName, const Twine &Msg, + DiagnosticSeverity Severity = DS_Error) + : DiagnosticInfo(DK_PGOProfile, Severity), FileName(FileName), Msg(Msg) {} + + /// \see DiagnosticInfo::print. + void print(DiagnosticPrinter &DP) const override; + + static bool classof(const DiagnosticInfo *DI) { + return DI->getKind() == DK_PGOProfile; + } + + const char *getFileName() const { return FileName; } + const Twine &getMsg() const { return Msg; } + +private: + /// Name of the input file associated with this diagnostic. + const char *FileName; + + /// Message to report. + const Twine &Msg; +}; + /// Common features for diagnostics dealing with optimization remarks. class DiagnosticInfoOptimizationBase : public DiagnosticInfo { public: diff --git a/include/llvm/IR/Function.h b/include/llvm/IR/Function.h index 71822a462daa..a55ff5cb6e2b 100644 --- a/include/llvm/IR/Function.h +++ b/include/llvm/IR/Function.h @@ -493,11 +493,11 @@ class Function : public GlobalObject, public ilist_node { } iterator_range args() { - return iterator_range(arg_begin(), arg_end()); + return make_range(arg_begin(), arg_end()); } iterator_range args() const { - return iterator_range(arg_begin(), arg_end()); + return make_range(arg_begin(), arg_end()); } /// @} diff --git a/include/llvm/IR/GlobalVariable.h b/include/llvm/IR/GlobalVariable.h index 62d3e43f9d60..342bdc01bfbd 100644 --- a/include/llvm/IR/GlobalVariable.h +++ b/include/llvm/IR/GlobalVariable.h @@ -105,18 +105,13 @@ class GlobalVariable : public GlobalObject, public ilist_node { /// hasUniqueInitializer - Whether the global variable has an initializer, and /// any changes made to the initializer will turn up in the final executable. 
inline bool hasUniqueInitializer() const { - return hasInitializer() && - // It's not safe to modify initializers of global variables with weak - // linkage, because the linker might choose to discard the initializer and - // use the initializer from another instance of the global variable - // instead. It is wrong to modify the initializer of a global variable - // with *_odr linkage because then different instances of the global may - // have different initializers, breaking the One Definition Rule. - !isWeakForLinker() && - // It is not safe to modify initializers of global variables with the - // external_initializer marker since the value may be changed at runtime - // before C++ initializers are evaluated. - !isExternallyInitialized(); + return + // We need to be sure this is the definition that will actually be used + isStrongDefinitionForLinker() && + // It is not safe to modify initializers of global variables with the + // external_initializer marker since the value may be changed at runtime + // before C++ initializers are evaluated. + !isExternallyInitialized(); } /// getInitializer - Return the initializer for this global variable. It is diff --git a/include/llvm/IR/InstrTypes.h b/include/llvm/IR/InstrTypes.h index 81de6999cdb1..2a0927266656 100644 --- a/include/llvm/IR/InstrTypes.h +++ b/include/llvm/IR/InstrTypes.h @@ -1121,14 +1121,12 @@ struct OperandBundleUse { explicit OperandBundleUse(StringMapEntry *Tag, ArrayRef Inputs) : Inputs(Inputs), Tag(Tag) {} - /// \brief Return true if all the operands in this operand bundle have the - /// attribute A. - /// - /// Currently there is no way to have attributes on operand bundles differ on - /// a per operand granularity. - bool operandsHaveAttr(Attribute::AttrKind A) const { + /// \brief Return true if the operand at index \p Idx in this operand bundle + /// has the attribute A. + bool operandHasAttr(unsigned Idx, Attribute::AttrKind A) const { if (isDeoptOperandBundle()) - return A == Attribute::ReadOnly || A == Attribute::NoCapture; + if (A == Attribute::ReadOnly || A == Attribute::NoCapture) + return Inputs[Idx]->getType()->isPointerTy(); // Conservative answer: no operands have any attributes. return false; @@ -1169,9 +1167,6 @@ template class OperandBundleDefT { std::vector Inputs; public: - explicit OperandBundleDefT(StringRef Tag, std::vector Inputs) - : Tag(Tag), Inputs(std::move(Inputs)) {} - explicit OperandBundleDefT(std::string Tag, std::vector Inputs) : Tag(std::move(Tag)), Inputs(std::move(Inputs)) {} @@ -1351,11 +1346,7 @@ template class OperandBundleUser { /// It is an error to call this with an OpIdx that does not correspond to an /// bundle operand. OperandBundleUse getOperandBundleForOperand(unsigned OpIdx) const { - for (auto &BOI : bundle_op_infos()) - if (BOI.Begin <= OpIdx && OpIdx < BOI.End) - return operandBundleFromBundleOpInfo(BOI); - - llvm_unreachable("Did not find operand bundle for operand!"); + return operandBundleFromBundleOpInfo(getBundleOpInfoForOperand(OpIdx)); } /// \brief Return true if this operand bundle user has operand bundles that @@ -1382,6 +1373,14 @@ template class OperandBundleUser { return false; } + /// \brief Return true if the bundle operand at index \p OpIdx has the + /// attribute \p A. 
+ bool bundleOperandHasAttr(unsigned OpIdx, Attribute::AttrKind A) const { + auto &BOI = getBundleOpInfoForOperand(OpIdx); + auto OBU = operandBundleFromBundleOpInfo(BOI); + return OBU.operandHasAttr(OpIdx - BOI.Begin, A); + } + protected: /// \brief Is the function attribute S disallowed by some operand bundle on /// this operand bundle user? @@ -1477,14 +1476,12 @@ template class OperandBundleUser { /// \brief Return the range [\p bundle_op_info_begin, \p bundle_op_info_end). iterator_range bundle_op_infos() { - return iterator_range(bundle_op_info_begin(), - bundle_op_info_end()); + return make_range(bundle_op_info_begin(), bundle_op_info_end()); } /// \brief Return the range [\p bundle_op_info_begin, \p bundle_op_info_end). iterator_range bundle_op_infos() const { - return iterator_range(bundle_op_info_begin(), - bundle_op_info_end()); + return make_range(bundle_op_info_begin(), bundle_op_info_end()); } /// \brief Populate the BundleOpInfo instances and the Use& vector from \p @@ -1518,6 +1515,18 @@ template class OperandBundleUser { return It; } + /// \brief Return the BundleOpInfo for the operand at index OpIdx. + /// + /// It is an error to call this with an OpIdx that does not correspond to an + /// bundle operand. + const BundleOpInfo &getBundleOpInfoForOperand(unsigned OpIdx) const { + for (auto &BOI : bundle_op_infos()) + if (BOI.Begin <= OpIdx && OpIdx < BOI.End) + return BOI; + + llvm_unreachable("Did not find operand bundle for operand!"); + } + /// \brief Return the total number of values used in \p Bundles. static unsigned CountBundleInputs(ArrayRef Bundles) { unsigned Total = 0; diff --git a/include/llvm/IR/Instruction.h b/include/llvm/IR/Instruction.h index c7ba8721fe06..77ba87c6b664 100644 --- a/include/llvm/IR/Instruction.h +++ b/include/llvm/IR/Instruction.h @@ -66,6 +66,13 @@ class Instruction : public User, const Module *getModule() const; Module *getModule(); + /// \brief Return the function this instruction belongs to. + /// + /// Note: it is undefined behavior to call this on an instruction not + /// currently inserted into a function. + const Function *getFunction() const; + Function *getFunction(); + /// removeFromParent - This method unlinks 'this' from the containing basic /// block, but does not delete it. /// diff --git a/include/llvm/IR/Instructions.h b/include/llvm/IR/Instructions.h index 5119749ba73c..84ab72138f01 100644 --- a/include/llvm/IR/Instructions.h +++ b/include/llvm/IR/Instructions.h @@ -1543,18 +1543,32 @@ class CallInst : public Instruction, setOperand(i, v); } - /// arg_operands - iteration adapter for range-for loops. + /// \brief Return the iterator pointing to the beginning of the argument list. + op_iterator arg_begin() { return op_begin(); } + + /// \brief Return the iterator pointing to the end of the argument list. + op_iterator arg_end() { + // [ call args ], [ operand bundles ], callee + return op_end() - getNumTotalBundleOperands() - 1; + }; + + /// \brief Iteration adapter for range-for loops. iterator_range arg_operands() { - // The last operand in the op list is the callee - it's not one of the args - // so we don't want to iterate over it. - return iterator_range( - op_begin(), op_end() - getNumTotalBundleOperands() - 1); + return make_range(arg_begin(), arg_end()); } - /// arg_operands - iteration adapter for range-for loops. + /// \brief Return the iterator pointing to the beginning of the argument list. 
+ const_op_iterator arg_begin() const { return op_begin(); } + + /// \brief Return the iterator pointing to the end of the argument list. + const_op_iterator arg_end() const { + // [ call args ], [ operand bundles ], callee + return op_end() - getNumTotalBundleOperands() - 1; + }; + + /// \brief Iteration adapter for range-for loops. iterator_range arg_operands() const { - return iterator_range( - op_begin(), op_end() - getNumTotalBundleOperands() - 1); + return make_range(arg_begin(), arg_end()); } /// \brief Wrappers for getting the \c Use of a call argument. @@ -2213,7 +2227,7 @@ class ExtractValueInst : public UnaryInstruction { inline idx_iterator idx_begin() const { return Indices.begin(); } inline idx_iterator idx_end() const { return Indices.end(); } inline iterator_range indices() const { - return iterator_range(idx_begin(), idx_end()); + return make_range(idx_begin(), idx_end()); } Value *getAggregateOperand() { @@ -2330,7 +2344,7 @@ class InsertValueInst : public Instruction { inline idx_iterator idx_begin() const { return Indices.begin(); } inline idx_iterator idx_end() const { return Indices.end(); } inline iterator_range indices() const { - return iterator_range(idx_begin(), idx_end()); + return make_range(idx_begin(), idx_end()); } Value *getAggregateOperand() { @@ -3105,12 +3119,12 @@ class SwitchInst : public TerminatorInst { /// cases - iteration adapter for range-for loops. iterator_range cases() { - return iterator_range(case_begin(), case_end()); + return make_range(case_begin(), case_end()); } /// cases - iteration adapter for range-for loops. iterator_range cases() const { - return iterator_range(case_begin(), case_end()); + return make_range(case_begin(), case_end()); } /// Returns an iterator that points to the default case. @@ -3452,16 +3466,32 @@ class InvokeInst : public TerminatorInst, setOperand(i, v); } - /// arg_operands - iteration adapter for range-for loops. + /// \brief Return the iterator pointing to the beginning of the argument list. + op_iterator arg_begin() { return op_begin(); } + + /// \brief Return the iterator pointing to the end of the argument list. + op_iterator arg_end() { + // [ invoke args ], [ operand bundles ], normal dest, unwind dest, callee + return op_end() - getNumTotalBundleOperands() - 3; + }; + + /// \brief Iteration adapter for range-for loops. iterator_range arg_operands() { - return iterator_range( - op_begin(), op_end() - getNumTotalBundleOperands() - 3); + return make_range(arg_begin(), arg_end()); } - /// arg_operands - iteration adapter for range-for loops. + /// \brief Return the iterator pointing to the beginning of the argument list. + const_op_iterator arg_begin() const { return op_begin(); } + + /// \brief Return the iterator pointing to the end of the argument list. + const_op_iterator arg_end() const { + // [ invoke args ], [ operand bundles ], normal dest, unwind dest, callee + return op_end() - getNumTotalBundleOperands() - 3; + }; + + /// \brief Iteration adapter for range-for loops. iterator_range arg_operands() const { - return iterator_range( - op_begin(), op_end() - getNumTotalBundleOperands() - 3); + return make_range(arg_begin(), arg_end()); } /// \brief Wrappers for getting the \c Use of a invoke argument. @@ -3915,12 +3945,12 @@ class CatchPadInst : public TerminatorInst { /// arg_operands - iteration adapter for range-for loops. 
iterator_range arg_operands() { - return iterator_range(op_begin(), op_end() - 2); + return make_range(op_begin(), op_end() - 2); } /// arg_operands - iteration adapter for range-for loops. iterator_range arg_operands() const { - return iterator_range(op_begin(), op_end() - 2); + return make_range(op_begin(), op_end() - 2); } /// \brief Wrappers for getting the \c Use of a catchpad argument. @@ -4039,12 +4069,12 @@ class TerminatePadInst : public TerminatorInst { /// arg_operands - iteration adapter for range-for loops. iterator_range arg_operands() { - return iterator_range(op_begin(), arg_end()); + return make_range(op_begin(), arg_end()); } /// arg_operands - iteration adapter for range-for loops. iterator_range arg_operands() const { - return iterator_range(op_begin(), arg_end()); + return make_range(op_begin(), arg_end()); } /// \brief Wrappers for getting the \c Use of a terminatepad argument. diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td index 57ad278a68bd..370b527c6f7c 100644 --- a/include/llvm/IR/IntrinsicsX86.td +++ b/include/llvm/IR/IntrinsicsX86.td @@ -1840,6 +1840,13 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_avx512_mask_load_pd_512 : GCCBuiltin<"__builtin_ia32_loadapd512_mask">, Intrinsic<[llvm_v8f64_ty], [llvm_ptr_ty, llvm_v8f64_ty, llvm_i8_ty], [IntrReadArgMem]>; + + def int_x86_avx512_mask_move_ss : GCCBuiltin<"__builtin_ia32_movss_mask">, + Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_move_sd : GCCBuiltin<"__builtin_ia32_movsd_mask">, + Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty], + [IntrNoMem]>; } // Conditional store ops @@ -4019,6 +4026,12 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_avx512_kunpck_bw : GCCBuiltin<"__builtin_ia32_kunpckhi">, Intrinsic<[llvm_i16_ty], [llvm_i16_ty, llvm_i16_ty], [IntrNoMem]>; + def int_x86_avx512_kunpck_wd : GCCBuiltin<"__builtin_ia32_kunpcksi">, + Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], + [IntrNoMem]>; + def int_x86_avx512_kunpck_dq : GCCBuiltin<"__builtin_ia32_kunpckdi">, + Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], + [IntrNoMem]>; def int_x86_avx512_kortestz_w : GCCBuiltin<"__builtin_ia32_kortestzhi">, Intrinsic<[llvm_i32_ty], [llvm_i16_ty, llvm_i16_ty], [IntrNoMem]>; @@ -5198,64 +5211,64 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". 
Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_mask_add_ss_round : GCCBuiltin<"__builtin_ia32_addss_mask">, + def int_x86_avx512_mask_add_ss_round : GCCBuiltin<"__builtin_ia32_addss_round">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_mask_div_ss_round : GCCBuiltin<"__builtin_ia32_divss_mask">, + def int_x86_avx512_mask_div_ss_round : GCCBuiltin<"__builtin_ia32_divss_round">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_mask_mul_ss_round : GCCBuiltin<"__builtin_ia32_mulss_mask">, + def int_x86_avx512_mask_mul_ss_round : GCCBuiltin<"__builtin_ia32_mulss_round">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_mask_sub_ss_round : GCCBuiltin<"__builtin_ia32_subss_mask">, + def int_x86_avx512_mask_sub_ss_round : GCCBuiltin<"__builtin_ia32_subss_round">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_mask_max_ss_round : GCCBuiltin<"__builtin_ia32_maxss_mask">, + def int_x86_avx512_mask_max_ss_round : GCCBuiltin<"__builtin_ia32_maxss_round">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_mask_min_ss_round : GCCBuiltin<"__builtin_ia32_minss_mask">, + def int_x86_avx512_mask_min_ss_round : GCCBuiltin<"__builtin_ia32_minss_round">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_mask_add_sd_round : GCCBuiltin<"__builtin_ia32_addsd_mask">, + def int_x86_avx512_mask_add_sd_round : GCCBuiltin<"__builtin_ia32_addsd_round">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_mask_div_sd_round : GCCBuiltin<"__builtin_ia32_divsd_mask">, + def int_x86_avx512_mask_div_sd_round : GCCBuiltin<"__builtin_ia32_divsd_round">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_mask_mul_sd_round : GCCBuiltin<"__builtin_ia32_mulsd_mask">, + def int_x86_avx512_mask_mul_sd_round : GCCBuiltin<"__builtin_ia32_mulsd_round">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_mask_sub_sd_round : GCCBuiltin<"__builtin_ia32_subsd_mask">, + def int_x86_avx512_mask_sub_sd_round : GCCBuiltin<"__builtin_ia32_subsd_round">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_mask_max_sd_round : GCCBuiltin<"__builtin_ia32_maxsd_mask">, + def int_x86_avx512_mask_max_sd_round : GCCBuiltin<"__builtin_ia32_maxsd_round">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_mask_min_sd_round : GCCBuiltin<"__builtin_ia32_minsd_mask">, + def int_x86_avx512_mask_min_sd_round : GCCBuiltin<"__builtin_ia32_minsd_round">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_mask_rndscale_ss : GCCBuiltin<"__builtin_ia32_rndscaless_mask">, + def 
int_x86_avx512_mask_rndscale_ss : GCCBuiltin<"__builtin_ia32_rndscaless_round">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_mask_rndscale_sd : GCCBuiltin<"__builtin_ia32_rndscalesd_mask">, + def int_x86_avx512_mask_rndscale_sd : GCCBuiltin<"__builtin_ia32_rndscalesd_round">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_mask_range_ss : GCCBuiltin<"__builtin_ia32_rangess_mask">, + def int_x86_avx512_mask_range_ss : GCCBuiltin<"__builtin_ia32_rangess128_round">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_mask_range_sd : GCCBuiltin<"__builtin_ia32_rangesd_mask">, + def int_x86_avx512_mask_range_sd : GCCBuiltin<"__builtin_ia32_rangesd128_round">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_mask_reduce_ss : GCCBuiltin<"__builtin_ia32_reducess_mask">, + def int_x86_avx512_mask_reduce_ss : GCCBuiltin<"__builtin_ia32_reducess">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_mask_reduce_sd : GCCBuiltin<"__builtin_ia32_reducesd_mask">, + def int_x86_avx512_mask_reduce_sd : GCCBuiltin<"__builtin_ia32_reducesd">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; @@ -5284,10 +5297,10 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_mask_sqrt_ss : GCCBuiltin<"__builtin_ia32_sqrtrndss_mask">, + def int_x86_avx512_mask_sqrt_ss : GCCBuiltin<"__builtin_ia32_sqrtss_round">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_mask_sqrt_sd : GCCBuiltin<"__builtin_ia32_sqrtrndsd_mask">, + def int_x86_avx512_mask_sqrt_sd : GCCBuiltin<"__builtin_ia32_sqrtsd_round">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; @@ -5372,21 +5385,21 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". 
[IntrNoMem]>; def int_x86_avx512_mask_getmant_ss : - GCCBuiltin<"__builtin_ia32_getmantss_mask">, + GCCBuiltin<"__builtin_ia32_getmantss_round">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_i32_ty, llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; def int_x86_avx512_mask_getmant_sd : - GCCBuiltin<"__builtin_ia32_getmantsd_mask">, + GCCBuiltin<"__builtin_ia32_getmantsd_round">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_i32_ty, llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_rsqrt14_ss : GCCBuiltin<"__builtin_ia32_rsqrt14ss_mask">, + def int_x86_avx512_rsqrt14_ss : GCCBuiltin<"__builtin_ia32_rsqrt14ss">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem]>; - def int_x86_avx512_rsqrt14_sd : GCCBuiltin<"__builtin_ia32_rsqrt14sd_mask">, + def int_x86_avx512_rsqrt14_sd : GCCBuiltin<"__builtin_ia32_rsqrt14sd">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem]>; @@ -5396,10 +5409,10 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_avx512_rsqrt14_ps_512 : GCCBuiltin<"__builtin_ia32_rsqrt14ps512_mask">, Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty], [IntrNoMem]>; - def int_x86_avx512_rcp14_ss : GCCBuiltin<"__builtin_ia32_rcp14ss_mask">, + def int_x86_avx512_rcp14_ss : GCCBuiltin<"__builtin_ia32_rcp14ss">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem]>; - def int_x86_avx512_rcp14_sd : GCCBuiltin<"__builtin_ia32_rcp14sd_mask">, + def int_x86_avx512_rcp14_sd : GCCBuiltin<"__builtin_ia32_rcp14sd">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem]>; @@ -5423,11 +5436,11 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_rcp28_ss : GCCBuiltin<"__builtin_ia32_rcp28ss_mask">, + def int_x86_avx512_rcp28_ss : GCCBuiltin<"__builtin_ia32_rcp28ss_round">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_rcp28_sd : GCCBuiltin<"__builtin_ia32_rcp28sd_mask">, + def int_x86_avx512_rcp28_sd : GCCBuiltin<"__builtin_ia32_rcp28sd_round">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; @@ -5439,11 +5452,11 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". 
Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_rsqrt28_ss : GCCBuiltin<"__builtin_ia32_rsqrt28ss_mask">, + def int_x86_avx512_rsqrt28_ss : GCCBuiltin<"__builtin_ia32_rsqrt28ss_round">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_rsqrt28_sd : GCCBuiltin<"__builtin_ia32_rsqrt28sd_mask">, + def int_x86_avx512_rsqrt28_sd : GCCBuiltin<"__builtin_ia32_rsqrt28sd_round">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; diff --git a/include/llvm/IR/LegacyPassManagers.h b/include/llvm/IR/LegacyPassManagers.h index 3a0385581509..b8e33478d6a9 100644 --- a/include/llvm/IR/LegacyPassManagers.h +++ b/include/llvm/IR/LegacyPassManagers.h @@ -16,6 +16,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/FoldingSet.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Pass.h" @@ -250,7 +251,43 @@ class PMTopLevelManager { /// Map from ID to immutable passes. SmallDenseMap ImmutablePassMap; - DenseMap AnUsageMap; + + /// A wrapper around AnalysisUsage for the purpose of uniqueing. The wrapper + /// is used to avoid needing to make AnalysisUsage itself a folding set node. + struct AUFoldingSetNode : public FoldingSetNode { + AnalysisUsage AU; + AUFoldingSetNode(const AnalysisUsage &AU) : AU(AU) {} + void Profile(FoldingSetNodeID &ID) const { + Profile(ID, AU); + } + static void Profile(FoldingSetNodeID &ID, const AnalysisUsage &AU) { + // TODO: We could consider sorting the dependency arrays within the + // AnalysisUsage (since they are conceptually unordered). + ID.AddBoolean(AU.getPreservesAll()); + auto ProfileVec = [&](const SmallVectorImpl& Vec) { + ID.AddInteger(Vec.size()); + for(AnalysisID AID : Vec) + ID.AddPointer(AID); + }; + ProfileVec(AU.getRequiredSet()); + ProfileVec(AU.getRequiredTransitiveSet()); + ProfileVec(AU.getPreservedSet()); + ProfileVec(AU.getUsedSet()); + } + }; + + // Contains all of the unique combinations of AnalysisUsage. This is helpful + // when we have multiple instances of the same pass since they'll usually + // have the same analysis usage and can share storage. + FoldingSet UniqueAnalysisUsages; + + // Allocator used for allocating UAFoldingSetNodes. This handles deletion of + // all allocated nodes in one fell swoop. + SpecificBumpPtrAllocator AUFoldingSetNodeAllocator; + + // Maps from a pass to it's associated entry in UniqueAnalysisUsages. Does + // not own the storage associated with either key or value.. + DenseMap AnUsageMap; /// Collection of PassInfo objects found via analysis IDs and in this top /// level manager. This is used to memoize queries to the pass registry. 
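The AnalysisUsage uniquing added to PMTopLevelManager above follows the usual FoldingSet protocol: profile the key, probe with FindNodeOrInsertPos, and allocate a new node only on a miss. A sketch under that assumption; AUNode and uniqueAU are illustrative stand-ins for the private AUFoldingSetNode and the pass manager's lookup code:

    #include "llvm/ADT/FoldingSet.h"
    #include "llvm/Pass.h"
    #include "llvm/Support/Allocator.h"
    using namespace llvm;

    // Local stand-in for the private AUFoldingSetNode above (profile shown
    // only for the required set, for brevity).
    struct AUNode : FoldingSetNode {
      AnalysisUsage AU;
      AUNode(const AnalysisUsage &AU) : AU(AU) {}
      void Profile(FoldingSetNodeID &ID) const {
        ID.AddBoolean(AU.getPreservesAll());
        for (AnalysisID AID : AU.getRequiredSet())
          ID.AddPointer(AID);
      }
    };

    // FoldingSet lookup-or-insert: profile the key, probe, and only allocate
    // (from the bump allocator) when no equivalent node exists yet.
    static AnalysisUsage *uniqueAU(FoldingSet<AUNode> &Set,
                                   SpecificBumpPtrAllocator<AUNode> &Alloc,
                                   const AnalysisUsage &AU) {
      FoldingSetNodeID ID;
      AUNode Key(AU);
      Key.Profile(ID);
      void *InsertPos = nullptr;
      if (AUNode *Existing = Set.FindNodeOrInsertPos(ID, InsertPos))
        return &Existing->AU;          // share the existing copy
      AUNode *New = new (Alloc.Allocate()) AUNode(AU);
      Set.InsertNode(New, InsertPos);
      return &New->AU;
    }
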
diff --git a/include/llvm/IR/Metadata.def b/include/llvm/IR/Metadata.def index 9d6f929be349..b1d22178e262 100644 --- a/include/llvm/IR/Metadata.def +++ b/include/llvm/IR/Metadata.def @@ -108,6 +108,9 @@ HANDLE_SPECIALIZED_MDNODE_LEAF_UNIQUABLE(DIGlobalVariable) HANDLE_SPECIALIZED_MDNODE_LEAF_UNIQUABLE(DILocalVariable) HANDLE_SPECIALIZED_MDNODE_LEAF_UNIQUABLE(DIObjCProperty) HANDLE_SPECIALIZED_MDNODE_LEAF_UNIQUABLE(DIImportedEntity) +HANDLE_SPECIALIZED_MDNODE_BRANCH(DIMacroNode) +HANDLE_SPECIALIZED_MDNODE_LEAF_UNIQUABLE(DIMacro) +HANDLE_SPECIALIZED_MDNODE_LEAF_UNIQUABLE(DIMacroFile) #undef HANDLE_METADATA #undef HANDLE_METADATA_LEAF diff --git a/include/llvm/IR/Metadata.h b/include/llvm/IR/Metadata.h index 8805cec1471b..84c82476c4f6 100644 --- a/include/llvm/IR/Metadata.h +++ b/include/llvm/IR/Metadata.h @@ -83,7 +83,9 @@ class Metadata { DIImportedEntityKind, ConstantAsMetadataKind, LocalAsMetadataKind, - MDStringKind + MDStringKind, + DIMacroKind, + DIMacroFileKind }; protected: @@ -1210,10 +1212,10 @@ class NamedMDNode : public ilist_node { const_op_iterator op_end() const { return const_op_iterator(this, getNumOperands()); } inline iterator_range operands() { - return iterator_range(op_begin(), op_end()); + return make_range(op_begin(), op_end()); } inline iterator_range operands() const { - return iterator_range(op_begin(), op_end()); + return make_range(op_begin(), op_end()); } }; diff --git a/include/llvm/IR/Module.h b/include/llvm/IR/Module.h index 4e99c4256730..2378b6d83d87 100644 --- a/include/llvm/IR/Module.h +++ b/include/llvm/IR/Module.h @@ -15,6 +15,7 @@ #ifndef LLVM_IR_MODULE_H #define LLVM_IR_MODULE_H +#include "llvm/ADT/Optional.h" #include "llvm/ADT/iterator_range.h" #include "llvm/IR/Comdat.h" #include "llvm/IR/DataLayout.h" @@ -514,10 +515,10 @@ class Module { bool global_empty() const { return GlobalList.empty(); } iterator_range globals() { - return iterator_range(global_begin(), global_end()); + return make_range(global_begin(), global_end()); } iterator_range globals() const { - return iterator_range(global_begin(), global_end()); + return make_range(global_begin(), global_end()); } /// @} @@ -536,10 +537,10 @@ class Module { bool empty() const { return FunctionList.empty(); } iterator_range functions() { - return iterator_range(begin(), end()); + return make_range(begin(), end()); } iterator_range functions() const { - return iterator_range(begin(), end()); + return make_range(begin(), end()); } /// @} @@ -554,10 +555,10 @@ class Module { bool alias_empty() const { return AliasList.empty(); } iterator_range aliases() { - return iterator_range(alias_begin(), alias_end()); + return make_range(alias_begin(), alias_end()); } iterator_range aliases() const { - return iterator_range(alias_begin(), alias_end()); + return make_range(alias_begin(), alias_end()); } /// @} @@ -578,12 +579,10 @@ class Module { bool named_metadata_empty() const { return NamedMDList.empty(); } iterator_range named_metadata() { - return iterator_range(named_metadata_begin(), - named_metadata_end()); + return make_range(named_metadata_begin(), named_metadata_end()); } iterator_range named_metadata() const { - return iterator_range(named_metadata_begin(), - named_metadata_end()); + return make_range(named_metadata_begin(), named_metadata_end()); } /// Destroy ConstantArrays in LLVMContext if they are not used. 
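The DIMacroKind and DIMacroFileKind metadata kinds registered above back the DIMacro and DIMacroFile classes added in DebugInfoMetadata.h. A construction sketch, assuming the nodes are created directly through the get() methods declared in this patch; the macro name, value, and line numbers are placeholders:

    #include "llvm/IR/DebugInfoMetadata.h"
    #include "llvm/Support/Dwarf.h"
    using namespace llvm;

    // Build a single "#define FOO 1" record and wrap it in a macro-file node.
    static DIMacroFile *buildMacroFile(LLVMContext &Ctx, DIFile *File) {
      DIMacro *Def =
          DIMacro::get(Ctx, dwarf::DW_MACINFO_define, /*Line=*/1, "FOO", "1");
      Metadata *Elts[] = {Def};
      return DIMacroFile::get(Ctx, dwarf::DW_MACINFO_start_file, /*Line=*/0,
                              File, MDTuple::get(Ctx, Elts));
    }
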
@@ -639,6 +638,16 @@ class Module { /// \brief Set the PIC level (small or large model) void setPICLevel(PICLevel::Level PL); /// @} + + /// @name Utility functions for querying and setting PGO counts + /// @{ + + /// \brief Set maximum function count in PGO mode + void setMaximumFunctionCount(uint64_t); + + /// \brief Returns maximum function count in PGO mode + Optional getMaximumFunctionCount(); + /// @} }; /// An raw_ostream inserter for modules. diff --git a/include/llvm/IR/Statepoint.h b/include/llvm/IR/Statepoint.h index 8edc17c636f8..9694ec2c73d1 100644 --- a/include/llvm/IR/Statepoint.h +++ b/include/llvm/IR/Statepoint.h @@ -173,7 +173,7 @@ class StatepointBase { /// range adapter for call arguments iterator_range call_args() const { - return iterator_range(arg_begin(), arg_end()); + return make_range(arg_begin(), arg_end()); } /// \brief Return true if the call or the callee has the given attribute. @@ -201,8 +201,7 @@ class StatepointBase { /// range adapter for GC transition arguments iterator_range gc_transition_args() const { - return iterator_range(gc_transition_args_begin(), - gc_transition_args_end()); + return make_range(gc_transition_args_begin(), gc_transition_args_end()); } /// Number of additional arguments excluding those intended @@ -225,7 +224,7 @@ class StatepointBase { /// range adapter for vm state arguments iterator_range vm_state_args() const { - return iterator_range(vm_state_begin(), vm_state_end()); + return make_range(vm_state_begin(), vm_state_end()); } typename CallSiteTy::arg_iterator gc_args_begin() const { @@ -241,7 +240,7 @@ class StatepointBase { /// range adapter for gc arguments iterator_range gc_args() const { - return iterator_range(gc_args_begin(), gc_args_end()); + return make_range(gc_args_begin(), gc_args_end()); } /// Get list of all gc reloactes linked to this statepoint diff --git a/include/llvm/IR/User.h b/include/llvm/IR/User.h index 78a3b43c86d2..639dc5c01c8c 100644 --- a/include/llvm/IR/User.h +++ b/include/llvm/IR/User.h @@ -233,7 +233,7 @@ class User : public Value { return value_op_iterator(op_end()); } iterator_range operand_values() { - return iterator_range(value_op_begin(), value_op_end()); + return make_range(value_op_begin(), value_op_end()); } /// \brief Drop all references to operands. diff --git a/include/llvm/IR/Value.h b/include/llvm/IR/Value.h index 9a87a7178866..7f11ba3d1f6d 100644 --- a/include/llvm/IR/Value.h +++ b/include/llvm/IR/Value.h @@ -283,10 +283,10 @@ class Value { use_iterator use_end() { return use_iterator(); } const_use_iterator use_end() const { return const_use_iterator(); } iterator_range uses() { - return iterator_range(use_begin(), use_end()); + return make_range(use_begin(), use_end()); } iterator_range uses() const { - return iterator_range(use_begin(), use_end()); + return make_range(use_begin(), use_end()); } bool user_empty() const { return UseList == nullptr; } @@ -300,10 +300,10 @@ class Value { User *user_back() { return *user_begin(); } const User *user_back() const { return *user_begin(); } iterator_range users() { - return iterator_range(user_begin(), user_end()); + return make_range(user_begin(), user_end()); } iterator_range users() const { - return iterator_range(user_begin(), user_end()); + return make_range(user_begin(), user_end()); } /// \brief Return true if there is exactly one user of this value. 
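The maximum-function-count accessors added to Module above give PGO a place to record the entry count of the hottest function. A small usage sketch; the count value is a placeholder, and the Optional<uint64_t> return type is assumed from the setter's uint64_t parameter:

    #include "llvm/IR/Module.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    // Record and read back the module-wide maximum function count used by PGO.
    static void recordMaxFunctionCount(Module &M) {
      M.setMaximumFunctionCount(1000);
      if (Optional<uint64_t> MaxCount = M.getMaximumFunctionCount())
        errs() << "max function count: " << *MaxCount << "\n";
    }
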
diff --git a/include/llvm/InitializePasses.h b/include/llvm/InitializePasses.h index 29c8b27db36a..72b6a0f011be 100644 --- a/include/llvm/InitializePasses.h +++ b/include/llvm/InitializePasses.h @@ -117,6 +117,8 @@ void initializeEdgeBundlesPass(PassRegistry&); void initializeExpandPostRAPass(PassRegistry&); void initializeAAResultsWrapperPassPass(PassRegistry &); void initializeGCOVProfilerPass(PassRegistry&); +void initializePGOInstrumentationGenPass(PassRegistry&); +void initializePGOInstrumentationUsePass(PassRegistry&); void initializeInstrProfilingPass(PassRegistry&); void initializeAddressSanitizerPass(PassRegistry&); void initializeAddressSanitizerModulePass(PassRegistry&); diff --git a/include/llvm/LTO/LTOCodeGenerator.h b/include/llvm/LTO/LTOCodeGenerator.h index c322288a1ae9..8a79e6044f5b 100644 --- a/include/llvm/LTO/LTOCodeGenerator.h +++ b/include/llvm/LTO/LTOCodeGenerator.h @@ -62,8 +62,7 @@ namespace llvm { struct LTOCodeGenerator { static const char *getVersionString(); - LTOCodeGenerator(); - LTOCodeGenerator(std::unique_ptr Context); + LTOCodeGenerator(LLVMContext &Context); ~LTOCodeGenerator(); /// Merge given module. Return true on success. @@ -149,6 +148,8 @@ struct LTOCodeGenerator { LLVMContext &getContext() { return Context; } + void resetMergedModule() { MergedModule.reset(); } + private: void initializeLTOPasses(); @@ -168,7 +169,6 @@ struct LTOCodeGenerator { typedef StringMap StringSet; - std::unique_ptr OwnedContext; LLVMContext &Context; std::unique_ptr MergedModule; std::unique_ptr IRLinker; diff --git a/include/llvm/LTO/LTOModule.h b/include/llvm/LTO/LTOModule.h index c3e86afe1d82..97b5865bd47f 100644 --- a/include/llvm/LTO/LTOModule.h +++ b/include/llvm/LTO/LTOModule.h @@ -91,25 +91,24 @@ struct LTOModule { /// InitializeAllTargetMCs(); /// InitializeAllAsmPrinters(); /// InitializeAllAsmParsers(); - static LTOModule *createFromFile(const char *path, TargetOptions options, - std::string &errMsg); - static LTOModule *createFromOpenFile(int fd, const char *path, size_t size, - TargetOptions options, - std::string &errMsg); - static LTOModule *createFromOpenFileSlice(int fd, const char *path, - size_t map_size, off_t offset, - TargetOptions options, - std::string &errMsg); - static LTOModule *createFromBuffer(const void *mem, size_t length, - TargetOptions options, std::string &errMsg, - StringRef path = ""); - - static LTOModule *createInLocalContext(const void *mem, size_t length, - TargetOptions options, - std::string &errMsg, StringRef path); - static LTOModule *createInContext(const void *mem, size_t length, - TargetOptions options, std::string &errMsg, - StringRef path, LLVMContext *Context); + static ErrorOr> + createFromFile(LLVMContext &Context, const char *path, TargetOptions options); + static ErrorOr> + createFromOpenFile(LLVMContext &Context, int fd, const char *path, + size_t size, TargetOptions options); + static ErrorOr> + createFromOpenFileSlice(LLVMContext &Context, int fd, const char *path, + size_t map_size, off_t offset, TargetOptions options); + static ErrorOr> + createFromBuffer(LLVMContext &Context, const void *mem, size_t length, + TargetOptions options, StringRef path = ""); + + static ErrorOr> + createInLocalContext(const void *mem, size_t length, TargetOptions options, + StringRef path); + static ErrorOr> + createInContext(const void *mem, size_t length, TargetOptions options, + StringRef path, LLVMContext *Context); const Module &getModule() const { return const_cast(this)->getModule(); @@ -170,7 +169,7 @@ struct LTOModule { /// 
Parse the symbols from the module and module-level ASM and add them to /// either the defined or undefined lists. - bool parseSymbols(std::string &errMsg); + void parseSymbols(); /// Add a symbol which isn't defined just yet to a list to be resolved later. void addPotentialUndefinedSymbol(const object::BasicSymbolRef &Sym, @@ -207,8 +206,9 @@ struct LTOModule { bool objcClassNameFromExpression(const Constant *c, std::string &name); /// Create an LTOModule (private version). - static LTOModule *makeLTOModule(MemoryBufferRef Buffer, TargetOptions options, - std::string &errMsg, LLVMContext *Context); + static ErrorOr<std::unique_ptr<LTOModule>> + makeLTOModule(MemoryBufferRef Buffer, TargetOptions options, + LLVMContext *Context); }; } #endif diff --git a/include/llvm/LinkAllPasses.h b/include/llvm/LinkAllPasses.h index fbc112ba45b7..29fcd93a2a1c 100644 --- a/include/llvm/LinkAllPasses.h +++ b/include/llvm/LinkAllPasses.h @@ -85,7 +85,10 @@ namespace { (void) llvm::createDomOnlyViewerPass(); (void) llvm::createDomViewerPass(); (void) llvm::createGCOVProfilerPass(); + (void) llvm::createPGOInstrumentationGenPass(); + (void) llvm::createPGOInstrumentationUsePass(); (void) llvm::createInstrProfilingPass(); + (void) llvm::createFunctionImportPass(); (void) llvm::createFunctionInliningPass(); (void) llvm::createAlwaysInlinerPass(); (void) llvm::createGlobalDCEPass(); diff --git a/include/llvm/Linker/IRMover.h b/include/llvm/Linker/IRMover.h new file mode 100644 index 000000000000..89d02f129c87 --- /dev/null +++ b/include/llvm/Linker/IRMover.h @@ -0,0 +1,78 @@ +//===- IRMover.h ------------------------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LINKER_IRMOVER_H +#define LLVM_LINKER_IRMOVER_H + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/IR/DiagnosticInfo.h" + +namespace llvm { +class GlobalValue; +class Module; +class StructType; +class Type; + +class IRMover { + struct StructTypeKeyInfo { + struct KeyTy { + ArrayRef<Type *> ETypes; + bool IsPacked; + KeyTy(ArrayRef<Type *> E, bool P); + KeyTy(const StructType *ST); + bool operator==(const KeyTy &that) const; + bool operator!=(const KeyTy &that) const; + }; + static StructType *getEmptyKey(); + static StructType *getTombstoneKey(); + static unsigned getHashValue(const KeyTy &Key); + static unsigned getHashValue(const StructType *ST); + static bool isEqual(const KeyTy &LHS, const StructType *RHS); + static bool isEqual(const StructType *LHS, const StructType *RHS); + }; + +public: + class IdentifiedStructTypeSet { + // The set of opaque types in the composite module. + DenseSet<StructType *> OpaqueStructTypes; + + // The set of identified but non opaque structures in the composite module. + DenseSet<StructType *> NonOpaqueStructTypes; + + public: + void addNonOpaque(StructType *Ty); + void switchToNonOpaque(StructType *Ty); + void addOpaque(StructType *Ty); + StructType *findNonOpaque(ArrayRef<Type *> ETypes, bool IsPacked); + bool hasType(StructType *Ty); + }; + + IRMover(Module &M, DiagnosticHandlerFunction DiagnosticHandler); + + typedef std::function<void(GlobalValue &)> ValueAdder; + /// Move in the provided values. The source is destroyed. + /// Returns true on error.
+ bool move(Module &Src, ArrayRef ValuesToLink, + std::function AddLazyFor); + Module &getModule() { return Composite; } + + DiagnosticHandlerFunction getDiagnosticHandler() const { + return DiagnosticHandler; + } + +private: + Module &Composite; + IdentifiedStructTypeSet IdentifiedStructTypes; + DiagnosticHandlerFunction DiagnosticHandler; +}; + +} // End llvm namespace + +#endif diff --git a/include/llvm/Linker/Linker.h b/include/llvm/Linker/Linker.h index 0c7dc910a65c..9ff61bc518b0 100644 --- a/include/llvm/Linker/Linker.h +++ b/include/llvm/Linker/Linker.h @@ -10,11 +10,9 @@ #ifndef LLVM_LINKER_LINKER_H #define LLVM_LINKER_LINKER_H -#include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/DenseSet.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/FunctionInfo.h" +#include "llvm/Linker/IRMover.h" namespace llvm { class Module; @@ -26,41 +24,9 @@ class Type; /// module since it is assumed that the user of this class will want to do /// something with it after the linking. class Linker { -public: - struct StructTypeKeyInfo { - struct KeyTy { - ArrayRef ETypes; - bool IsPacked; - KeyTy(ArrayRef E, bool P); - KeyTy(const StructType *ST); - bool operator==(const KeyTy &that) const; - bool operator!=(const KeyTy &that) const; - }; - static StructType *getEmptyKey(); - static StructType *getTombstoneKey(); - static unsigned getHashValue(const KeyTy &Key); - static unsigned getHashValue(const StructType *ST); - static bool isEqual(const KeyTy &LHS, const StructType *RHS); - static bool isEqual(const StructType *LHS, const StructType *RHS); - }; - - typedef DenseSet NonOpaqueStructTypeSet; - typedef DenseSet OpaqueStructTypeSet; - - struct IdentifiedStructTypeSet { - // The set of opaque types is the composite module. - OpaqueStructTypeSet OpaqueStructTypes; - - // The set of identified but non opaque structures in the composite module. - NonOpaqueStructTypeSet NonOpaqueStructTypes; - - void addNonOpaque(StructType *Ty); - void switchToNonOpaque(StructType *Ty); - void addOpaque(StructType *Ty); - StructType *findNonOpaque(ArrayRef ETypes, bool IsPacked); - bool hasType(StructType *Ty); - }; + IRMover Mover; +public: enum Flags { None = 0, OverrideFromSrc = (1 << 0), @@ -69,9 +35,6 @@ class Linker { }; Linker(Module &M, DiagnosticHandlerFunction DiagnosticHandler); - Linker(Module &M); - - Module &getModule() const { return Composite; } /// \brief Link \p Src into the composite. The source is destroyed. /// @@ -90,17 +53,18 @@ class Linker { DiagnosticHandlerFunction DiagnosticHandler, unsigned Flags = Flags::None); - static bool linkModules(Module &Dest, Module &Src, - unsigned Flags = Flags::None); - -private: - Module &Composite; - - IdentifiedStructTypeSet IdentifiedStructTypes; - - DiagnosticHandlerFunction DiagnosticHandler; + DiagnosticHandlerFunction getDiagnosticHandler() const { + return Mover.getDiagnosticHandler(); + } }; +/// Create a new module with exported local functions renamed and promoted +/// for ThinLTO. 
+std::unique_ptr +renameModuleForThinLTO(std::unique_ptr &M, + const FunctionInfoIndex *Index, + DiagnosticHandlerFunction DiagnosticHandler); + } // End llvm namespace #endif diff --git a/include/llvm/MC/MCInstrDesc.h b/include/llvm/MC/MCInstrDesc.h index 1baf82ee5c45..88aab73d4058 100644 --- a/include/llvm/MC/MCInstrDesc.h +++ b/include/llvm/MC/MCInstrDesc.h @@ -15,12 +15,12 @@ #ifndef LLVM_MC_MCINSTRDESC_H #define LLVM_MC_MCINSTRDESC_H +#include "llvm/MC/MCRegisterInfo.h" #include "llvm/Support/DataTypes.h" #include namespace llvm { class MCInst; - class MCRegisterInfo; class MCSubtargetInfo; class FeatureBitset; @@ -137,16 +137,16 @@ enum Flag { /// directly to describe itself. class MCInstrDesc { public: - unsigned short Opcode; // The opcode number - unsigned short NumOperands; // Num of args (may be more if variable_ops) - unsigned char NumDefs; // Num of args that are definitions - unsigned char Size; // Number of bytes in encoding. - unsigned short SchedClass; // enum identifying instr sched class - uint64_t Flags; // Flags identifying machine instr class - uint64_t TSFlags; // Target Specific Flag values - const uint16_t *ImplicitUses; // Registers implicitly read by this instr - const uint16_t *ImplicitDefs; // Registers implicitly defined by this instr - const MCOperandInfo *OpInfo; // 'NumOperands' entries about operands + unsigned short Opcode; // The opcode number + unsigned short NumOperands; // Num of args (may be more if variable_ops) + unsigned char NumDefs; // Num of args that are definitions + unsigned char Size; // Number of bytes in encoding. + unsigned short SchedClass; // enum identifying instr sched class + uint64_t Flags; // Flags identifying machine instr class + uint64_t TSFlags; // Target Specific Flag values + const MCPhysReg *ImplicitUses; // Registers implicitly read by this instr + const MCPhysReg *ImplicitDefs; // Registers implicitly defined by this instr + const MCOperandInfo *OpInfo; // 'NumOperands' entries about operands // Subtarget feature that this is deprecated on, if any // -1 implies this is not deprecated by any single feature. It may still be // deprecated due to a "complex" reason, below. @@ -472,7 +472,7 @@ class MCInstrDesc { /// marked as implicitly reading the 'CL' register, which it always does. /// /// This method returns null if the instruction has no implicit uses. - const uint16_t *getImplicitUses() const { return ImplicitUses; } + const MCPhysReg *getImplicitUses() const { return ImplicitUses; } /// \brief Return the number of implicit uses this instruction has. unsigned getNumImplicitUses() const { @@ -494,7 +494,7 @@ class MCInstrDesc { /// EAX/EDX/EFLAGS registers. /// /// This method returns null if the instruction has no implicit defs. - const uint16_t *getImplicitDefs() const { return ImplicitDefs; } + const MCPhysReg *getImplicitDefs() const { return ImplicitDefs; } /// \brief Return the number of implicit defs this instruct has. unsigned getNumImplicitDefs() const { @@ -509,7 +509,7 @@ class MCInstrDesc { /// \brief Return true if this instruction implicitly /// uses the specified physical register. 
bool hasImplicitUseOfPhysReg(unsigned Reg) const { - if (const uint16_t *ImpUses = ImplicitUses) + if (const MCPhysReg *ImpUses = ImplicitUses) for (; *ImpUses; ++ImpUses) if (*ImpUses == Reg) return true; diff --git a/include/llvm/MC/MCObjectFileInfo.h b/include/llvm/MC/MCObjectFileInfo.h index 388a208fb4a0..cf2c3f12bb6b 100644 --- a/include/llvm/MC/MCObjectFileInfo.h +++ b/include/llvm/MC/MCObjectFileInfo.h @@ -118,6 +118,7 @@ class MCObjectFileInfo { // These are for Fission DWP files. MCSection *DwarfCUIndexSection; + MCSection *DwarfTUIndexSection; /// Section for newer gnu pubnames. MCSection *DwarfGnuPubNamesSection; @@ -266,6 +267,7 @@ class MCObjectFileInfo { MCSection *getDwarfStrOffDWOSection() const { return DwarfStrOffDWOSection; } MCSection *getDwarfAddrSection() const { return DwarfAddrSection; } MCSection *getDwarfCUIndexSection() const { return DwarfCUIndexSection; } + MCSection *getDwarfTUIndexSection() const { return DwarfTUIndexSection; } MCSection *getCOFFDebugSymbolsSection() const { return COFFDebugSymbolsSection; diff --git a/include/llvm/Object/Archive.h b/include/llvm/Object/Archive.h index a68f200ce3fb..8dd042a2533f 100644 --- a/include/llvm/Object/Archive.h +++ b/include/llvm/Object/Archive.h @@ -191,14 +191,13 @@ class Archive : public Binary { child_iterator child_begin(bool SkipInternal = true) const; child_iterator child_end() const; iterator_range children(bool SkipInternal = true) const { - return iterator_range(child_begin(SkipInternal), - child_end()); + return make_range(child_begin(SkipInternal), child_end()); } symbol_iterator symbol_begin() const; symbol_iterator symbol_end() const; iterator_range symbols() const { - return iterator_range(symbol_begin(), symbol_end()); + return make_range(symbol_begin(), symbol_end()); } // Cast methods. diff --git a/include/llvm/Object/ObjectFile.h b/include/llvm/Object/ObjectFile.h index 08131908e067..ce0c891ee0c2 100644 --- a/include/llvm/Object/ObjectFile.h +++ b/include/llvm/Object/ObjectFile.h @@ -100,8 +100,7 @@ class SectionRef { relocation_iterator relocation_begin() const; relocation_iterator relocation_end() const; iterator_range relocations() const { - return iterator_range(relocation_begin(), - relocation_end()); + return make_range(relocation_begin(), relocation_end()); } section_iterator getRelocatedSection() const; diff --git a/include/llvm/PassAnalysisSupport.h b/include/llvm/PassAnalysisSupport.h index f6265b62cbf5..492a4ef464f8 100644 --- a/include/llvm/PassAnalysisSupport.h +++ b/include/llvm/PassAnalysisSupport.h @@ -36,11 +36,17 @@ namespace llvm { /// class AnalysisUsage { public: - typedef SmallVector VectorType; + typedef SmallVectorImpl VectorType; private: /// Sets of analyses required and preserved by a pass - VectorType Required, RequiredTransitive, Preserved, Used; + // TODO: It's not clear that SmallVector is an appropriate data structure for + // this usecase. The sizes were picked to minimize wasted space, but are + // otherwise fairly meaningless. 
+ SmallVector Required; + SmallVector RequiredTransitive; + SmallVector Preserved; + SmallVector Used; bool PreservesAll; public: diff --git a/include/llvm/ProfileData/InstrProf.h b/include/llvm/ProfileData/InstrProf.h index 956485119102..2730cc167f7d 100644 --- a/include/llvm/ProfileData/InstrProf.h +++ b/include/llvm/ProfileData/InstrProf.h @@ -16,9 +16,9 @@ #ifndef LLVM_PROFILEDATA_INSTRPROF_H_ #define LLVM_PROFILEDATA_INSTRPROF_H_ +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSet.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/IR/GlobalValue.h" #include "llvm/ProfileData/InstrProfData.inc" #include "llvm/Support/Endian.h" @@ -30,6 +30,7 @@ #include #include +#define INSTR_PROF_INDEX_VERSION 3 namespace llvm { class Function; @@ -132,7 +133,8 @@ inline StringRef getInstrProfFileOverriderFuncName() { /// Return the modified name for function \c F suitable to be /// used the key for profile lookup. -std::string getPGOFuncName(const Function &F); +std::string getPGOFuncName(const Function &F, + uint64_t Version = INSTR_PROF_INDEX_VERSION); /// Return the modified name for a function suitable to be /// used the key for profile lookup. The function's original @@ -140,7 +142,8 @@ std::string getPGOFuncName(const Function &F); /// The function is defined in module \c FileName. std::string getPGOFuncName(StringRef RawFuncName, GlobalValue::LinkageTypes Linkage, - StringRef FileName); + StringRef FileName, + uint64_t Version = INSTR_PROF_INDEX_VERSION); /// Create and return the global variable for function name used in PGO /// instrumentation. \c FuncName is the name of the function returned @@ -432,7 +435,7 @@ instrprof_error InstrProfRecord::merge(InstrProfRecord &Other) { for (size_t I = 0, E = Other.Counts.size(); I < E; ++I) { bool ResultOverflowed; - Counts[I] = SaturatingAdd(Counts[I], Other.Counts[I], ResultOverflowed); + Counts[I] = SaturatingAdd(Counts[I], Other.Counts[I], &ResultOverflowed); if (ResultOverflowed) Result = instrprof_error::counter_overflow; } @@ -504,7 +507,7 @@ static inline uint64_t ComputeHash(HashT Type, StringRef K) { } const uint64_t Magic = 0x8169666f72706cff; // "\xfflprofi\x81" -const uint64_t Version = 3; +const uint64_t Version = INSTR_PROF_INDEX_VERSION; const HashT HashType = HashT::MD5; // This structure defines the file header of the LLVM profile diff --git a/include/llvm/ProfileData/InstrProfData.inc b/include/llvm/ProfileData/InstrProfData.inc index aefdbc1b3e47..48dae506cabb 100644 --- a/include/llvm/ProfileData/InstrProfData.inc +++ b/include/llvm/ProfileData/InstrProfData.inc @@ -362,6 +362,7 @@ uint32_t getValueProfDataSizeRT(const ValueProfRuntimeRecord *Record); ValueProfData * serializeValueProfDataFromRT(const ValueProfRuntimeRecord *Record, ValueProfData *Dst); +uint32_t getNumValueKindsRT(const void *R); #undef INSTR_PROF_VALUE_PROF_DATA #endif /* INSTR_PROF_VALUE_PROF_DATA */ diff --git a/include/llvm/ProfileData/InstrProfReader.h b/include/llvm/ProfileData/InstrProfReader.h index 49233366e164..304606de6791 100644 --- a/include/llvm/ProfileData/InstrProfReader.h +++ b/include/llvm/ProfileData/InstrProfReader.h @@ -136,7 +136,6 @@ class RawInstrProfReader : public InstrProfReader { bool ShouldSwapBytes; uint64_t CountersDelta; uint64_t NamesDelta; - uint64_t ValueDataDelta; const RawInstrProf::ProfileData *Data; const RawInstrProf::ProfileData *DataEnd; const uint64_t *CountersStart; @@ -144,6 +143,7 @@ class RawInstrProfReader : public InstrProfReader { const uint8_t *ValueDataStart; const char 
*ProfileEnd; uint32_t ValueKindLast; + uint32_t CurValueDataSize; // String table for holding a unique copy of all the strings in the profile. InstrProfStringTable StringTable; @@ -162,10 +162,19 @@ class RawInstrProfReader : public InstrProfReader { private: std::error_code readNextHeader(const char *CurrentPos); std::error_code readHeader(const RawInstrProf::Header &Header); - template - IntT swap(IntT Int) const { + template IntT swap(IntT Int) const { return ShouldSwapBytes ? sys::getSwappedBytes(Int) : Int; } + support::endianness getDataEndianness() const { + support::endianness HostEndian = getHostEndianness(); + if (!ShouldSwapBytes) + return HostEndian; + if (HostEndian == support::little) + return support::big; + else + return support::little; + } + inline uint8_t getNumPaddingBytes(uint64_t SizeInBytes) { return 7 & (sizeof(uint64_t) - SizeInBytes % sizeof(uint64_t)); } @@ -174,7 +183,10 @@ class RawInstrProfReader : public InstrProfReader { std::error_code readRawCounts(InstrProfRecord &Record); std::error_code readValueProfilingData(InstrProfRecord &Record); bool atEnd() const { return Data == DataEnd; } - void advanceData() { Data++; } + void advanceData() { + Data++; + ValueDataStart += CurValueDataSize; + } const uint64_t *getCounter(IntPtrT CounterPtr) const { ptrdiff_t Offset = (swap(CounterPtr) - CountersDelta) / sizeof(uint64_t); @@ -184,17 +196,6 @@ class RawInstrProfReader : public InstrProfReader { ptrdiff_t Offset = (swap(NamePtr) - NamesDelta) / sizeof(char); return NamesStart + Offset; } - const uint8_t *getValueDataCounts(IntPtrT ValueCountsPtr) const { - ptrdiff_t Offset = - (swap(ValueCountsPtr) - ValueDataDelta) / sizeof(uint8_t); - return ValueDataStart + Offset; - } - // This accepts an already byte-swapped ValueDataPtr argument. - const InstrProfValueData *getValueData(IntPtrT ValueDataPtr) const { - ptrdiff_t Offset = (ValueDataPtr - ValueDataDelta) / sizeof(uint8_t); - return reinterpret_cast(ValueDataStart + - Offset); - } }; typedef RawInstrProfReader RawInstrProfReader32; @@ -270,6 +271,7 @@ struct InstrProfReaderIndexBase { virtual bool atEnd() const = 0; virtual void setValueProfDataEndianness(support::endianness Endianness) = 0; virtual ~InstrProfReaderIndexBase() {} + virtual uint64_t getVersion() const = 0; }; typedef OnDiskIterableChainedHashTable @@ -303,6 +305,7 @@ class InstrProfReaderIndex : public InstrProfReaderIndexBase { HashTable->getInfoObj().setValueProfDataEndianness(Endianness); } ~InstrProfReaderIndex() override {} + uint64_t getVersion() const override { return FormatVersion; } }; /// Reader for the indexed binary instrprof format. @@ -319,6 +322,7 @@ class IndexedInstrProfReader : public InstrProfReader { IndexedInstrProfReader &operator=(const IndexedInstrProfReader &) = delete; public: + uint64_t getVersion() const { return Index->getVersion(); } IndexedInstrProfReader(std::unique_ptr DataBuffer) : DataBuffer(std::move(DataBuffer)), Index(nullptr) {} diff --git a/include/llvm/ProfileData/SampleProfReader.h b/include/llvm/ProfileData/SampleProfReader.h index 1fb2cf6e0ca4..9762813264f4 100644 --- a/include/llvm/ProfileData/SampleProfReader.h +++ b/include/llvm/ProfileData/SampleProfReader.h @@ -267,6 +267,10 @@ class SampleProfileReader { static ErrorOr> create(StringRef Filename, LLVMContext &C); + /// \brief Create a sample profile reader from the supplied memory buffer. + static ErrorOr> + create(std::unique_ptr &B, LLVMContext &C); + protected: /// \brief Map every function to its associated profile. 
/// diff --git a/include/llvm/ProfileData/SampleProfWriter.h b/include/llvm/ProfileData/SampleProfWriter.h index d1cd506d157c..029dd2ebacb0 100644 --- a/include/llvm/ProfileData/SampleProfWriter.h +++ b/include/llvm/ProfileData/SampleProfWriter.h @@ -29,9 +29,6 @@ enum SampleProfileFormat { SPF_None = 0, SPF_Text, SPF_Binary, SPF_GCC }; /// \brief Sample-based profile writer. Base class. class SampleProfileWriter { public: - SampleProfileWriter(StringRef Filename, std::error_code &EC, - sys::fs::OpenFlags Flags) - : OS(Filename, EC, Flags) {} virtual ~SampleProfileWriter() {} /// Write sample profiles in \p S for function \p FName. @@ -55,30 +52,40 @@ class SampleProfileWriter { return sampleprof_error::success; } + raw_ostream &getOutputStream() { return *OutputStream; } + /// Profile writer factory. /// - /// Create a new writer based on the value of \p Format. + /// Create a new file writer based on the value of \p Format. static ErrorOr> create(StringRef Filename, SampleProfileFormat Format); + /// Create a new stream writer based on the value of \p Format. + /// For testing. + static ErrorOr> + create(std::unique_ptr &OS, SampleProfileFormat Format); + protected: + SampleProfileWriter(std::unique_ptr &OS) + : OutputStream(std::move(OS)) {} + /// \brief Write a file header for the profile file. virtual std::error_code writeHeader(const StringMap &ProfileMap) = 0; /// \brief Output stream where to emit the profile to. - raw_fd_ostream OS; + std::unique_ptr OutputStream; }; /// \brief Sample-based profile writer (text format). class SampleProfileWriterText : public SampleProfileWriter { public: - SampleProfileWriterText(StringRef F, std::error_code &EC) - : SampleProfileWriter(F, EC, sys::fs::F_Text), Indent(0) {} - std::error_code write(StringRef FName, const FunctionSamples &S) override; protected: + SampleProfileWriterText(std::unique_ptr &OS) + : SampleProfileWriter(OS), Indent(0) {} + std::error_code writeHeader(const StringMap &ProfileMap) override { return sampleprof_error::success; @@ -89,17 +96,21 @@ class SampleProfileWriterText : public SampleProfileWriter { /// /// This is used when printing inlined callees. unsigned Indent; + + friend ErrorOr> + SampleProfileWriter::create(std::unique_ptr &OS, + SampleProfileFormat Format); }; /// \brief Sample-based profile writer (binary format). class SampleProfileWriterBinary : public SampleProfileWriter { public: - SampleProfileWriterBinary(StringRef F, std::error_code &EC) - : SampleProfileWriter(F, EC, sys::fs::F_None), NameTable() {} - std::error_code write(StringRef F, const FunctionSamples &S) override; protected: + SampleProfileWriterBinary(std::unique_ptr &OS) + : SampleProfileWriter(OS), NameTable() {} + std::error_code writeHeader(const StringMap &ProfileMap) override; std::error_code writeNameIdx(StringRef FName); @@ -110,6 +121,10 @@ class SampleProfileWriterBinary : public SampleProfileWriter { void addNames(const FunctionSamples &S); MapVector NameTable; + + friend ErrorOr> + SampleProfileWriter::create(std::unique_ptr &OS, + SampleProfileFormat Format); }; } // End namespace sampleprof diff --git a/include/llvm/Support/Allocator.h b/include/llvm/Support/Allocator.h index f9b5cf22f97d..c608736fa956 100644 --- a/include/llvm/Support/Allocator.h +++ b/include/llvm/Support/Allocator.h @@ -222,6 +222,8 @@ class BumpPtrAllocatorImpl // Without this, MemorySanitizer messages for values originated from here // will point to the allocation of the entire slab. 
__msan_allocated_memory(AlignedPtr, Size); + // Similarly, tell ASan about this space. + __asan_unpoison_memory_region(AlignedPtr, Size); return AlignedPtr; } @@ -229,12 +231,16 @@ class BumpPtrAllocatorImpl size_t PaddedSize = Size + Alignment - 1; if (PaddedSize > SizeThreshold) { void *NewSlab = Allocator.Allocate(PaddedSize, 0); + // We own the new slab and don't want anyone reading anything other than + // pieces returned from this method. So poison the whole slab. + __asan_poison_memory_region(NewSlab, PaddedSize); CustomSizedSlabs.push_back(std::make_pair(NewSlab, PaddedSize)); uintptr_t AlignedAddr = alignAddr(NewSlab, Alignment); assert(AlignedAddr + Size <= (uintptr_t)NewSlab + PaddedSize); char *AlignedPtr = (char*)AlignedAddr; __msan_allocated_memory(AlignedPtr, Size); + __asan_unpoison_memory_region(AlignedPtr, Size); return AlignedPtr; } @@ -246,13 +252,16 @@ class BumpPtrAllocatorImpl char *AlignedPtr = (char*)AlignedAddr; CurPtr = AlignedPtr + Size; __msan_allocated_memory(AlignedPtr, Size); + __asan_unpoison_memory_region(AlignedPtr, Size); return AlignedPtr; } // Pull in base class overloads. using AllocatorBase::Allocate; - void Deallocate(const void * /*Ptr*/, size_t /*Size*/) {} + void Deallocate(const void *Ptr, size_t Size) { + __asan_poison_memory_region(Ptr, Size); + } // Pull in base class overloads. using AllocatorBase::Deallocate; @@ -310,6 +319,10 @@ class BumpPtrAllocatorImpl size_t AllocatedSlabSize = computeSlabSize(Slabs.size()); void *NewSlab = Allocator.Allocate(AllocatedSlabSize, 0); + // We own the new slab and don't want anyone reading anything other than + // pieces returned from this method. So poison the whole slab. + __asan_poison_memory_region(NewSlab, AllocatedSlabSize); + Slabs.push_back(NewSlab); CurPtr = (char *)(NewSlab); End = ((char *)NewSlab) + AllocatedSlabSize; diff --git a/include/llvm/Support/Compiler.h b/include/llvm/Support/Compiler.h index 99a02f7ada03..b3416bbfffb6 100644 --- a/include/llvm/Support/Compiler.h +++ b/include/llvm/Support/Compiler.h @@ -361,8 +361,11 @@ /// \brief Whether LLVM itself is built with AddressSanitizer instrumentation. #if __has_feature(address_sanitizer) || defined(__SANITIZE_ADDRESS__) # define LLVM_ADDRESS_SANITIZER_BUILD 1 +# include <sanitizer/asan_interface.h> #else # define LLVM_ADDRESS_SANITIZER_BUILD 0 +# define __asan_poison_memory_region(p, size) +# define __asan_unpoison_memory_region(p, size) #endif /// \macro LLVM_THREAD_SANITIZER_BUILD diff --git a/include/llvm/Support/Dwarf.h b/include/llvm/Support/Dwarf.h index 8d71353b1670..b63d12e9ff39 100644 --- a/include/llvm/Support/Dwarf.h +++ b/include/llvm/Support/Dwarf.h @@ -625,6 +625,7 @@ const char *GDBIndexEntryLinkageString(GDBIndexEntryLinkage Linkage); /// /// \li \a getTag() returns \a DW_TAG_invalid on invalid input. /// \li \a getVirtuality() returns \a DW_VIRTUALITY_invalid on invalid input. +/// \li \a getMacinfo() returns \a DW_MACINFO_invalid on invalid input.
/// /// @{ unsigned getTag(StringRef TagString); @@ -632,6 +633,7 @@ unsigned getOperationEncoding(StringRef OperationEncodingString); unsigned getVirtuality(StringRef VirtualityString); unsigned getLanguage(StringRef LanguageString); unsigned getAttributeEncoding(StringRef EncodingString); +unsigned getMacinfo(StringRef MacinfoString); /// @} /// \brief Returns the symbolic string representing Val when used as a value diff --git a/include/llvm/Support/MathExtras.h b/include/llvm/Support/MathExtras.h index 0d0a2efa7ad2..8111aeebe6ee 100644 --- a/include/llvm/Support/MathExtras.h +++ b/include/llvm/Support/MathExtras.h @@ -659,38 +659,34 @@ inline int64_t SignExtend64(uint64_t X, unsigned B) { /// representable value of type T. template typename std::enable_if::value, T>::type -SaturatingAdd(T X, T Y, bool &ResultOverflowed) { +SaturatingAdd(T X, T Y, bool *ResultOverflowed = nullptr) { + bool Dummy; + bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy; // Hacker's Delight, p. 29 T Z = X + Y; - ResultOverflowed = (Z < X || Z < Y); - if (ResultOverflowed) + Overflowed = (Z < X || Z < Y); + if (Overflowed) return std::numeric_limits::max(); else return Z; } -/// \brief Add two unsigned integers, X and Y, of type T. -/// Clamp the result to the maximum representable value of T on overflow. -template -typename std::enable_if::value, T>::type -SaturatingAdd(T X, T Y) { - bool ResultOverflowed; - return SaturatingAdd(X, Y, ResultOverflowed); -} - /// \brief Multiply two unsigned integers, X and Y, of type T. /// Clamp the result to the maximum representable value of T on overflow. /// ResultOverflowed indicates if the result is larger than the maximum /// representable value of type T. template typename std::enable_if::value, T>::type -SaturatingMultiply(T X, T Y, bool &ResultOverflowed) { +SaturatingMultiply(T X, T Y, bool *ResultOverflowed = nullptr) { + bool Dummy; + bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy; + // Hacker's Delight, p. 30 has a different algorithm, but we don't use that // because it fails for uint16_t (where multiplication can have undefined // behavior due to promotion to int), and requires a division in addition // to the multiplication. - ResultOverflowed = false; + Overflowed = false; // Log2(Z) would be either Log2Z or Log2Z + 1. // Special case: if X or Y is 0, Log2_64 gives -1, and Log2Z @@ -702,7 +698,7 @@ SaturatingMultiply(T X, T Y, bool &ResultOverflowed) { return X * Y; } if (Log2Z > Log2Max) { - ResultOverflowed = true; + Overflowed = true; return Max; } @@ -711,7 +707,7 @@ SaturatingMultiply(T X, T Y, bool &ResultOverflowed) { // that on at the end. T Z = (X >> 1) * Y; if (Z & ~(Max >> 1)) { - ResultOverflowed = true; + Overflowed = true; return Max; } Z <<= 1; @@ -721,15 +717,6 @@ SaturatingMultiply(T X, T Y, bool &ResultOverflowed) { return Z; } -/// \brief Multiply two unsigned integers, X and Y, of type T. -/// Clamp the result to the maximum representable value of T on overflow. 
-template -typename std::enable_if::value, T>::type -SaturatingMultiply(T X, T Y) { - bool ResultOverflowed; - return SaturatingMultiply(X, Y, ResultOverflowed); -} - extern const float huge_valf; } // End llvm namespace diff --git a/include/llvm/Support/Printable.h b/include/llvm/Support/Printable.h new file mode 100644 index 000000000000..5c1b8d5070d4 --- /dev/null +++ b/include/llvm/Support/Printable.h @@ -0,0 +1,52 @@ +//===--- Printable.h - Print function helpers -------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the Printable struct. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_SUPPORT_PRINTABLE_H +#define LLVM_SUPPORT_PRINTABLE_H + +#include <functional> + +namespace llvm { + +class raw_ostream; + +/// Simple wrapper around std::function<void(raw_ostream&)>. +/// This class is useful to construct print helpers for raw_ostream. +/// +/// Example: +/// Printable PrintRegister(unsigned Register) { +/// return Printable([Register](raw_ostream &OS) { +/// OS << getRegisterName(Register); +/// }); +/// } +/// ... OS << PrintRegister(Register); ... +/// +/// Implementation note: Ideally this would just be a typedef, but doing so +/// leads to operator << being ambiguous as function has matching constructors +/// in some STL versions. I have seen the problem on gcc 4.6 libstdc++ and +/// microsoft STL. +class Printable { +public: + std::function<void(raw_ostream &OS)> Print; + Printable(const std::function<void(raw_ostream &OS)> Print) + : Print(Print) {} +}; + +static inline raw_ostream &operator<<(raw_ostream &OS, const Printable &P) { + P.Print(OS); + return OS; +} + +} + +#endif diff --git a/include/llvm/Support/Registry.h b/include/llvm/Support/Registry.h index 2cb8f3c719fa..bbea97b289a6 100644 --- a/include/llvm/Support/Registry.h +++ b/include/llvm/Support/Registry.h @@ -119,7 +119,7 @@ namespace llvm { static iterator end() { return iterator(nullptr); } static iterator_range entries() { - return iterator_range(begin(), end()); + return make_range(begin(), end()); } /// Abstract base class for registry listeners, which are informed when new diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h index 819458dbb0f0..f7152565f914 100644 --- a/include/llvm/Target/TargetLowering.h +++ b/include/llvm/Target/TargetLowering.h @@ -83,7 +83,7 @@ class TargetLoweringBase { public: /// This enum indicates whether operations are valid for a target, and if not, /// what action should be used to make them valid. - enum LegalizeAction { + enum LegalizeAction : uint8_t { Legal, // The target natively supports this operation. Promote, // This operation should be executed in a larger type. Expand, // Try to expand this to other ops, otherwise use a libcall. @@ -93,11 +93,12 @@ class TargetLoweringBase { /// This enum indicates whether types are legal for a target, and if not, /// what action should be used to make them valid. - enum LegalizeTypeAction { + enum LegalizeTypeAction : uint8_t { TypeLegal, // The target natively supports this type. TypePromoteInteger, // Replace this integer with a larger one. TypeExpandInteger, // Split this integer into two of half the size. - TypeSoftenFloat, // Convert this float to a same size integer type,
+ TypeSoftenFloat, // Convert this float to a same size integer type, + // if an operation is not supported in target HW. TypeExpandFloat, // Split this float into two of half the size. TypeScalarizeVector, // Replace this one-element vector with its element. TypeSplitVector, // Split this vector into two of half the size. @@ -412,20 +413,20 @@ class TargetLoweringBase { class ValueTypeActionImpl { /// ValueTypeActions - For each value type, keep a LegalizeTypeAction enum /// that indicates how instruction selection should deal with the type. - uint8_t ValueTypeActions[MVT::LAST_VALUETYPE]; + LegalizeTypeAction ValueTypeActions[MVT::LAST_VALUETYPE]; public: ValueTypeActionImpl() { - std::fill(std::begin(ValueTypeActions), std::end(ValueTypeActions), 0); + std::fill(std::begin(ValueTypeActions), std::end(ValueTypeActions), + TypeLegal); } LegalizeTypeAction getTypeAction(MVT VT) const { - return (LegalizeTypeAction)ValueTypeActions[VT.SimpleTy]; + return ValueTypeActions[VT.SimpleTy]; } void setTypeAction(MVT VT, LegalizeTypeAction Action) { - unsigned I = VT.SimpleTy; - ValueTypeActions[I] = Action; + ValueTypeActions[VT.SimpleTy] = Action; } }; @@ -549,8 +550,7 @@ class TargetLoweringBase { // If a target-specific SDNode requires legalization, require the target // to provide custom legalization for it. if (Op > array_lengthof(OpActions[0])) return Custom; - unsigned I = (unsigned) VT.getSimpleVT().SimpleTy; - return (LegalizeAction)OpActions[I][Op]; + return OpActions[(unsigned)VT.getSimpleVT().SimpleTy][Op]; } /// Return true if the specified operation is legal on this target or can be @@ -594,7 +594,7 @@ class TargetLoweringBase { unsigned MemI = (unsigned) MemVT.getSimpleVT().SimpleTy; assert(ExtType < ISD::LAST_LOADEXT_TYPE && ValI < MVT::LAST_VALUETYPE && MemI < MVT::LAST_VALUETYPE && "Table isn't big enough!"); - return (LegalizeAction)LoadExtActions[ValI][MemI][ExtType]; + return LoadExtActions[ValI][MemI][ExtType]; } /// Return true if the specified load with extension is legal on this target. @@ -620,7 +620,7 @@ class TargetLoweringBase { unsigned MemI = (unsigned) MemVT.getSimpleVT().SimpleTy; assert(ValI < MVT::LAST_VALUETYPE && MemI < MVT::LAST_VALUETYPE && "Table isn't big enough!"); - return (LegalizeAction)TruncStoreActions[ValI][MemI]; + return TruncStoreActions[ValI][MemI]; } /// Return true if the specified store with truncation is legal on this @@ -1292,7 +1292,7 @@ class TargetLoweringBase { /// Remove all register classes. 
void clearRegisterClasses() { - memset(RegClassForVT, 0,MVT::LAST_VALUETYPE * sizeof(TargetRegisterClass*)); + std::fill(std::begin(RegClassForVT), std::end(RegClassForVT), nullptr); AvailableRegClasses.clear(); } @@ -1315,7 +1315,7 @@ class TargetLoweringBase { void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action) { assert(Op < array_lengthof(OpActions[0]) && "Table isn't big enough!"); - OpActions[(unsigned)VT.SimpleTy][Op] = (uint8_t)Action; + OpActions[(unsigned)VT.SimpleTy][Op] = Action; } /// Indicate that the specified load with extension does not work with the @@ -1324,7 +1324,7 @@ class TargetLoweringBase { LegalizeAction Action) { assert(ExtType < ISD::LAST_LOADEXT_TYPE && ValVT.isValid() && MemVT.isValid() && "Table isn't big enough!"); - LoadExtActions[ValVT.SimpleTy][MemVT.SimpleTy][ExtType] = (uint8_t)Action; + LoadExtActions[(unsigned)ValVT.SimpleTy][MemVT.SimpleTy][ExtType] = Action; } /// Indicate that the specified truncating store does not work with the @@ -1332,7 +1332,7 @@ class TargetLoweringBase { void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action) { assert(ValVT.isValid() && MemVT.isValid() && "Table isn't big enough!"); - TruncStoreActions[ValVT.SimpleTy][MemVT.SimpleTy] = (uint8_t)Action; + TruncStoreActions[(unsigned)ValVT.SimpleTy][MemVT.SimpleTy] = Action; } /// Indicate that the specified indexed load does or does not work with the @@ -1885,17 +1885,17 @@ class TargetLoweringBase { /// operations are Legal (aka, supported natively by the target), but /// operations that are not should be described. Note that operations on /// non-legal value types are not described here. - uint8_t OpActions[MVT::LAST_VALUETYPE][ISD::BUILTIN_OP_END]; + LegalizeAction OpActions[MVT::LAST_VALUETYPE][ISD::BUILTIN_OP_END]; /// For each load extension type and each value type, keep a LegalizeAction /// that indicates how instruction selection should deal with a load of a /// specific value type and extension type. - uint8_t LoadExtActions[MVT::LAST_VALUETYPE][MVT::LAST_VALUETYPE] - [ISD::LAST_LOADEXT_TYPE]; + LegalizeAction LoadExtActions[MVT::LAST_VALUETYPE][MVT::LAST_VALUETYPE] + [ISD::LAST_LOADEXT_TYPE]; /// For each value type pair keep a LegalizeAction that indicates whether a /// truncating store of a specific value type and truncating type is legal. - uint8_t TruncStoreActions[MVT::LAST_VALUETYPE][MVT::LAST_VALUETYPE]; + LegalizeAction TruncStoreActions[MVT::LAST_VALUETYPE][MVT::LAST_VALUETYPE]; /// For each indexed mode and each value type, keep a pair of LegalizeAction /// that indicates how instruction selection should deal with the load / @@ -1913,6 +1913,7 @@ class TargetLoweringBase { /// up the MVT::LAST_VALUETYPE value to the next multiple of 8. uint32_t CondCodeActions[ISD::SETCC_INVALID][(MVT::LAST_VALUETYPE + 7) / 8]; +protected: ValueTypeActionImpl ValueTypeActions; private: diff --git a/include/llvm/Target/TargetMachine.h b/include/llvm/Target/TargetMachine.h index b7760a61806f..74e91b5790cb 100644 --- a/include/llvm/Target/TargetMachine.h +++ b/include/llvm/Target/TargetMachine.h @@ -255,6 +255,13 @@ class TargetMachine { return true; } + /// True if subtarget inserts the final scheduling pass on its own. + /// + /// Branch relaxation, which must happen after block placement, can + /// on some targets (e.g. SystemZ) expose additional post-RA + /// scheduling opportunities. 
+ virtual bool targetSchedulesPostRAScheduling() const { return false; }; + void getNameWithPrefix(SmallVectorImpl &Name, const GlobalValue *GV, Mangler &Mang, bool MayAlwaysUsePrivate = false) const; MCSymbol *getSymbol(const GlobalValue *GV, Mangler &Mang) const; diff --git a/include/llvm/Target/TargetRegisterInfo.h b/include/llvm/Target/TargetRegisterInfo.h index e8926f788156..414255edb23e 100644 --- a/include/llvm/Target/TargetRegisterInfo.h +++ b/include/llvm/Target/TargetRegisterInfo.h @@ -22,6 +22,7 @@ #include "llvm/IR/CallingConv.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Printable.h" #include #include @@ -614,9 +615,13 @@ class TargetRegisterInfo : public MCRegisterInfo { /// Find the largest common subclass of A and B. /// Return NULL if there is no common subclass. + /// The common subclass should contain + /// simple value type SVT if it is not the Any type. const TargetRegisterClass * getCommonSubClass(const TargetRegisterClass *A, - const TargetRegisterClass *B) const; + const TargetRegisterClass *B, + const MVT::SimpleValueType SVT = + MVT::SimpleValueType::Any) const; /// Returns a TargetRegisterClass used for pointer values. /// If a target supports multiple different pointer register classes, @@ -928,7 +933,6 @@ struct VirtReg2IndexFunctor : public std::unary_function { } }; -/// Helper class for printing registers on a raw_ostream. /// Prints virtual and physical registers with or without a TRI instance. /// /// The format is: @@ -939,24 +943,10 @@ struct VirtReg2IndexFunctor : public std::unary_function { /// %physreg17 - a physical register when no TRI instance given. /// /// Usage: OS << PrintReg(Reg, TRI) << '\n'; -/// -class PrintReg { - const TargetRegisterInfo *TRI; - unsigned Reg; - unsigned SubIdx; -public: - explicit PrintReg(unsigned reg, const TargetRegisterInfo *tri = nullptr, - unsigned subidx = 0) - : TRI(tri), Reg(reg), SubIdx(subidx) {} - void print(raw_ostream&) const; -}; +Printable PrintReg(unsigned Reg, const TargetRegisterInfo *TRI = nullptr, + unsigned SubRegIdx = 0); -static inline raw_ostream &operator<<(raw_ostream &OS, const PrintReg &PR) { - PR.print(OS); - return OS; -} - -/// Helper class for printing register units on a raw_ostream. +/// Create Printable object to print register units on a \ref raw_ostream. /// /// Register units are named after their root registers: /// @@ -964,54 +954,14 @@ static inline raw_ostream &operator<<(raw_ostream &OS, const PrintReg &PR) { /// FP0~ST7 - Dual roots. /// /// Usage: OS << PrintRegUnit(Unit, TRI) << '\n'; -/// -class PrintRegUnit { -protected: - const TargetRegisterInfo *TRI; - unsigned Unit; -public: - PrintRegUnit(unsigned unit, const TargetRegisterInfo *tri) - : TRI(tri), Unit(unit) {} - void print(raw_ostream&) const; -}; - -static inline raw_ostream &operator<<(raw_ostream &OS, const PrintRegUnit &PR) { - PR.print(OS); - return OS; -} +Printable PrintRegUnit(unsigned Unit, const TargetRegisterInfo *TRI); -/// It is often convenient to track virtual registers and -/// physical register units in the same list. -class PrintVRegOrUnit : protected PrintRegUnit { -public: - PrintVRegOrUnit(unsigned VRegOrUnit, const TargetRegisterInfo *tri) - : PrintRegUnit(VRegOrUnit, tri) {} - void print(raw_ostream&) const; -}; - -static inline raw_ostream &operator<<(raw_ostream &OS, - const PrintVRegOrUnit &PR) { - PR.print(OS); - return OS; -} - -/// Helper class for printing lane masks. -/// -/// They are currently printed out as hexadecimal numbers. 
-/// Usage: OS << PrintLaneMask(Mask); -class PrintLaneMask { -protected: - LaneBitmask LaneMask; -public: - PrintLaneMask(LaneBitmask LaneMask) - : LaneMask(LaneMask) {} - void print(raw_ostream&) const; -}; +/// \brief Create Printable object to print virtual registers and physical +/// registers on a \ref raw_ostream. +Printable PrintVRegOrUnit(unsigned VRegOrUnit, const TargetRegisterInfo *TRI); -static inline raw_ostream &operator<<(raw_ostream &OS, const PrintLaneMask &P) { - P.print(OS); - return OS; -} +/// Create Printable object to print LaneBitmasks on a \ref raw_ostream. +Printable PrintLaneMask(LaneBitmask LaneMask); } // End llvm namespace diff --git a/include/llvm/Target/TargetSelectionDAGInfo.h b/include/llvm/Target/TargetSelectionDAGInfo.h index 53db5aa84292..a7143ac3fa66 100644 --- a/include/llvm/Target/TargetSelectionDAGInfo.h +++ b/include/llvm/Target/TargetSelectionDAGInfo.h @@ -21,7 +21,7 @@ namespace llvm { //===----------------------------------------------------------------------===// -/// TargetSelectionDAGInfo - Targets can subclass this to parameterize the +/// Targets can subclass this to parameterize the /// SelectionDAG lowering and instruction selection process. /// class TargetSelectionDAGInfo { @@ -32,8 +32,8 @@ class TargetSelectionDAGInfo { explicit TargetSelectionDAGInfo() = default; virtual ~TargetSelectionDAGInfo(); - /// EmitTargetCodeForMemcpy - Emit target-specific code that performs a - /// memcpy. This can be used by targets to provide code sequences for cases + /// Emit target-specific code that performs a memcpy. + /// This can be used by targets to provide code sequences for cases /// that don't fit the target's parameters for simple loads/stores and can be /// more efficient than using a library call. This function can return a null /// SDValue if the target declines to use custom code and a different @@ -56,8 +56,8 @@ class TargetSelectionDAGInfo { return SDValue(); } - /// EmitTargetCodeForMemmove - Emit target-specific code that performs a - /// memmove. This can be used by targets to provide code sequences for cases + /// Emit target-specific code that performs a memmove. + /// This can be used by targets to provide code sequences for cases /// that don't fit the target's parameters for simple loads/stores and can be /// more efficient than using a library call. This function can return a null /// SDValue if the target declines to use custom code and a different @@ -72,8 +72,8 @@ class TargetSelectionDAGInfo { return SDValue(); } - /// EmitTargetCodeForMemset - Emit target-specific code that performs a - /// memset. This can be used by targets to provide code sequences for cases + /// Emit target-specific code that performs a memset. + /// This can be used by targets to provide code sequences for cases /// that don't fit the target's parameters for simple stores and can be more /// efficient than using a library call. This function can return a null /// SDValue if the target declines to use custom code and a different @@ -87,11 +87,10 @@ class TargetSelectionDAGInfo { return SDValue(); } - /// EmitTargetCodeForMemcmp - Emit target-specific code that performs a - /// memcmp, in cases where that is faster than a libcall. The first - /// returned SDValue is the result of the memcmp and the second is - /// the chain. Both SDValues can be null if a normal libcall should - /// be used. + /// Emit target-specific code that performs a memcmp, in cases where that is + /// faster than a libcall. 
The first returned SDValue is the result of the + /// memcmp and the second is the chain. Both SDValues can be null if a normal + /// libcall should be used. virtual std::pair EmitTargetCodeForMemcmp(SelectionDAG &DAG, SDLoc dl, SDValue Chain, @@ -101,11 +100,10 @@ class TargetSelectionDAGInfo { return std::make_pair(SDValue(), SDValue()); } - /// EmitTargetCodeForMemchr - Emit target-specific code that performs a - /// memchr, in cases where that is faster than a libcall. The first - /// returned SDValue is the result of the memchr and the second is - /// the chain. Both SDValues can be null if a normal libcall should - /// be used. + /// Emit target-specific code that performs a memchr, in cases where that is + /// faster than a libcall. The first returned SDValue is the result of the + /// memchr and the second is the chain. Both SDValues can be null if a normal + /// libcall should be used. virtual std::pair EmitTargetCodeForMemchr(SelectionDAG &DAG, SDLoc dl, SDValue Chain, SDValue Src, SDValue Char, SDValue Length, @@ -113,8 +111,8 @@ class TargetSelectionDAGInfo { return std::make_pair(SDValue(), SDValue()); } - /// EmitTargetCodeForStrcpy - Emit target-specific code that performs a - /// strcpy or stpcpy, in cases where that is faster than a libcall. + /// Emit target-specific code that performs a strcpy or stpcpy, in cases + /// where that is faster than a libcall. /// The first returned SDValue is the result of the copy (the start /// of the destination string for strcpy, a pointer to the null terminator /// for stpcpy) and the second is the chain. Both SDValues can be null @@ -128,11 +126,10 @@ class TargetSelectionDAGInfo { return std::make_pair(SDValue(), SDValue()); } - /// EmitTargetCodeForStrcmp - Emit target-specific code that performs a - /// strcmp, in cases where that is faster than a libcall. The first - /// returned SDValue is the result of the strcmp and the second is - /// the chain. Both SDValues can be null if a normal libcall should - /// be used. + /// Emit target-specific code that performs a strcmp, in cases where that is + /// faster than a libcall. + /// The first returned SDValue is the result of the strcmp and the second is + /// the chain. Both SDValues can be null if a normal libcall should be used. virtual std::pair EmitTargetCodeForStrcmp(SelectionDAG &DAG, SDLoc dl, SDValue Chain, diff --git a/include/llvm/Transforms/IPO.h b/include/llvm/Transforms/IPO.h index 96ddc6eceed2..38b8ffdd9fc9 100644 --- a/include/llvm/Transforms/IPO.h +++ b/include/llvm/Transforms/IPO.h @@ -20,6 +20,7 @@ namespace llvm { +class FunctionInfoIndex; class ModulePass; class Pass; class Function; @@ -85,6 +86,10 @@ ModulePass *createEliminateAvailableExternallyPass(); ModulePass *createGVExtractionPass(std::vector& GVs, bool deleteFn = false); +//===----------------------------------------------------------------------===// +/// This pass performs iterative function importing from other modules. +Pass *createFunctionImportPass(const FunctionInfoIndex *Index = nullptr); + //===----------------------------------------------------------------------===// /// createFunctionInliningPass - Return a new pass object that uses a heuristic /// to inline direct function calls to small functions. 
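The createFunctionImportPass declaration above takes an optional FunctionInfoIndex pointer. A hypothetical usage sketch with the legacy pass manager (the helper name addFunctionImport and the Index argument are illustrative; building the summary index itself happens elsewhere):

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/Transforms/IPO.h"

// Schedule function importing in a module pass pipeline. A null index is
// permitted by the default argument above; the pass then has nothing to
// import from.
static void addFunctionImport(llvm::legacy::PassManager &PM,
                              const llvm::FunctionInfoIndex *Index) {
  PM.add(llvm::createFunctionImportPass(Index));
}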
diff --git a/include/llvm/Transforms/IPO/FunctionImport.h b/include/llvm/Transforms/IPO/FunctionImport.h index 0315c72811c1..e3da8a79c23e 100644 --- a/include/llvm/Transforms/IPO/FunctionImport.h +++ b/include/llvm/Transforms/IPO/FunctionImport.h @@ -18,23 +18,6 @@ class LLVMContext; class Module; class FunctionInfoIndex; -/// Helper to load on demand a Module from file and cache it for subsequent -/// queries. It can be used with the FunctionImporter. -class ModuleLazyLoaderCache { - /// The context that will be used for importing. - LLVMContext &Context; - - /// Cache of lazily loaded module for import. - StringMap> ModuleMap; - -public: - /// Create the loader, Module will be initialized in \p Context. - ModuleLazyLoaderCache(LLVMContext &Context) : Context(Context) {} - - /// Retrieve a Module from the cache or lazily load it on demand. - Module &operator()(StringRef FileName); -}; - /// The function importer automatically imports functions from other modules /// based on the provided summary information. class FunctionImporter { @@ -45,16 +28,17 @@ class FunctionImporter { /// Diagnostic will be sent to this handler. DiagnosticHandlerFunction DiagnosticHandler; - /// Retrieve a Module from the cache or lazily load it on demand. - std::function getLazyModule; + /// Factory function to load a Module for a given identifier + std::function<std::unique_ptr<Module>(StringRef Identifier)> ModuleLoader; public: /// Create a Function Importer. - FunctionImporter(const FunctionInfoIndex &Index, - DiagnosticHandlerFunction DiagnosticHandler, - std::function ModuleLoader) + FunctionImporter( + const FunctionInfoIndex &Index, + DiagnosticHandlerFunction DiagnosticHandler, + std::function<std::unique_ptr<Module>(StringRef Identifier)> ModuleLoader) : Index(Index), DiagnosticHandler(DiagnosticHandler), - getLazyModule(ModuleLoader) {} + ModuleLoader(ModuleLoader) {} /// Import functions in Module \p M based on the summary information. bool importFunctions(Module &M); diff --git a/include/llvm/Transforms/IPO/PassManagerBuilder.h b/include/llvm/Transforms/IPO/PassManagerBuilder.h index ef01fa350531..a4e7bce8ef4a 100644 --- a/include/llvm/Transforms/IPO/PassManagerBuilder.h +++ b/include/llvm/Transforms/IPO/PassManagerBuilder.h @@ -15,9 +15,11 @@ #ifndef LLVM_TRANSFORMS_IPO_PASSMANAGERBUILDER_H #define LLVM_TRANSFORMS_IPO_PASSMANAGERBUILDER_H +#include #include namespace llvm { +class FunctionInfoIndex; class Pass; class TargetLibraryInfoImpl; class TargetMachine; @@ -114,6 +116,9 @@ class PassManagerBuilder { /// added to the per-module passes. Pass *Inliner; + /// The function summary index to use for function importing. + const FunctionInfoIndex *FunctionIndex; + bool DisableTailCalls; bool DisableUnitAtATime; bool DisableUnrollLoops; diff --git a/include/llvm/Transforms/Instrumentation.h b/include/llvm/Transforms/Instrumentation.h index c8863481c34c..38dfeb04ace3 100644 --- a/include/llvm/Transforms/Instrumentation.h +++ b/include/llvm/Transforms/Instrumentation.h @@ -79,6 +79,11 @@ struct GCOVOptions { ModulePass *createGCOVProfilerPass(const GCOVOptions &Options = GCOVOptions::getDefault()); +// PGO Instrumentation +ModulePass *createPGOInstrumentationGenPass(); +ModulePass * +createPGOInstrumentationUsePass(StringRef Filename = StringRef("")); + /// Options for the frontend instrumentation based profiling pass. struct InstrProfOptions { InstrProfOptions() : NoRedZone(false) {} @@ -149,6 +154,24 @@ FunctionPass *createBoundsCheckingPass(); /// protect against stack-based overflow vulnerabilities.
FunctionPass *createSafeStackPass(const TargetMachine *TM = nullptr); +/// \brief Calculate what to divide by to scale counts. +/// +/// Given the maximum count, calculate a divisor that will scale all the +/// weights to strictly less than UINT32_MAX. +static inline uint64_t calculateCountScale(uint64_t MaxCount) { + return MaxCount < UINT32_MAX ? 1 : MaxCount / UINT32_MAX + 1; +} + +/// \brief Scale an individual branch count. +/// +/// Scale a 64-bit weight down to 32-bits using \c Scale. +/// +static inline uint32_t scaleBranchCount(uint64_t Count, uint64_t Scale) { + uint64_t Scaled = Count / Scale; + assert(Scaled <= UINT32_MAX && "overflow 32-bits"); + return Scaled; +} + } // End llvm namespace #endif diff --git a/include/llvm/Transforms/Utils/Cloning.h b/include/llvm/Transforms/Utils/Cloning.h index 2fccbb802964..d1a5fdeaddc4 100644 --- a/include/llvm/Transforms/Utils/Cloning.h +++ b/include/llvm/Transforms/Utils/Cloning.h @@ -48,16 +48,16 @@ class AllocaInst; class AssumptionCacheTracker; class DominatorTree; -/// CloneModule - Return an exact copy of the specified module +/// Return an exact copy of the specified module /// -Module *CloneModule(const Module *M); -Module *CloneModule(const Module *M, ValueToValueMapTy &VMap); +std::unique_ptr CloneModule(const Module *M); +std::unique_ptr CloneModule(const Module *M, ValueToValueMapTy &VMap); /// Return a copy of the specified module. The ShouldCloneDefinition function /// controls whether a specific GlobalValue's definition is cloned. If the /// function returns false, the module copy will contain an external reference /// in place of the global definition. -Module * +std::unique_ptr CloneModule(const Module *M, ValueToValueMapTy &VMap, std::function ShouldCloneDefinition); @@ -75,8 +75,9 @@ struct ClonedCodeInfo { bool ContainsDynamicAllocas; /// All cloned call sites that have operand bundles attached are appended to - /// this vector. - std::vector> OperandBundleCallSites; + /// this vector. This vector may contain nulls if some of the originally + /// inserted callsites were DCE'ed after they were cloned. + std::vector OperandBundleCallSites; ClonedCodeInfo() : ContainsCalls(false), ContainsDynamicAllocas(false) {} }; diff --git a/lib/Analysis/InlineCost.cpp b/lib/Analysis/InlineCost.cpp index 26f2e7ff504a..cebc8731d4d3 100644 --- a/lib/Analysis/InlineCost.cpp +++ b/lib/Analysis/InlineCost.cpp @@ -115,11 +115,11 @@ class CallAnalyzer : public InstVisitor { /// inlining has the given attribute set either at the call site or the /// function declaration. Primarily used to inspect call site specific /// attributes since these can be more precise than the ones on the callee - /// itself. + /// itself. bool paramHasAttr(Argument *A, Attribute::AttrKind Attr); /// Return true if the given value is known non null within the callee if - /// inlined through this particular callsite. + /// inlined through this particular callsite. bool isKnownNonNullInCallee(Value *V); // Custom analysis routines. @@ -834,8 +834,8 @@ bool CallAnalyzer::visitCallSite(CallSite CS) { CallAnalyzer CA(TTI, ACT, *F, InlineConstants::IndirectCallThreshold, CS); if (CA.analyzeCall(CS)) { // We were able to inline the indirect call! Subtract the cost from the - // bonus we want to apply, but don't go below zero. - Cost -= std::max(0, InlineConstants::IndirectCallThreshold - CA.getCost()); + // threshold to get the bonus we want to apply, but don't go below zero. 
+ Cost -= std::max(0, CA.getThreshold() - CA.getCost()); } return Base::visitCallSite(CS); diff --git a/lib/Analysis/LazyValueInfo.cpp b/lib/Analysis/LazyValueInfo.cpp index 1d50e98c0fdd..0d1d34e0cb4f 100644 --- a/lib/Analysis/LazyValueInfo.cpp +++ b/lib/Analysis/LazyValueInfo.cpp @@ -105,7 +105,12 @@ class LVILatticeVal { Res.markConstantRange(CR); return Res; } - + static LVILatticeVal getOverdefined() { + LVILatticeVal Res; + Res.markOverdefined(); + return Res; + } + bool isUndefined() const { return Tag == undefined; } bool isConstant() const { return Tag == constant; } bool isNotConstant() const { return Tag == notconstant; } @@ -316,6 +321,8 @@ namespace { /// This is all of the cached block information for exactly one Value*. /// The entries are sorted by the BasicBlock* of the /// entries, allowing us to do a lookup with a binary search. + /// Over-defined lattice values are recorded in OverDefinedCache to reduce + /// memory overhead. typedef SmallDenseMap, LVILatticeVal, 4> ValueCacheEntryTy; @@ -324,8 +331,7 @@ namespace { std::map ValueCache; /// This tracks, on a per-block basis, the set of values that are - /// over-defined at the end of that block. This is required - /// for cache updating. + /// over-defined at the end of that block. typedef DenseMap, SmallPtrSet> OverDefinedCacheTy; OverDefinedCacheTy OverDefinedCache; @@ -360,9 +366,13 @@ namespace { void insertResult(Value *Val, BasicBlock *BB, const LVILatticeVal &Result) { SeenBlocks.insert(BB); - lookup(Val)[BB] = Result; + + // Insert over-defined values into their own cache to reduce memory + // overhead. if (Result.isOverdefined()) OverDefinedCache[BB].insert(Val); + else + lookup(Val)[BB] = Result; } LVILatticeVal getBlockValue(Value *Val, BasicBlock *BB); @@ -390,6 +400,34 @@ namespace { return ValueCache[LVIValueHandle(V, this)]; } + bool isOverdefined(Value *V, BasicBlock *BB) const { + auto ODI = OverDefinedCache.find(BB); + + if (ODI == OverDefinedCache.end()) + return false; + + return ODI->second.count(V); + } + + bool hasCachedValueInfo(Value *V, BasicBlock *BB) { + if (isOverdefined(V, BB)) + return true; + + LVIValueHandle ValHandle(V, this); + auto I = ValueCache.find(ValHandle); + if (I == ValueCache.end()) + return false; + + return I->second.count(BB); + } + + LVILatticeVal getCachedValueInfo(Value *V, BasicBlock *BB) { + if (isOverdefined(V, BB)) + return LVILatticeVal::getOverdefined(); + + return lookup(V)[BB]; + } + public: /// This is the query interface to determine the lattice /// value for the specified Value* at the end of the specified block. @@ -467,7 +505,8 @@ void LazyValueInfoCache::solve() { if (solveBlockValue(e.second, e.first)) { // The work item was completely processed. 
assert(BlockValueStack.top() == e && "Nothing should have been pushed!"); - assert(lookup(e.second).count(e.first) && "Result should be in cache!"); + assert(hasCachedValueInfo(e.second, e.first) && + "Result should be in cache!"); BlockValueStack.pop(); BlockValueSet.erase(e); @@ -483,10 +522,7 @@ bool LazyValueInfoCache::hasBlockValue(Value *Val, BasicBlock *BB) { if (isa(Val)) return true; - LVIValueHandle ValHandle(Val, this); - auto I = ValueCache.find(ValHandle); - if (I == ValueCache.end()) return false; - return I->second.count(BB); + return hasCachedValueInfo(Val, BB); } LVILatticeVal LazyValueInfoCache::getBlockValue(Value *Val, BasicBlock *BB) { @@ -495,7 +531,7 @@ LVILatticeVal LazyValueInfoCache::getBlockValue(Value *Val, BasicBlock *BB) { return LVILatticeVal::get(VC); SeenBlocks.insert(BB); - return lookup(Val)[BB]; + return getCachedValueInfo(Val, BB); } static LVILatticeVal getFromRangeMetadata(Instruction *BBI) { @@ -521,10 +557,10 @@ bool LazyValueInfoCache::solveBlockValue(Value *Val, BasicBlock *BB) { if (isa(Val)) return true; - if (lookup(Val).count(BB)) { + if (hasCachedValueInfo(Val, BB)) { // If we have a cached value, use that. DEBUG(dbgs() << " reuse BB '" << BB->getName() - << "' val=" << lookup(Val)[BB] << '\n'); + << "' val=" << getCachedValueInfo(Val, BB) << '\n'); // Since we're reusing a cached value, we don't need to update the // OverDefinedCache. The cache will have been properly updated whenever the @@ -1106,12 +1142,6 @@ void LazyValueInfoCache::threadEdge(BasicBlock *PredBB, BasicBlock *OldSucc, if (!ValueSet.count(V)) continue; - // Remove it from the caches. - ValueCacheEntryTy &Entry = ValueCache[LVIValueHandle(V, this)]; - ValueCacheEntryTy::iterator CI = Entry.find(ToUpdate); - - assert(CI != Entry.end() && "Couldn't find entry to update?"); - Entry.erase(CI); ValueSet.erase(V); if (ValueSet.empty()) OverDefinedCache.erase(OI); diff --git a/lib/Analysis/LoopAccessAnalysis.cpp b/lib/Analysis/LoopAccessAnalysis.cpp index 49b28078c976..ce6a5ab5656d 100644 --- a/lib/Analysis/LoopAccessAnalysis.cpp +++ b/lib/Analysis/LoopAccessAnalysis.cpp @@ -87,11 +87,10 @@ Value *llvm::stripIntegerCast(Value *V) { return V; } -const SCEV *llvm::replaceSymbolicStrideSCEV(ScalarEvolution *SE, +const SCEV *llvm::replaceSymbolicStrideSCEV(PredicatedScalarEvolution &PSE, const ValueToValueMap &PtrToStride, - SCEVUnionPredicate &Preds, Value *Ptr, Value *OrigPtr) { - const SCEV *OrigSCEV = SE->getSCEV(Ptr); + const SCEV *OrigSCEV = PSE.getSCEV(Ptr); // If there is an entry in the map return the SCEV of the pointer with the // symbolic stride replaced by one. @@ -108,16 +107,17 @@ const SCEV *llvm::replaceSymbolicStrideSCEV(ScalarEvolution *SE, ValueToValueMap RewriteMap; RewriteMap[StrideVal] = One; + ScalarEvolution *SE = PSE.getSE(); const auto *U = cast(SE->getSCEV(StrideVal)); const auto *CT = static_cast(SE->getOne(StrideVal->getType())); - Preds.add(SE->getEqualPredicate(U, CT)); + PSE.addPredicate(*SE->getEqualPredicate(U, CT)); + auto *Expr = PSE.getSCEV(Ptr); - const SCEV *ByOne = SE->rewriteUsingPredicate(OrigSCEV, Preds); - DEBUG(dbgs() << "LAA: Replacing SCEV: " << *OrigSCEV << " by: " << *ByOne + DEBUG(dbgs() << "LAA: Replacing SCEV: " << *OrigSCEV << " by: " << *Expr << "\n"); - return ByOne; + return Expr; } // Otherwise, just return the SCEV of the original pointer. 
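For readers following the LoopAccessAnalysis rework above: these hunks replace the (ScalarEvolution*, SCEVUnionPredicate&) parameter pairs with a single PredicatedScalarEvolution object. A minimal usage sketch, not part of the patch, with SE, Strides, Ptr and TheLoop standing in for the analogous variables in LoopAccessInfo:

    PredicatedScalarEvolution PSE(*SE);                  // wraps an existing ScalarEvolution
    const SCEV *Sc = replaceSymbolicStrideSCEV(PSE, Strides, Ptr);
    int Stride = isStridedPtr(PSE, Ptr, TheLoop, Strides); // may record new predicates in PSE
    // Every assumption taken along the way (e.g. "symbolic stride == 1") is now
    // accumulated inside PSE instead of a caller-managed SCEVUnionPredicate:
    PSE.getUnionPredicate().print(dbgs(), 0);

The design point is that callers no longer thread a predicate set alongside every ScalarEvolution pointer; the predicated view and the assumptions it depends on travel together.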
@@ -127,11 +127,12 @@ const SCEV *llvm::replaceSymbolicStrideSCEV(ScalarEvolution *SE, void RuntimePointerChecking::insert(Loop *Lp, Value *Ptr, bool WritePtr, unsigned DepSetId, unsigned ASId, const ValueToValueMap &Strides, - SCEVUnionPredicate &Preds) { + PredicatedScalarEvolution &PSE) { // Get the stride replaced scev. - const SCEV *Sc = replaceSymbolicStrideSCEV(SE, Strides, Preds, Ptr); + const SCEV *Sc = replaceSymbolicStrideSCEV(PSE, Strides, Ptr); const SCEVAddRecExpr *AR = dyn_cast(Sc); assert(AR && "Invalid addrec expression"); + ScalarEvolution *SE = PSE.getSE(); const SCEV *Ex = SE->getBackedgeTakenCount(Lp); const SCEV *ScStart = AR->getStart(); @@ -289,7 +290,7 @@ void RuntimePointerChecking::groupChecks( // don't process them twice. SmallSet Seen; - // Go through all equivalence classes, get the the "pointer check groups" + // Go through all equivalence classes, get the "pointer check groups" // and add them to the overall solution. We use the order in which accesses // appear in 'Pointers' to enforce determinism. for (unsigned I = 0; I < Pointers.size(); ++I) { @@ -423,9 +424,10 @@ class AccessAnalysis { typedef SmallPtrSet MemAccessInfoSet; AccessAnalysis(const DataLayout &Dl, AliasAnalysis *AA, LoopInfo *LI, - MemoryDepChecker::DepCandidates &DA, SCEVUnionPredicate &Preds) + MemoryDepChecker::DepCandidates &DA, + PredicatedScalarEvolution &PSE) : DL(Dl), AST(*AA), LI(LI), DepCands(DA), IsRTCheckAnalysisNeeded(false), - Preds(Preds) {} + PSE(PSE) {} /// \brief Register a load and whether it is only read from. void addLoad(MemoryLocation &Loc, bool IsReadOnly) { @@ -512,16 +514,16 @@ class AccessAnalysis { bool IsRTCheckAnalysisNeeded; /// The SCEV predicate containing all the SCEV-related assumptions. - SCEVUnionPredicate &Preds; + PredicatedScalarEvolution &PSE; }; } // end anonymous namespace /// \brief Check whether a pointer can participate in a runtime bounds check. -static bool hasComputableBounds(ScalarEvolution *SE, +static bool hasComputableBounds(PredicatedScalarEvolution &PSE, const ValueToValueMap &Strides, Value *Ptr, - Loop *L, SCEVUnionPredicate &Preds) { - const SCEV *PtrScev = replaceSymbolicStrideSCEV(SE, Strides, Preds, Ptr); + Loop *L) { + const SCEV *PtrScev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr); const SCEVAddRecExpr *AR = dyn_cast(PtrScev); if (!AR) return false; @@ -564,11 +566,11 @@ bool AccessAnalysis::canCheckPtrAtRT(RuntimePointerChecking &RtCheck, else ++NumReadPtrChecks; - if (hasComputableBounds(SE, StridesMap, Ptr, TheLoop, Preds) && + if (hasComputableBounds(PSE, StridesMap, Ptr, TheLoop) && // When we run after a failing dependency check we have to make sure // we don't have wrapping pointers. (!ShouldCheckStride || - isStridedPtr(SE, Ptr, TheLoop, StridesMap, Preds) == 1)) { + isStridedPtr(PSE, Ptr, TheLoop, StridesMap) == 1)) { // The id of the dependence set. unsigned DepId; @@ -582,7 +584,7 @@ bool AccessAnalysis::canCheckPtrAtRT(RuntimePointerChecking &RtCheck, // Each access has its own dependence set. DepId = RunningDepId++; - RtCheck.insert(TheLoop, Ptr, IsWrite, DepId, ASId, StridesMap, Preds); + RtCheck.insert(TheLoop, Ptr, IsWrite, DepId, ASId, StridesMap, PSE); DEBUG(dbgs() << "LAA: Found a runtime check ptr:" << *Ptr << '\n'); } else { @@ -817,9 +819,8 @@ static bool isNoWrapAddRec(Value *Ptr, const SCEVAddRecExpr *AR, } /// \brief Check whether the access through \p Ptr has a constant stride. 
-int llvm::isStridedPtr(ScalarEvolution *SE, Value *Ptr, const Loop *Lp, - const ValueToValueMap &StridesMap, - SCEVUnionPredicate &Preds) { +int llvm::isStridedPtr(PredicatedScalarEvolution &PSE, Value *Ptr, + const Loop *Lp, const ValueToValueMap &StridesMap) { Type *Ty = Ptr->getType(); assert(Ty->isPointerTy() && "Unexpected non-ptr"); @@ -831,7 +832,7 @@ int llvm::isStridedPtr(ScalarEvolution *SE, Value *Ptr, const Loop *Lp, return 0; } - const SCEV *PtrScev = replaceSymbolicStrideSCEV(SE, StridesMap, Preds, Ptr); + const SCEV *PtrScev = replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr); const SCEVAddRecExpr *AR = dyn_cast(PtrScev); if (!AR) { @@ -854,16 +855,16 @@ int llvm::isStridedPtr(ScalarEvolution *SE, Value *Ptr, const Loop *Lp, // to access the pointer value "0" which is undefined behavior in address // space 0, therefore we can also vectorize this case. bool IsInBoundsGEP = isInBoundsGep(Ptr); - bool IsNoWrapAddRec = isNoWrapAddRec(Ptr, AR, SE, Lp); + bool IsNoWrapAddRec = isNoWrapAddRec(Ptr, AR, PSE.getSE(), Lp); bool IsInAddressSpaceZero = PtrTy->getAddressSpace() == 0; if (!IsNoWrapAddRec && !IsInBoundsGEP && !IsInAddressSpaceZero) { DEBUG(dbgs() << "LAA: Bad stride - Pointer may wrap in the address space " - << *Ptr << " SCEV: " << *PtrScev << "\n"); + << *Ptr << " SCEV: " << *PtrScev << "\n"); return 0; } // Check the step is constant. - const SCEV *Step = AR->getStepRecurrence(*SE); + const SCEV *Step = AR->getStepRecurrence(*PSE.getSE()); // Calculate the pointer stride and check if it is constant. const SCEVConstant *C = dyn_cast(Step); @@ -1046,11 +1047,11 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, BPtr->getType()->getPointerAddressSpace()) return Dependence::Unknown; - const SCEV *AScev = replaceSymbolicStrideSCEV(SE, Strides, Preds, APtr); - const SCEV *BScev = replaceSymbolicStrideSCEV(SE, Strides, Preds, BPtr); + const SCEV *AScev = replaceSymbolicStrideSCEV(PSE, Strides, APtr); + const SCEV *BScev = replaceSymbolicStrideSCEV(PSE, Strides, BPtr); - int StrideAPtr = isStridedPtr(SE, APtr, InnermostLoop, Strides, Preds); - int StrideBPtr = isStridedPtr(SE, BPtr, InnermostLoop, Strides, Preds); + int StrideAPtr = isStridedPtr(PSE, APtr, InnermostLoop, Strides); + int StrideBPtr = isStridedPtr(PSE, BPtr, InnermostLoop, Strides); const SCEV *Src = AScev; const SCEV *Sink = BScev; @@ -1067,12 +1068,12 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, std::swap(StrideAPtr, StrideBPtr); } - const SCEV *Dist = SE->getMinusSCEV(Sink, Src); + const SCEV *Dist = PSE.getSE()->getMinusSCEV(Sink, Src); DEBUG(dbgs() << "LAA: Src Scev: " << *Src << "Sink Scev: " << *Sink - << "(Induction step: " << StrideAPtr << ")\n"); + << "(Induction step: " << StrideAPtr << ")\n"); DEBUG(dbgs() << "LAA: Distance for " << *InstMap[AIdx] << " to " - << *InstMap[BIdx] << ": " << *Dist << "\n"); + << *InstMap[BIdx] << ": " << *Dist << "\n"); // Need accesses with constant stride. We don't want to vectorize // "A[B[i]] += ..." and similar code or pointer arithmetic that could wrap in @@ -1343,10 +1344,10 @@ bool LoopAccessInfo::canAnalyzeLoop() { } // ScalarEvolution needs to be able to find the exit count. 
- const SCEV *ExitCount = SE->getBackedgeTakenCount(TheLoop); - if (ExitCount == SE->getCouldNotCompute()) { - emitAnalysis(LoopAccessReport() << - "could not determine number of loop iterations"); + const SCEV *ExitCount = PSE.getSE()->getBackedgeTakenCount(TheLoop); + if (ExitCount == PSE.getSE()->getCouldNotCompute()) { + emitAnalysis(LoopAccessReport() + << "could not determine number of loop iterations"); DEBUG(dbgs() << "LAA: SCEV could not compute the loop exit count.\n"); return false; } @@ -1447,7 +1448,7 @@ void LoopAccessInfo::analyzeLoop(const ValueToValueMap &Strides) { MemoryDepChecker::DepCandidates DependentAccesses; AccessAnalysis Accesses(TheLoop->getHeader()->getModule()->getDataLayout(), - AA, LI, DependentAccesses, Preds); + AA, LI, DependentAccesses, PSE); // Holds the analyzed pointers. We don't want to call GetUnderlyingObjects // multiple times on the same object. If the ptr is accessed twice, once @@ -1498,8 +1499,7 @@ void LoopAccessInfo::analyzeLoop(const ValueToValueMap &Strides) { // read a few words, modify, and write a few words, and some of the // words may be written to the same address. bool IsReadOnlyPtr = false; - if (Seen.insert(Ptr).second || - !isStridedPtr(SE, Ptr, TheLoop, Strides, Preds)) { + if (Seen.insert(Ptr).second || !isStridedPtr(PSE, Ptr, TheLoop, Strides)) { ++NumReads; IsReadOnlyPtr = true; } @@ -1529,7 +1529,7 @@ void LoopAccessInfo::analyzeLoop(const ValueToValueMap &Strides) { // Find pointers with computable bounds. We are going to use this information // to place a runtime bound check. bool CanDoRTIfNeeded = - Accesses.canCheckPtrAtRT(PtrRtChecking, SE, TheLoop, Strides); + Accesses.canCheckPtrAtRT(PtrRtChecking, PSE.getSE(), TheLoop, Strides); if (!CanDoRTIfNeeded) { emitAnalysis(LoopAccessReport() << "cannot identify array bounds"); DEBUG(dbgs() << "LAA: We can't vectorize because we can't find " @@ -1556,6 +1556,7 @@ void LoopAccessInfo::analyzeLoop(const ValueToValueMap &Strides) { PtrRtChecking.reset(); PtrRtChecking.Need = true; + auto *SE = PSE.getSE(); CanDoRTIfNeeded = Accesses.canCheckPtrAtRT(PtrRtChecking, SE, TheLoop, Strides, true); @@ -1598,7 +1599,7 @@ void LoopAccessInfo::emitAnalysis(LoopAccessReport &Message) { } bool LoopAccessInfo::isUniform(Value *V) const { - return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop)); + return (PSE.getSE()->isLoopInvariant(PSE.getSE()->getSCEV(V), TheLoop)); } // FIXME: this function is currently a duplicate of the one in @@ -1679,7 +1680,7 @@ std::pair LoopAccessInfo::addRuntimeChecks( Instruction *Loc, const SmallVectorImpl &PointerChecks) const { - + auto *SE = PSE.getSE(); SCEVExpander Exp(*SE, DL, "induction"); auto ExpandedChecks = expandBounds(PointerChecks, TheLoop, Loc, SE, Exp, PtrRtChecking); @@ -1749,7 +1750,7 @@ LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE, const TargetLibraryInfo *TLI, AliasAnalysis *AA, DominatorTree *DT, LoopInfo *LI, const ValueToValueMap &Strides) - : PtrRtChecking(SE), DepChecker(SE, L, Preds), TheLoop(L), SE(SE), DL(DL), + : PSE(*SE), PtrRtChecking(SE), DepChecker(PSE, L), TheLoop(L), DL(DL), TLI(TLI), AA(AA), DT(DT), LI(LI), NumLoads(0), NumStores(0), MaxSafeDepDistBytes(-1U), CanVecMem(false), StoreToLoopInvariantAddress(false) { @@ -1786,7 +1787,7 @@ void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const { << "found in loop.\n"; OS.indent(Depth) << "SCEV assumptions:\n"; - Preds.print(OS, Depth); + PSE.getUnionPredicate().print(OS, Depth); } const LoopAccessInfo & diff --git a/lib/Analysis/LoopInfo.cpp 
b/lib/Analysis/LoopInfo.cpp index e679b7ad7b86..67a82b192e56 100644 --- a/lib/Analysis/LoopInfo.cpp +++ b/lib/Analysis/LoopInfo.cpp @@ -200,6 +200,15 @@ bool Loop::isLCSSAForm(DominatorTree &DT) const { return true; } +bool Loop::isRecursivelyLCSSAForm(DominatorTree &DT) const { + if (!isLCSSAForm(DT)) + return false; + + return std::all_of(begin(), end(), [&](const Loop *L) { + return L->isRecursivelyLCSSAForm(DT); + }); +} + /// isLoopSimplifyForm - Return true if the Loop is in the form that /// the LoopSimplify form transforms loops to, which is sometimes called /// normal form. diff --git a/lib/Analysis/MemoryBuiltins.cpp b/lib/Analysis/MemoryBuiltins.cpp index b4dce4941538..c64be771f1f0 100644 --- a/lib/Analysis/MemoryBuiltins.cpp +++ b/lib/Analysis/MemoryBuiltins.cpp @@ -62,6 +62,14 @@ static const AllocFnsTy AllocationFnData[] = { {LibFunc::ZnajRKSt9nothrow_t, MallocLike, 2, 0, -1}, // new[](unsigned int, nothrow) {LibFunc::Znam, OpNewLike, 1, 0, -1}, // new[](unsigned long) {LibFunc::ZnamRKSt9nothrow_t, MallocLike, 2, 0, -1}, // new[](unsigned long, nothrow) + {LibFunc::msvc_new_int, OpNewLike, 1, 0, -1}, // new(unsigned int) + {LibFunc::msvc_new_int_nothrow, MallocLike, 2, 0, -1}, // new(unsigned int, nothrow) + {LibFunc::msvc_new_longlong, OpNewLike, 1, 0, -1}, // new(unsigned long long) + {LibFunc::msvc_new_longlong_nothrow, MallocLike, 2, 0, -1}, // new(unsigned long long, nothrow) + {LibFunc::msvc_new_array_int, OpNewLike, 1, 0, -1}, // new[](unsigned int) + {LibFunc::msvc_new_array_int_nothrow, MallocLike, 2, 0, -1}, // new[](unsigned int, nothrow) + {LibFunc::msvc_new_array_longlong, OpNewLike, 1, 0, -1}, // new[](unsigned long long) + {LibFunc::msvc_new_array_longlong_nothrow, MallocLike, 2, 0, -1}, // new[](unsigned long long, nothrow) {LibFunc::calloc, CallocLike, 2, 0, 1}, {LibFunc::realloc, ReallocLike, 2, 1, -1}, {LibFunc::reallocf, ReallocLike, 2, 1, -1}, @@ -308,14 +316,26 @@ const CallInst *llvm::isFreeCall(const Value *I, const TargetLibraryInfo *TLI) { unsigned ExpectedNumParams; if (TLIFn == LibFunc::free || TLIFn == LibFunc::ZdlPv || // operator delete(void*) - TLIFn == LibFunc::ZdaPv) // operator delete[](void*) + TLIFn == LibFunc::ZdaPv || // operator delete[](void*) + TLIFn == LibFunc::msvc_delete_ptr32 || // operator delete(void*) + TLIFn == LibFunc::msvc_delete_ptr64 || // operator delete(void*) + TLIFn == LibFunc::msvc_delete_array_ptr32 || // operator delete[](void*) + TLIFn == LibFunc::msvc_delete_array_ptr64) // operator delete[](void*) ExpectedNumParams = 1; else if (TLIFn == LibFunc::ZdlPvj || // delete(void*, uint) TLIFn == LibFunc::ZdlPvm || // delete(void*, ulong) TLIFn == LibFunc::ZdlPvRKSt9nothrow_t || // delete(void*, nothrow) TLIFn == LibFunc::ZdaPvj || // delete[](void*, uint) TLIFn == LibFunc::ZdaPvm || // delete[](void*, ulong) - TLIFn == LibFunc::ZdaPvRKSt9nothrow_t) // delete[](void*, nothrow) + TLIFn == LibFunc::ZdaPvRKSt9nothrow_t || // delete[](void*, nothrow) + TLIFn == LibFunc::msvc_delete_ptr32_int || // delete(void*, uint) + TLIFn == LibFunc::msvc_delete_ptr64_longlong || // delete(void*, ulonglong) + TLIFn == LibFunc::msvc_delete_ptr32_nothrow || // delete(void*, nothrow) + TLIFn == LibFunc::msvc_delete_ptr64_nothrow || // delete(void*, nothrow) + TLIFn == LibFunc::msvc_delete_array_ptr32_int || // delete[](void*, uint) + TLIFn == LibFunc::msvc_delete_array_ptr64_longlong || // delete[](void*, ulonglong) + TLIFn == LibFunc::msvc_delete_array_ptr32_nothrow || // delete[](void*, nothrow) + TLIFn == 
LibFunc::msvc_delete_array_ptr64_nothrow) // delete[](void*, nothrow) ExpectedNumParams = 2; else return nullptr; diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp index 9a0570d47f02..1c2fb3d1ed02 100644 --- a/lib/Analysis/ScalarEvolution.cpp +++ b/lib/Analysis/ScalarEvolution.cpp @@ -446,179 +446,179 @@ bool SCEVUnknown::isOffsetOf(Type *&CTy, Constant *&FieldNo) const { //===----------------------------------------------------------------------===// namespace { - /// SCEVComplexityCompare - Return true if the complexity of the LHS is less - /// than the complexity of the RHS. This comparator is used to canonicalize - /// expressions. - class SCEVComplexityCompare { - const LoopInfo *const LI; - public: - explicit SCEVComplexityCompare(const LoopInfo *li) : LI(li) {} - - // Return true or false if LHS is less than, or at least RHS, respectively. - bool operator()(const SCEV *LHS, const SCEV *RHS) const { - return compare(LHS, RHS) < 0; - } - - // Return negative, zero, or positive, if LHS is less than, equal to, or - // greater than RHS, respectively. A three-way result allows recursive - // comparisons to be more efficient. - int compare(const SCEV *LHS, const SCEV *RHS) const { - // Fast-path: SCEVs are uniqued so we can do a quick equality check. - if (LHS == RHS) - return 0; - - // Primarily, sort the SCEVs by their getSCEVType(). - unsigned LType = LHS->getSCEVType(), RType = RHS->getSCEVType(); - if (LType != RType) - return (int)LType - (int)RType; - - // Aside from the getSCEVType() ordering, the particular ordering - // isn't very important except that it's beneficial to be consistent, - // so that (a + b) and (b + a) don't end up as different expressions. - switch (static_cast(LType)) { - case scUnknown: { - const SCEVUnknown *LU = cast(LHS); - const SCEVUnknown *RU = cast(RHS); - - // Sort SCEVUnknown values with some loose heuristics. TODO: This is - // not as complete as it could be. - const Value *LV = LU->getValue(), *RV = RU->getValue(); - - // Order pointer values after integer values. This helps SCEVExpander - // form GEPs. - bool LIsPointer = LV->getType()->isPointerTy(), - RIsPointer = RV->getType()->isPointerTy(); - if (LIsPointer != RIsPointer) - return (int)LIsPointer - (int)RIsPointer; - - // Compare getValueID values. - unsigned LID = LV->getValueID(), - RID = RV->getValueID(); - if (LID != RID) - return (int)LID - (int)RID; - - // Sort arguments by their position. - if (const Argument *LA = dyn_cast(LV)) { - const Argument *RA = cast(RV); - unsigned LArgNo = LA->getArgNo(), RArgNo = RA->getArgNo(); - return (int)LArgNo - (int)RArgNo; - } - - // For instructions, compare their loop depth, and their operand - // count. This is pretty loose. - if (const Instruction *LInst = dyn_cast(LV)) { - const Instruction *RInst = cast(RV); - - // Compare loop depths. - const BasicBlock *LParent = LInst->getParent(), - *RParent = RInst->getParent(); - if (LParent != RParent) { - unsigned LDepth = LI->getLoopDepth(LParent), - RDepth = LI->getLoopDepth(RParent); - if (LDepth != RDepth) - return (int)LDepth - (int)RDepth; - } - - // Compare the number of operands. - unsigned LNumOps = LInst->getNumOperands(), - RNumOps = RInst->getNumOperands(); - return (int)LNumOps - (int)RNumOps; - } +/// SCEVComplexityCompare - Return true if the complexity of the LHS is less +/// than the complexity of the RHS. This comparator is used to canonicalize +/// expressions. 
+class SCEVComplexityCompare { + const LoopInfo *const LI; +public: + explicit SCEVComplexityCompare(const LoopInfo *li) : LI(li) {} - return 0; - } + // Return true or false if LHS is less than, or at least RHS, respectively. + bool operator()(const SCEV *LHS, const SCEV *RHS) const { + return compare(LHS, RHS) < 0; + } - case scConstant: { - const SCEVConstant *LC = cast(LHS); - const SCEVConstant *RC = cast(RHS); - - // Compare constant values. - const APInt &LA = LC->getValue()->getValue(); - const APInt &RA = RC->getValue()->getValue(); - unsigned LBitWidth = LA.getBitWidth(), RBitWidth = RA.getBitWidth(); - if (LBitWidth != RBitWidth) - return (int)LBitWidth - (int)RBitWidth; - return LA.ult(RA) ? -1 : 1; + // Return negative, zero, or positive, if LHS is less than, equal to, or + // greater than RHS, respectively. A three-way result allows recursive + // comparisons to be more efficient. + int compare(const SCEV *LHS, const SCEV *RHS) const { + // Fast-path: SCEVs are uniqued so we can do a quick equality check. + if (LHS == RHS) + return 0; + + // Primarily, sort the SCEVs by their getSCEVType(). + unsigned LType = LHS->getSCEVType(), RType = RHS->getSCEVType(); + if (LType != RType) + return (int)LType - (int)RType; + + // Aside from the getSCEVType() ordering, the particular ordering + // isn't very important except that it's beneficial to be consistent, + // so that (a + b) and (b + a) don't end up as different expressions. + switch (static_cast(LType)) { + case scUnknown: { + const SCEVUnknown *LU = cast(LHS); + const SCEVUnknown *RU = cast(RHS); + + // Sort SCEVUnknown values with some loose heuristics. TODO: This is + // not as complete as it could be. + const Value *LV = LU->getValue(), *RV = RU->getValue(); + + // Order pointer values after integer values. This helps SCEVExpander + // form GEPs. + bool LIsPointer = LV->getType()->isPointerTy(), + RIsPointer = RV->getType()->isPointerTy(); + if (LIsPointer != RIsPointer) + return (int)LIsPointer - (int)RIsPointer; + + // Compare getValueID values. + unsigned LID = LV->getValueID(), + RID = RV->getValueID(); + if (LID != RID) + return (int)LID - (int)RID; + + // Sort arguments by their position. + if (const Argument *LA = dyn_cast(LV)) { + const Argument *RA = cast(RV); + unsigned LArgNo = LA->getArgNo(), RArgNo = RA->getArgNo(); + return (int)LArgNo - (int)RArgNo; } - case scAddRecExpr: { - const SCEVAddRecExpr *LA = cast(LHS); - const SCEVAddRecExpr *RA = cast(RHS); - - // Compare addrec loop depths. - const Loop *LLoop = LA->getLoop(), *RLoop = RA->getLoop(); - if (LLoop != RLoop) { - unsigned LDepth = LLoop->getLoopDepth(), - RDepth = RLoop->getLoopDepth(); + // For instructions, compare their loop depth, and their operand + // count. This is pretty loose. + if (const Instruction *LInst = dyn_cast(LV)) { + const Instruction *RInst = cast(RV); + + // Compare loop depths. + const BasicBlock *LParent = LInst->getParent(), + *RParent = RInst->getParent(); + if (LParent != RParent) { + unsigned LDepth = LI->getLoopDepth(LParent), + RDepth = LI->getLoopDepth(RParent); if (LDepth != RDepth) return (int)LDepth - (int)RDepth; } - // Addrec complexity grows with operand count. - unsigned LNumOps = LA->getNumOperands(), RNumOps = RA->getNumOperands(); - if (LNumOps != RNumOps) - return (int)LNumOps - (int)RNumOps; + // Compare the number of operands. + unsigned LNumOps = LInst->getNumOperands(), + RNumOps = RInst->getNumOperands(); + return (int)LNumOps - (int)RNumOps; + } + + return 0; + } - // Lexicographically compare. 
- for (unsigned i = 0; i != LNumOps; ++i) { - long X = compare(LA->getOperand(i), RA->getOperand(i)); - if (X != 0) - return X; - } + case scConstant: { + const SCEVConstant *LC = cast(LHS); + const SCEVConstant *RC = cast(RHS); - return 0; + // Compare constant values. + const APInt &LA = LC->getValue()->getValue(); + const APInt &RA = RC->getValue()->getValue(); + unsigned LBitWidth = LA.getBitWidth(), RBitWidth = RA.getBitWidth(); + if (LBitWidth != RBitWidth) + return (int)LBitWidth - (int)RBitWidth; + return LA.ult(RA) ? -1 : 1; + } + + case scAddRecExpr: { + const SCEVAddRecExpr *LA = cast(LHS); + const SCEVAddRecExpr *RA = cast(RHS); + + // Compare addrec loop depths. + const Loop *LLoop = LA->getLoop(), *RLoop = RA->getLoop(); + if (LLoop != RLoop) { + unsigned LDepth = LLoop->getLoopDepth(), + RDepth = RLoop->getLoopDepth(); + if (LDepth != RDepth) + return (int)LDepth - (int)RDepth; } - case scAddExpr: - case scMulExpr: - case scSMaxExpr: - case scUMaxExpr: { - const SCEVNAryExpr *LC = cast(LHS); - const SCEVNAryExpr *RC = cast(RHS); - - // Lexicographically compare n-ary expressions. - unsigned LNumOps = LC->getNumOperands(), RNumOps = RC->getNumOperands(); - if (LNumOps != RNumOps) - return (int)LNumOps - (int)RNumOps; - - for (unsigned i = 0; i != LNumOps; ++i) { - if (i >= RNumOps) - return 1; - long X = compare(LC->getOperand(i), RC->getOperand(i)); - if (X != 0) - return X; - } + // Addrec complexity grows with operand count. + unsigned LNumOps = LA->getNumOperands(), RNumOps = RA->getNumOperands(); + if (LNumOps != RNumOps) return (int)LNumOps - (int)RNumOps; + + // Lexicographically compare. + for (unsigned i = 0; i != LNumOps; ++i) { + long X = compare(LA->getOperand(i), RA->getOperand(i)); + if (X != 0) + return X; } - case scUDivExpr: { - const SCEVUDivExpr *LC = cast(LHS); - const SCEVUDivExpr *RC = cast(RHS); + return 0; + } + + case scAddExpr: + case scMulExpr: + case scSMaxExpr: + case scUMaxExpr: { + const SCEVNAryExpr *LC = cast(LHS); + const SCEVNAryExpr *RC = cast(RHS); + + // Lexicographically compare n-ary expressions. + unsigned LNumOps = LC->getNumOperands(), RNumOps = RC->getNumOperands(); + if (LNumOps != RNumOps) + return (int)LNumOps - (int)RNumOps; - // Lexicographically compare udiv expressions. - long X = compare(LC->getLHS(), RC->getLHS()); + for (unsigned i = 0; i != LNumOps; ++i) { + if (i >= RNumOps) + return 1; + long X = compare(LC->getOperand(i), RC->getOperand(i)); if (X != 0) return X; - return compare(LC->getRHS(), RC->getRHS()); } + return (int)LNumOps - (int)RNumOps; + } - case scTruncate: - case scZeroExtend: - case scSignExtend: { - const SCEVCastExpr *LC = cast(LHS); - const SCEVCastExpr *RC = cast(RHS); + case scUDivExpr: { + const SCEVUDivExpr *LC = cast(LHS); + const SCEVUDivExpr *RC = cast(RHS); - // Compare cast expressions by operand. - return compare(LC->getOperand(), RC->getOperand()); - } + // Lexicographically compare udiv expressions. + long X = compare(LC->getLHS(), RC->getLHS()); + if (X != 0) + return X; + return compare(LC->getRHS(), RC->getRHS()); + } - case scCouldNotCompute: - llvm_unreachable("Attempt to use a SCEVCouldNotCompute object!"); - } - llvm_unreachable("Unknown SCEV kind!"); + case scTruncate: + case scZeroExtend: + case scSignExtend: { + const SCEVCastExpr *LC = cast(LHS); + const SCEVCastExpr *RC = cast(RHS); + + // Compare cast expressions by operand. 
+ return compare(LC->getOperand(), RC->getOperand()); } - }; -} + + case scCouldNotCompute: + llvm_unreachable("Attempt to use a SCEVCouldNotCompute object!"); + } + llvm_unreachable("Unknown SCEV kind!"); + } +}; +} // end anonymous namespace /// GroupByComplexity - Given a list of SCEV objects, order them by their /// complexity, and group objects of the same complexity together by value. @@ -666,24 +666,22 @@ static void GroupByComplexity(SmallVectorImpl &Ops, } } -namespace { -struct FindSCEVSize { - int Size; - FindSCEVSize() : Size(0) {} - - bool follow(const SCEV *S) { - ++Size; - // Keep looking at all operands of S. - return true; - } - bool isDone() const { - return false; - } -}; -} - // Returns the size of the SCEV S. static inline int sizeOfSCEV(const SCEV *S) { + struct FindSCEVSize { + int Size; + FindSCEVSize() : Size(0) {} + + bool follow(const SCEV *S) { + ++Size; + // Keep looking at all operands of S. + return true; + } + bool isDone() const { + return false; + } + }; + FindSCEVSize F; SCEVTraversal ST(F); ST.visitAll(S); @@ -1929,14 +1927,6 @@ CollectAddOperandsWithScales(DenseMap &M, return Interesting; } -namespace { - struct APIntCompare { - bool operator()(const APInt &LHS, const APInt &RHS) const { - return LHS.ult(RHS); - } - }; -} - // We're trying to construct a SCEV of type `Type' with `Ops' as operands and // `OldFlags' as can't-wrap behavior. Infer a more aggressive set of // can't-overflow flags for the operation if possible. @@ -2149,6 +2139,12 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl &Ops, if (CollectAddOperandsWithScales(M, NewOps, AccumulatedConstant, Ops.data(), Ops.size(), APInt(BitWidth, 1), *this)) { + struct APIntCompare { + bool operator()(const APInt &LHS, const APInt &RHS) const { + return LHS.ult(RHS); + } + }; + // Some interesting folding opportunity is present, so its worthwhile to // re-generate the operands list. Group the operands by constant scale, // to avoid multiplying by the same constant scale multiple times. @@ -2456,9 +2452,8 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl &Ops, if (const SCEVAddExpr *Add = dyn_cast(Ops[1])) { SmallVector NewOps; bool AnyFolded = false; - for (SCEVAddRecExpr::op_iterator I = Add->op_begin(), - E = Add->op_end(); I != E; ++I) { - const SCEV *Mul = getMulExpr(Ops[0], *I); + for (const SCEV *AddOp : Add->operands()) { + const SCEV *Mul = getMulExpr(Ops[0], AddOp); if (!isa(Mul)) AnyFolded = true; NewOps.push_back(Mul); } @@ -2467,10 +2462,9 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl &Ops, } else if (const auto *AddRec = dyn_cast(Ops[1])) { // Negation preserves a recurrence's no self-wrap property. SmallVector Operands; - for (SCEVAddRecExpr::op_iterator I = AddRec->op_begin(), - E = AddRec->op_end(); I != E; ++I) { - Operands.push_back(getMulExpr(Ops[0], *I)); - } + for (const SCEV *AddRecOp : AddRec->operands()) + Operands.push_back(getMulExpr(Ops[0], AddRecOp)); + return getAddRecExpr(Operands, AddRec->getLoop(), AddRec->getNoWrapFlags(SCEV::FlagNW)); } @@ -3289,7 +3283,8 @@ const SCEV *ScalarEvolution::getCouldNotCompute() { return CouldNotCompute.get(); } -namespace { + +bool ScalarEvolution::checkValidity(const SCEV *S) const { // Helper class working with SCEVTraversal to figure out if a SCEV contains // a SCEVUnknown with null value-pointer. FindInvalidSCEVUnknown::FindOne // is set iff if find such SCEVUnknown. 
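The ScalarEvolution.cpp hunks above and below repeat one refactoring: single-use SCEVTraversal visitors (FindSCEVSize, FindInvalidSCEVUnknown, FindParameter, SCEVSearch) move from file-level anonymous namespaces into the one function that uses them. A hedged sketch of that pattern follows; ContainsAddRec is an invented example, not part of the patch:

    static bool containsAddRec(const SCEV *S) {
      // Visitor for SCEVTraversal: follow() decides whether to keep walking,
      // isDone() lets the traversal stop early.
      struct ContainsAddRec {
        bool Found;
        ContainsAddRec() : Found(false) {}

        bool follow(const SCEV *X) {
          Found |= isa<SCEVAddRecExpr>(X);
          return !Found; // stop descending once a match is found
        }
        bool isDone() const { return Found; }
      };

      ContainsAddRec F;
      SCEVTraversal<ContainsAddRec> ST(F);
      ST.visitAll(S);
      return F.Found;
    }

Keeping the visitor local makes its lifetime and its single call site obvious, which appears to be the motivation for these otherwise mechanical moves.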
@@ -3311,9 +3306,7 @@ namespace { } bool isDone() const { return FindOne; } }; -} -bool ScalarEvolution::checkValidity(const SCEV *S) const { FindInvalidSCEVUnknown F; SCEVTraversal ST(F); ST.visitAll(S); @@ -3555,13 +3548,12 @@ const SCEV *ScalarEvolution::getPointerBase(const SCEV *V) { return getPointerBase(Cast->getOperand()); } else if (const SCEVNAryExpr *NAry = dyn_cast(V)) { const SCEV *PtrOp = nullptr; - for (SCEVNAryExpr::op_iterator I = NAry->op_begin(), E = NAry->op_end(); - I != E; ++I) { - if ((*I)->getType()->isPointerTy()) { + for (const SCEV *NAryOp : NAry->operands()) { + if (NAryOp->getType()->isPointerTy()) { // Cannot find the base of an expression with multiple pointer operands. if (PtrOp) return V; - PtrOp = *I; + PtrOp = NAryOp; } } if (!PtrOp) @@ -5832,12 +5824,10 @@ getConstantEvolvingPHIOperands(Instruction *UseInst, const Loop *L, // Otherwise, we can evaluate this instruction if all of its operands are // constant or derived from a PHI node themselves. PHINode *PHI = nullptr; - for (Instruction::op_iterator OpI = UseInst->op_begin(), - OpE = UseInst->op_end(); OpI != OpE; ++OpI) { - - if (isa(*OpI)) continue; + for (Value *Op : UseInst->operands()) { + if (isa(Op)) continue; - Instruction *OpInst = dyn_cast(*OpI); + Instruction *OpInst = dyn_cast(Op); if (!OpInst || !canConstantEvolve(OpInst, L)) return nullptr; PHINode *P = dyn_cast(OpInst); @@ -8721,30 +8711,28 @@ static bool findArrayDimensionsRec(ScalarEvolution &SE, return true; } -namespace { -struct FindParameter { - bool FoundParameter; - FindParameter() : FoundParameter(false) {} - - bool follow(const SCEV *S) { - if (isa(S)) { - FoundParameter = true; - // Stop recursion: we found a parameter. - return false; - } - // Keep looking. - return true; - } - bool isDone() const { - // Stop recursion if we have found a parameter. - return FoundParameter; - } -}; -} - // Returns true when S contains at least a SCEVUnknown parameter. static inline bool containsParameters(const SCEV *S) { + struct FindParameter { + bool FoundParameter; + FindParameter() : FoundParameter(false) {} + + bool follow(const SCEV *S) { + if (isa(S)) { + FoundParameter = true; + // Stop recursion: we found a parameter. + return false; + } + // Keep looking. + return true; + } + bool isDone() const { + // Stop recursion if we have found a parameter. + return FoundParameter; + } + }; + FindParameter F; SCEVTraversal ST(F); ST.visitAll(S); @@ -9360,9 +9348,8 @@ ScalarEvolution::computeBlockDisposition(const SCEV *S, const BasicBlock *BB) { case scSMaxExpr: { const SCEVNAryExpr *NAry = cast(S); bool Proper = true; - for (SCEVNAryExpr::op_iterator I = NAry->op_begin(), E = NAry->op_end(); - I != E; ++I) { - BlockDisposition D = getBlockDisposition(*I, BB); + for (const SCEV *NAryOp : NAry->operands()) { + BlockDisposition D = getBlockDisposition(NAryOp, BB); if (D == DoesNotDominateBlock) return DoesNotDominateBlock; if (D == DominatesBlock) @@ -9406,24 +9393,22 @@ bool ScalarEvolution::properlyDominates(const SCEV *S, const BasicBlock *BB) { return getBlockDisposition(S, BB) == ProperlyDominatesBlock; } -namespace { -// Search for a SCEV expression node within an expression tree. -// Implements SCEVTraversal::Visitor. -struct SCEVSearch { - const SCEV *Node; - bool IsFound; +bool ScalarEvolution::hasOperand(const SCEV *S, const SCEV *Op) const { + // Search for a SCEV expression node within an expression tree. + // Implements SCEVTraversal::Visitor. 
+ struct SCEVSearch { + const SCEV *Node; + bool IsFound; - SCEVSearch(const SCEV *N): Node(N), IsFound(false) {} + SCEVSearch(const SCEV *N): Node(N), IsFound(false) {} - bool follow(const SCEV *S) { - IsFound |= (S == Node); - return !IsFound; - } - bool isDone() const { return IsFound; } -}; -} + bool follow(const SCEV *S) { + IsFound |= (S == Node); + return !IsFound; + } + bool isDone() const { return IsFound; } + }; -bool ScalarEvolution::hasOperand(const SCEV *S, const SCEV *Op) const { SCEVSearch Search(Op); visitAll(S, Search); return Search.IsFound; @@ -9722,3 +9707,46 @@ void SCEVUnionPredicate::add(const SCEVPredicate *N) { SCEVToPreds[Key].push_back(N); Preds.push_back(N); } + +PredicatedScalarEvolution::PredicatedScalarEvolution(ScalarEvolution &SE) + : SE(SE), Generation(0) {} + +const SCEV *PredicatedScalarEvolution::getSCEV(Value *V) { + const SCEV *Expr = SE.getSCEV(V); + RewriteEntry &Entry = RewriteMap[Expr]; + + // If we already have an entry and the version matches, return it. + if (Entry.second && Generation == Entry.first) + return Entry.second; + + // We found an entry but it's stale. Rewrite the stale entry + // acording to the current predicate. + if (Entry.second) + Expr = Entry.second; + + const SCEV *NewSCEV = SE.rewriteUsingPredicate(Expr, Preds); + Entry = {Generation, NewSCEV}; + + return NewSCEV; +} + +void PredicatedScalarEvolution::addPredicate(const SCEVPredicate &Pred) { + if (Preds.implies(&Pred)) + return; + Preds.add(&Pred); + updateGeneration(); +} + +const SCEVUnionPredicate &PredicatedScalarEvolution::getUnionPredicate() const { + return Preds; +} + +void PredicatedScalarEvolution::updateGeneration() { + // If the generation number wrapped recompute everything. + if (++Generation == 0) { + for (auto &II : RewriteMap) { + const SCEV *Rewritten = II.second.second; + II.second = {Generation, SE.rewriteUsingPredicate(Rewritten, Preds)}; + } + } +} diff --git a/lib/Analysis/ScalarEvolutionExpander.cpp b/lib/Analysis/ScalarEvolutionExpander.cpp index 8c5805e9d168..abfcfbafb32e 100644 --- a/lib/Analysis/ScalarEvolutionExpander.cpp +++ b/lib/Analysis/ScalarEvolutionExpander.cpp @@ -933,6 +933,9 @@ bool SCEVExpander::hoistIVInc(Instruction *IncV, Instruction *InsertPos) { !SE.DT.dominates(InsertPos->getParent(), IncV->getParent())) return false; + if (!SE.LI.movementPreservesLCSSAForm(IncV, InsertPos)) + return false; + // Check that the chain of IV operands leading back to Phi can be hoisted. 
SmallVector IVIncs; for(;;) { diff --git a/lib/AsmParser/LLLexer.cpp b/lib/AsmParser/LLLexer.cpp index 372c56588864..db90f78b3186 100644 --- a/lib/AsmParser/LLLexer.cpp +++ b/lib/AsmParser/LLLexer.cpp @@ -591,6 +591,7 @@ lltok::Kind LLLexer::LexIdentifier() { KEYWORD(ghccc); KEYWORD(hhvmcc); KEYWORD(hhvm_ccc); + KEYWORD(cxx_fast_tlscc); KEYWORD(cc); KEYWORD(c); @@ -777,6 +778,7 @@ lltok::Kind LLLexer::LexIdentifier() { DWKEYWORD(VIRTUALITY, DwarfVirtuality); DWKEYWORD(LANG, DwarfLang); DWKEYWORD(OP, DwarfOp); + DWKEYWORD(MACINFO, DwarfMacinfo); #undef DWKEYWORD if (Keyword.startswith("DIFlag")) { diff --git a/lib/AsmParser/LLParser.cpp b/lib/AsmParser/LLParser.cpp index 307ed397834c..b5cbee5085bd 100644 --- a/lib/AsmParser/LLParser.cpp +++ b/lib/AsmParser/LLParser.cpp @@ -1544,6 +1544,7 @@ bool LLParser::ParseOptionalDLLStorageClass(unsigned &Res) { /// ::= 'ghccc' /// ::= 'hhvmcc' /// ::= 'hhvm_ccc' +/// ::= 'cxx_fast_tlscc' /// ::= 'cc' UINT /// bool LLParser::ParseOptionalCallingConv(unsigned &CC) { @@ -1574,6 +1575,7 @@ bool LLParser::ParseOptionalCallingConv(unsigned &CC) { case lltok::kw_ghccc: CC = CallingConv::GHC; break; case lltok::kw_hhvmcc: CC = CallingConv::HHVM; break; case lltok::kw_hhvm_ccc: CC = CallingConv::HHVM_C; break; + case lltok::kw_cxx_fast_tlscc: CC = CallingConv::CXX_FAST_TLS; break; case lltok::kw_cc: { Lex.Lex(); return ParseUInt32(CC); @@ -3277,6 +3279,11 @@ struct DwarfTagField : public MDUnsignedField { DwarfTagField(dwarf::Tag DefaultTag) : MDUnsignedField(DefaultTag, dwarf::DW_TAG_hi_user) {} }; +struct DwarfMacinfoTypeField : public MDUnsignedField { + DwarfMacinfoTypeField() : MDUnsignedField(0, dwarf::DW_MACINFO_vendor_ext) {} + DwarfMacinfoTypeField(dwarf::MacinfoRecordType DefaultType) + : MDUnsignedField(DefaultType, dwarf::DW_MACINFO_vendor_ext) {} +}; struct DwarfAttEncodingField : public MDUnsignedField { DwarfAttEncodingField() : MDUnsignedField(0, dwarf::DW_ATE_hi_user) {} }; @@ -3368,6 +3375,26 @@ bool LLParser::ParseMDField(LocTy Loc, StringRef Name, DwarfTagField &Result) { return false; } +template <> +bool LLParser::ParseMDField(LocTy Loc, StringRef Name, + DwarfMacinfoTypeField &Result) { + if (Lex.getKind() == lltok::APSInt) + return ParseMDField(Loc, Name, static_cast(Result)); + + if (Lex.getKind() != lltok::DwarfMacinfo) + return TokError("expected DWARF macinfo type"); + + unsigned Macinfo = dwarf::getMacinfo(Lex.getStrVal()); + if (Macinfo == dwarf::DW_MACINFO_invalid) + return TokError( + "invalid DWARF macinfo type" + Twine(" '") + Lex.getStrVal() + "'"); + assert(Macinfo <= Result.Max && "Expected valid DWARF macinfo type"); + + Result.assign(Macinfo); + Lex.Lex(); + return false; +} + template <> bool LLParser::ParseMDField(LocTy Loc, StringRef Name, DwarfVirtualityField &Result) { @@ -3780,7 +3807,7 @@ bool LLParser::ParseDIFile(MDNode *&Result, bool IsDistinct) { /// isOptimized: true, flags: "-O2", runtimeVersion: 1, /// splitDebugFilename: "abc.debug", emissionKind: 1, /// enums: !1, retainedTypes: !2, subprograms: !3, -/// globals: !4, imports: !5, dwoId: 0x0abcd) +/// globals: !4, imports: !5, macros: !6, dwoId: 0x0abcd) bool LLParser::ParseDICompileUnit(MDNode *&Result, bool IsDistinct) { if (!IsDistinct) return Lex.Error("missing 'distinct', required for !DICompileUnit"); @@ -3799,6 +3826,7 @@ bool LLParser::ParseDICompileUnit(MDNode *&Result, bool IsDistinct) { OPTIONAL(subprograms, MDField, ); \ OPTIONAL(globals, MDField, ); \ OPTIONAL(imports, MDField, ); \ + OPTIONAL(macros, MDField, ); \ OPTIONAL(dwoId, 
MDUnsignedField, ); PARSE_MD_FIELDS(); #undef VISIT_MD_FIELDS @@ -3806,7 +3834,8 @@ bool LLParser::ParseDICompileUnit(MDNode *&Result, bool IsDistinct) { Result = DICompileUnit::getDistinct( Context, language.Val, file.Val, producer.Val, isOptimized.Val, flags.Val, runtimeVersion.Val, splitDebugFilename.Val, emissionKind.Val, enums.Val, - retainedTypes.Val, subprograms.Val, globals.Val, imports.Val, dwoId.Val); + retainedTypes.Val, subprograms.Val, globals.Val, imports.Val, macros.Val, + dwoId.Val); return false; } @@ -3902,6 +3931,39 @@ bool LLParser::ParseDINamespace(MDNode *&Result, bool IsDistinct) { return false; } +/// ParseDIMacro: +/// ::= !DIMacro(macinfo: type, line: 9, name: "SomeMacro", value: "SomeValue") +bool LLParser::ParseDIMacro(MDNode *&Result, bool IsDistinct) { +#define VISIT_MD_FIELDS(OPTIONAL, REQUIRED) \ + REQUIRED(type, DwarfMacinfoTypeField, ); \ + REQUIRED(line, LineField, ); \ + REQUIRED(name, MDStringField, ); \ + OPTIONAL(value, MDStringField, ); + PARSE_MD_FIELDS(); +#undef VISIT_MD_FIELDS + + Result = GET_OR_DISTINCT(DIMacro, + (Context, type.Val, line.Val, name.Val, value.Val)); + return false; +} + +/// ParseDIMacroFile: +/// ::= !DIMacroFile(line: 9, file: !2, nodes: !3) +bool LLParser::ParseDIMacroFile(MDNode *&Result, bool IsDistinct) { +#define VISIT_MD_FIELDS(OPTIONAL, REQUIRED) \ + OPTIONAL(type, DwarfMacinfoTypeField, (dwarf::DW_MACINFO_start_file)); \ + REQUIRED(line, LineField, ); \ + REQUIRED(file, MDField, ); \ + OPTIONAL(nodes, MDField, ); + PARSE_MD_FIELDS(); +#undef VISIT_MD_FIELDS + + Result = GET_OR_DISTINCT(DIMacroFile, + (Context, type.Val, line.Val, file.Val, nodes.Val)); + return false; +} + + /// ParseDIModule: /// ::= !DIModule(scope: !0, name: "SomeModule", configMacros: "-DNDEBUG", /// includePath: "/usr/include", isysroot: "/") diff --git a/lib/AsmParser/LLToken.h b/lib/AsmParser/LLToken.h index 6a9a1de0b850..10c840d257f7 100644 --- a/lib/AsmParser/LLToken.h +++ b/lib/AsmParser/LLToken.h @@ -99,6 +99,7 @@ namespace lltok { kw_preserve_mostcc, kw_preserve_allcc, kw_ghccc, kw_hhvmcc, kw_hhvm_ccc, + kw_cxx_fast_tlscc, // Attributes: kw_attributes, @@ -214,6 +215,7 @@ namespace lltok { DwarfLang, // DW_LANG_foo DwarfOp, // DW_OP_foo DIFlag, // DIFlagFoo + DwarfMacinfo, // DW_MACINFO_foo // Type valued tokens (TyVal). Type, diff --git a/lib/Bitcode/Reader/BitcodeReader.cpp b/lib/Bitcode/Reader/BitcodeReader.cpp index e95aba771b9c..4b5af3dd80fc 100644 --- a/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/lib/Bitcode/Reader/BitcodeReader.cpp @@ -2214,10 +2214,10 @@ std::error_code BitcodeReader::parseMetadata(bool ModuleLevel) { break; } case bitc::METADATA_COMPILE_UNIT: { - if (Record.size() < 14 || Record.size() > 15) + if (Record.size() < 14 || Record.size() > 16) return error("Invalid record"); - // Ignore Record[1], which indicates whether this compile unit is + // Ignore Record[0], which indicates whether this compile unit is // distinct. It's always distinct. MDValueList.assignValue( DICompileUnit::getDistinct( @@ -2226,7 +2226,9 @@ std::error_code BitcodeReader::parseMetadata(bool ModuleLevel) { Record[6], getMDString(Record[7]), Record[8], getMDOrNull(Record[9]), getMDOrNull(Record[10]), getMDOrNull(Record[11]), getMDOrNull(Record[12]), - getMDOrNull(Record[13]), Record.size() == 14 ? 0 : Record[14]), + getMDOrNull(Record[13]), + Record.size() <= 15 ? 0 : getMDOrNull(Record[15]), + Record.size() <= 14 ? 
0 : Record[14]), NextMDValueNo++); break; } @@ -2294,6 +2296,28 @@ std::error_code BitcodeReader::parseMetadata(bool ModuleLevel) { NextMDValueNo++); break; } + case bitc::METADATA_MACRO: { + if (Record.size() != 5) + return error("Invalid record"); + + MDValueList.assignValue( + GET_OR_DISTINCT(DIMacro, Record[0], + (Context, Record[1], Record[2], + getMDString(Record[3]), getMDString(Record[4]))), + NextMDValueNo++); + break; + } + case bitc::METADATA_MACRO_FILE: { + if (Record.size() != 5) + return error("Invalid record"); + + MDValueList.assignValue( + GET_OR_DISTINCT(DIMacroFile, Record[0], + (Context, Record[1], Record[2], + getMDOrNull(Record[3]), getMDOrNull(Record[4]))), + NextMDValueNo++); + break; + } case bitc::METADATA_TEMPLATE_TYPE: { if (Record.size() != 3) return error("Invalid record"); diff --git a/lib/Bitcode/Writer/BitcodeWriter.cpp b/lib/Bitcode/Writer/BitcodeWriter.cpp index b1b699765bd4..201b4bc34c2b 100644 --- a/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -1017,6 +1017,7 @@ static void WriteDICompileUnit(const DICompileUnit *N, Record.push_back(VE.getMetadataOrNullID(N->getGlobalVariables().get())); Record.push_back(VE.getMetadataOrNullID(N->getImportedEntities().get())); Record.push_back(N->getDWOId()); + Record.push_back(VE.getMetadataOrNullID(N->getMacros().get())); Stream.EmitRecord(bitc::METADATA_COMPILE_UNIT, Record, Abbrev); Record.clear(); @@ -1092,6 +1093,33 @@ static void WriteDINamespace(const DINamespace *N, const ValueEnumerator &VE, Record.clear(); } +static void WriteDIMacro(const DIMacro *N, const ValueEnumerator &VE, + BitstreamWriter &Stream, + SmallVectorImpl &Record, unsigned Abbrev) { + Record.push_back(N->isDistinct()); + Record.push_back(N->getMacinfoType()); + Record.push_back(N->getLine()); + Record.push_back(VE.getMetadataOrNullID(N->getRawName())); + Record.push_back(VE.getMetadataOrNullID(N->getRawValue())); + + Stream.EmitRecord(bitc::METADATA_MACRO, Record, Abbrev); + Record.clear(); +} + +static void WriteDIMacroFile(const DIMacroFile *N, const ValueEnumerator &VE, + BitstreamWriter &Stream, + SmallVectorImpl &Record, + unsigned Abbrev) { + Record.push_back(N->isDistinct()); + Record.push_back(N->getMacinfoType()); + Record.push_back(N->getLine()); + Record.push_back(VE.getMetadataOrNullID(N->getFile())); + Record.push_back(VE.getMetadataOrNullID(N->getElements().get())); + + Stream.EmitRecord(bitc::METADATA_MACRO_FILE, Record, Abbrev); + Record.clear(); +} + static void WriteDIModule(const DIModule *N, const ValueEnumerator &VE, BitstreamWriter &Stream, SmallVectorImpl &Record, unsigned Abbrev) { diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index 86b4f057677f..e5d53149c50e 100644 --- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -1187,6 +1187,11 @@ bool AsmPrinter::doFinalization(Module &M) { else assert(Alias.hasLocalLinkage() && "Invalid alias linkage"); + // Set the symbol type to function if the alias has a function type. + // This affects codegen when the aliasee is not a function. 
+ if (Alias.getType()->getPointerElementType()->isFunctionTy()) + OutStreamer->EmitSymbolAttribute(Name, MCSA_ELF_TypeFunction); + EmitVisibility(Name, Alias.getVisibility()); // Emit the directives as assignments aka .set: @@ -1868,6 +1873,8 @@ static void emitGlobalConstantImpl(const DataLayout &DL, const Constant *C, const Constant *BaseCV = nullptr, uint64_t Offset = 0); +static void emitGlobalConstantFP(const ConstantFP *CFP, AsmPrinter &AP); + /// isRepeatedByteSequence - Determine whether the given value is /// composed of a repeated sequence of identical bytes and return the /// byte value. If it is not a repeated sequence, return -1. @@ -1945,34 +1952,9 @@ static void emitGlobalConstantDataSequential(const DataLayout &DL, AP.OutStreamer->EmitIntValue(CDS->getElementAsInteger(i), ElementByteSize); } - } else if (ElementByteSize == 4) { - // FP Constants are printed as integer constants to avoid losing - // precision. - assert(CDS->getElementType()->isFloatTy()); - for (unsigned i = 0, e = CDS->getNumElements(); i != e; ++i) { - union { - float F; - uint32_t I; - }; - - F = CDS->getElementAsFloat(i); - if (AP.isVerbose()) - AP.OutStreamer->GetCommentOS() << "float " << F << '\n'; - AP.OutStreamer->EmitIntValue(I, 4); - } } else { - assert(CDS->getElementType()->isDoubleTy()); - for (unsigned i = 0, e = CDS->getNumElements(); i != e; ++i) { - union { - double F; - uint64_t I; - }; - - F = CDS->getElementAsDouble(i); - if (AP.isVerbose()) - AP.OutStreamer->GetCommentOS() << "double " << F << '\n'; - AP.OutStreamer->EmitIntValue(I, 8); - } + for (unsigned I = 0, E = CDS->getNumElements(); I != E; ++I) + emitGlobalConstantFP(cast(CDS->getElementAsConstant(I)), AP); } unsigned Size = DL.getTypeAllocSize(CDS->getType()); diff --git a/lib/CodeGen/BranchFolding.cpp b/lib/CodeGen/BranchFolding.cpp index 54d92ad67a97..e41926a819c2 100644 --- a/lib/CodeGen/BranchFolding.cpp +++ b/lib/CodeGen/BranchFolding.cpp @@ -371,7 +371,7 @@ static unsigned ComputeCommonTailLength(MachineBasicBlock *MBB1, } // Back past possible debugging pseudos at beginning of block. This matters // when one block differs from the other only by whether debugging pseudos - // are present at the beginning. (This way, the various checks later for + // are present at the beginning. (This way, the various checks later for // I1==MBB1->begin() work as expected.) if (I1 == MBB1->begin() && I2 != MBB2->begin()) { --I2; @@ -1564,6 +1564,14 @@ bool BranchFolder::OptimizeBlock(MachineBasicBlock *MBB) { // removed, move this block to the end of the function. MachineBasicBlock *PrevTBB = nullptr, *PrevFBB = nullptr; SmallVector PrevCond; + // We're looking for cases where PrevBB could possibly fall through to + // FallThrough, but if FallThrough is an EH pad that wouldn't be useful + // so here we skip over any EH pads so we might have a chance to find + // a branch target from PrevBB. + while (FallThrough != MF.end() && FallThrough->isEHPad()) + ++FallThrough; + // Now check to see if the current block is sitting between PrevBB and + // a block to which it could fall through. 
if (FallThrough != MF.end() && !TII->AnalyzeBranch(PrevBB, PrevTBB, PrevFBB, PrevCond, true) && PrevBB.isSuccessor(&*FallThrough)) { diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp index d619044e86ca..30af1c54fac9 100644 --- a/lib/CodeGen/CodeGenPrepare.cpp +++ b/lib/CodeGen/CodeGenPrepare.cpp @@ -5251,6 +5251,10 @@ bool CodeGenPrepare::placeDbgValues(Function &F) { Instruction *VI = dyn_cast_or_null(DVI->getValue()); if (VI && VI != PrevNonDbgInst && !VI->isTerminator()) { + // If VI is a phi in a block with an EHPad terminator, we can't insert + // after it. + if (isa(VI) && VI->getParent()->getTerminator()->isEHPad()) + continue; DEBUG(dbgs() << "Moving Debug Value before :\n" << *DVI << ' ' << *VI); DVI->removeFromParent(); if (isa(VI)) diff --git a/lib/CodeGen/IfConversion.cpp b/lib/CodeGen/IfConversion.cpp index e90cb02bd280..71bd61a15cb7 100644 --- a/lib/CodeGen/IfConversion.cpp +++ b/lib/CodeGen/IfConversion.cpp @@ -1679,15 +1679,6 @@ void IfConverter::MergeBlocks(BBInfo &ToBBI, BBInfo &FromBBI, bool AddEdges) { BranchProbability::getZero()); } - if (AddEdges && ToBBI.BB->isSuccessor(FromBBI.BB)) { - // Set the edge probability from ToBBI.BB to FromBBI.BB to zero to avoid the - // edge probability being merged to other edges when this edge is removed - // later. - ToBBI.BB->setSuccProbability( - std::find(ToBBI.BB->succ_begin(), ToBBI.BB->succ_end(), FromBBI.BB), - BranchProbability::getZero()); - } - for (unsigned i = 0, e = FromSuccs.size(); i != e; ++i) { MachineBasicBlock *Succ = FromSuccs[i]; // Fallthrough edge can't be transferred. diff --git a/lib/CodeGen/MIRParser/MIParser.cpp b/lib/CodeGen/MIRParser/MIParser.cpp index c9c2d62cec30..f2f6584fb6c8 100644 --- a/lib/CodeGen/MIRParser/MIParser.cpp +++ b/lib/CodeGen/MIRParser/MIParser.cpp @@ -745,11 +745,11 @@ bool MIParser::verifyImplicitOperands(ArrayRef Operands, // Gather all the expected implicit operands. 
SmallVector ImplicitOperands; if (MCID.ImplicitDefs) - for (const uint16_t *ImpDefs = MCID.getImplicitDefs(); *ImpDefs; ++ImpDefs) + for (const MCPhysReg *ImpDefs = MCID.getImplicitDefs(); *ImpDefs; ++ImpDefs) ImplicitOperands.push_back( MachineOperand::CreateReg(*ImpDefs, true, true)); if (MCID.ImplicitUses) - for (const uint16_t *ImpUses = MCID.getImplicitUses(); *ImpUses; ++ImpUses) + for (const MCPhysReg *ImpUses = MCID.getImplicitUses(); *ImpUses; ++ImpUses) ImplicitOperands.push_back( MachineOperand::CreateReg(*ImpUses, false, true)); diff --git a/lib/CodeGen/MachineInstr.cpp b/lib/CodeGen/MachineInstr.cpp index e202810bf6e5..1eb2edcd7cec 100644 --- a/lib/CodeGen/MachineInstr.cpp +++ b/lib/CodeGen/MachineInstr.cpp @@ -631,10 +631,12 @@ void MachineMemOperand::print(raw_ostream &OS, ModuleSlotTracker &MST) const { void MachineInstr::addImplicitDefUseOperands(MachineFunction &MF) { if (MCID->ImplicitDefs) - for (const uint16_t *ImpDefs = MCID->getImplicitDefs(); *ImpDefs; ++ImpDefs) + for (const MCPhysReg *ImpDefs = MCID->getImplicitDefs(); *ImpDefs; + ++ImpDefs) addOperand(MF, MachineOperand::CreateReg(*ImpDefs, true, true)); if (MCID->ImplicitUses) - for (const uint16_t *ImpUses = MCID->getImplicitUses(); *ImpUses; ++ImpUses) + for (const MCPhysReg *ImpUses = MCID->getImplicitUses(); *ImpUses; + ++ImpUses) addOperand(MF, MachineOperand::CreateReg(*ImpUses, false, true)); } diff --git a/lib/CodeGen/MachineLICM.cpp b/lib/CodeGen/MachineLICM.cpp index fa59b0c6aca2..1a8e92332bc1 100644 --- a/lib/CodeGen/MachineLICM.cpp +++ b/lib/CodeGen/MachineLICM.cpp @@ -153,7 +153,7 @@ namespace { } private: - /// CandidateInfo - Keep track of information about hoisting candidates. + /// Keep track of information about hoisting candidates. struct CandidateInfo { MachineInstr *MI; unsigned Def; @@ -162,149 +162,76 @@ namespace { : MI(mi), Def(def), FI(fi) {} }; - /// HoistRegionPostRA - Walk the specified region of the CFG and hoist loop - /// invariants out to the preheader. void HoistRegionPostRA(); - /// HoistPostRA - When an instruction is found to only use loop invariant - /// operands that is safe to hoist, this instruction is called to do the - /// dirty work. void HoistPostRA(MachineInstr *MI, unsigned Def); - /// ProcessMI - Examine the instruction for potentai LICM candidate. Also - /// gather register def and frame object update information. - void ProcessMI(MachineInstr *MI, - BitVector &PhysRegDefs, - BitVector &PhysRegClobbers, - SmallSet &StoredFIs, + void ProcessMI(MachineInstr *MI, BitVector &PhysRegDefs, + BitVector &PhysRegClobbers, SmallSet &StoredFIs, SmallVectorImpl &Candidates); - /// AddToLiveIns - Add register 'Reg' to the livein sets of BBs in the - /// current loop. void AddToLiveIns(unsigned Reg); - /// IsLICMCandidate - Returns true if the instruction may be a suitable - /// candidate for LICM. e.g. If the instruction is a call, then it's - /// obviously not safe to hoist it. bool IsLICMCandidate(MachineInstr &I); - /// IsLoopInvariantInst - Returns true if the instruction is loop - /// invariant. I.e., all virtual register operands are defined outside of - /// the loop, physical registers aren't accessed (explicitly or implicitly), - /// and the instruction is hoistable. - /// bool IsLoopInvariantInst(MachineInstr &I); - /// HasLoopPHIUse - Return true if the specified instruction is used by any - /// phi node in the current loop. 
bool HasLoopPHIUse(const MachineInstr *MI) const; - /// HasHighOperandLatency - Compute operand latency between a def of 'Reg' - /// and an use in the current loop, return true if the target considered - /// it 'high'. bool HasHighOperandLatency(MachineInstr &MI, unsigned DefIdx, unsigned Reg) const; bool IsCheapInstruction(MachineInstr &MI) const; - /// CanCauseHighRegPressure - Visit BBs from header to current BB, - /// check if hoisting an instruction of the given cost matrix can cause high - /// register pressure. bool CanCauseHighRegPressure(const DenseMap &Cost, bool Cheap); - /// UpdateBackTraceRegPressure - Traverse the back trace from header to - /// the current block and update their register pressures to reflect the - /// effect of hoisting MI from the current block to the preheader. void UpdateBackTraceRegPressure(const MachineInstr *MI); - /// IsProfitableToHoist - Return true if it is potentially profitable to - /// hoist the given loop invariant. bool IsProfitableToHoist(MachineInstr &MI); - /// IsGuaranteedToExecute - Check if this mbb is guaranteed to execute. - /// If not then a load from this mbb may not be safe to hoist. bool IsGuaranteedToExecute(MachineBasicBlock *BB); void EnterScope(MachineBasicBlock *MBB); void ExitScope(MachineBasicBlock *MBB); - /// ExitScopeIfDone - Destroy scope for the MBB that corresponds to given - /// dominator tree node if its a leaf or all of its children are done. Walk - /// up the dominator tree to destroy ancestors which are now done. - void ExitScopeIfDone(MachineDomTreeNode *Node, - DenseMap &OpenChildren, - DenseMap &ParentMap); - - /// HoistOutOfLoop - Walk the specified loop in the CFG (defined by all - /// blocks dominated by the specified header block, and that are in the - /// current loop) in depth first order w.r.t the DominatorTree. This allows - /// us to visit definitions before uses, allowing us to hoist a loop body in - /// one pass without iteration. - /// + void ExitScopeIfDone( + MachineDomTreeNode *Node, + DenseMap &OpenChildren, + DenseMap &ParentMap); + void HoistOutOfLoop(MachineDomTreeNode *LoopHeaderNode); + void HoistRegion(MachineDomTreeNode *N, bool IsHeader); - /// SinkIntoLoop - Sink instructions into loops if profitable. This - /// especially tries to prevent register spills caused by register pressure - /// if there is little to no overhead moving instructions into loops. void SinkIntoLoop(); - /// InitRegPressure - Find all virtual register references that are liveout - /// of the preheader to initialize the starting "register pressure". Note - /// this does not count live through (livein but not used) registers. void InitRegPressure(MachineBasicBlock *BB); - /// calcRegisterCost - Calculate the additional register pressure that the - /// registers used in MI cause. - /// - /// If 'ConsiderSeen' is true, updates 'RegSeen' and uses the information to - /// figure out which usages are live-ins. - /// FIXME: Figure out a way to consider 'RegSeen' from all code paths. DenseMap calcRegisterCost(const MachineInstr *MI, bool ConsiderSeen, bool ConsiderUnseenAsDef); - /// UpdateRegPressure - Update estimate of register pressure after the - /// specified instruction. void UpdateRegPressure(const MachineInstr *MI, bool ConsiderUnseenAsDef = false); - /// ExtractHoistableLoad - Unfold a load from the given machineinstr if - /// the load itself could be hoisted. Return the unfolded and hoistable - /// load, or null if the load couldn't be unfolded or if it wouldn't - /// be hoistable. 
MachineInstr *ExtractHoistableLoad(MachineInstr *MI); - /// LookForDuplicate - Find an instruction amount PrevMIs that is a - /// duplicate of MI. Return this instruction if it's found. - const MachineInstr *LookForDuplicate(const MachineInstr *MI, - std::vector &PrevMIs); + const MachineInstr * + LookForDuplicate(const MachineInstr *MI, + std::vector &PrevMIs); - /// EliminateCSE - Given a LICM'ed instruction, look for an instruction on - /// the preheader that compute the same value. If it's found, do a RAU on - /// with the definition of the existing instruction rather than hoisting - /// the instruction to the preheader. - bool EliminateCSE(MachineInstr *MI, - DenseMap >::iterator &CI); + bool EliminateCSE( + MachineInstr *MI, + DenseMap>::iterator &CI); - /// MayCSE - Return true if the given instruction will be CSE'd if it's - /// hoisted out of the loop. bool MayCSE(MachineInstr *MI); - /// Hoist - When an instruction is found to only use loop invariant operands - /// that is safe to hoist, this instruction is called to do the dirty work. - /// It returns true if the instruction is hoisted. bool Hoist(MachineInstr *MI, MachineBasicBlock *Preheader); - /// InitCSEMap - Initialize the CSE map with instructions that are in the - /// current loop preheader that may become duplicates of instructions that - /// are hoisted out of the loop. void InitCSEMap(MachineBasicBlock *BB); - /// getCurPreheader - Get the preheader for the current loop, splitting - /// a critical edge if needed. MachineBasicBlock *getCurPreheader(); }; } // end anonymous namespace @@ -319,8 +246,7 @@ INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_END(MachineLICM, "machinelicm", "Machine Loop Invariant Code Motion", false, false) -/// LoopIsOuterMostWithPredecessor - Test if the given loop is the outer-most -/// loop that has a unique predecessor. +/// Test if the given loop is the outer-most loop that has a unique predecessor. static bool LoopIsOuterMostWithPredecessor(MachineLoop *CurLoop) { // Check whether this loop even has a unique predecessor. if (!CurLoop->getLoopPredecessor()) @@ -402,8 +328,7 @@ bool MachineLICM::runOnMachineFunction(MachineFunction &MF) { return Changed; } -/// InstructionStoresToFI - Return true if instruction stores to the -/// specified frame. +/// Return true if instruction stores to the specified frame. static bool InstructionStoresToFI(const MachineInstr *MI, int FI) { for (MachineInstr::mmo_iterator o = MI->memoperands_begin(), oe = MI->memoperands_end(); o != oe; ++o) { @@ -418,7 +343,7 @@ static bool InstructionStoresToFI(const MachineInstr *MI, int FI) { return false; } -/// ProcessMI - Examine the instruction for potentai LICM candidate. Also +/// Examine the instruction for potentai LICM candidate. Also /// gather register def and frame object update information. void MachineLICM::ProcessMI(MachineInstr *MI, BitVector &PhysRegDefs, @@ -506,8 +431,8 @@ void MachineLICM::ProcessMI(MachineInstr *MI, } } -/// HoistRegionPostRA - Walk the specified region of the CFG and hoist loop -/// invariants out to the preheader. +/// Walk the specified region of the CFG and hoist loop invariants out to the +/// preheader. void MachineLICM::HoistRegionPostRA() { MachineBasicBlock *Preheader = getCurPreheader(); if (!Preheader) @@ -599,8 +524,8 @@ void MachineLICM::HoistRegionPostRA() { } } -/// AddToLiveIns - Add register 'Reg' to the livein sets of BBs in the current -/// loop, and make sure it is not killed by any instructions in the loop. 
+/// Add register 'Reg' to the livein sets of BBs in the current loop, and make +/// sure it is not killed by any instructions in the loop. void MachineLICM::AddToLiveIns(unsigned Reg) { const std::vector &Blocks = CurLoop->getBlocks(); for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { @@ -620,9 +545,8 @@ void MachineLICM::AddToLiveIns(unsigned Reg) { } } -/// HoistPostRA - When an instruction is found to only use loop invariant -/// operands that is safe to hoist, this instruction is called to do the -/// dirty work. +/// When an instruction is found to only use loop invariant operands that is +/// safe to hoist, this instruction is called to do the dirty work. void MachineLICM::HoistPostRA(MachineInstr *MI, unsigned Def) { MachineBasicBlock *Preheader = getCurPreheader(); @@ -644,8 +568,8 @@ void MachineLICM::HoistPostRA(MachineInstr *MI, unsigned Def) { Changed = true; } -// IsGuaranteedToExecute - Check if this mbb is guaranteed to execute. -// If not then a load from this mbb may not be safe to hoist. +/// Check if this mbb is guaranteed to execute. If not then a load from this mbb +/// may not be safe to hoist. bool MachineLICM::IsGuaranteedToExecute(MachineBasicBlock *BB) { if (SpeculationState != SpeculateUnknown) return SpeculationState == SpeculateFalse; @@ -677,9 +601,9 @@ void MachineLICM::ExitScope(MachineBasicBlock *MBB) { BackTrace.pop_back(); } -/// ExitScopeIfDone - Destroy scope for the MBB that corresponds to the given -/// dominator tree node if its a leaf or all of its children are done. Walk -/// up the dominator tree to destroy ancestors which are now done. +/// Destroy scope for the MBB that corresponds to the given dominator tree node +/// if its a leaf or all of its children are done. Walk up the dominator tree to +/// destroy ancestors which are now done. void MachineLICM::ExitScopeIfDone(MachineDomTreeNode *Node, DenseMap &OpenChildren, DenseMap &ParentMap) { @@ -699,11 +623,10 @@ void MachineLICM::ExitScopeIfDone(MachineDomTreeNode *Node, } } -/// HoistOutOfLoop - Walk the specified loop in the CFG (defined by all -/// blocks dominated by the specified header block, and that are in the -/// current loop) in depth first order w.r.t the DominatorTree. This allows -/// us to visit definitions before uses, allowing us to hoist a loop body in -/// one pass without iteration. +/// Walk the specified loop in the CFG (defined by all blocks dominated by the +/// specified header block, and that are in the current loop) in depth first +/// order w.r.t the DominatorTree. This allows us to visit definitions before +/// uses, allowing us to hoist a loop body in one pass without iteration. /// void MachineLICM::HoistOutOfLoop(MachineDomTreeNode *HeaderN) { MachineBasicBlock *Preheader = getCurPreheader(); @@ -784,6 +707,9 @@ void MachineLICM::HoistOutOfLoop(MachineDomTreeNode *HeaderN) { } } +/// Sink instructions into loops if profitable. This especially tries to prevent +/// register spills caused by register pressure if there is little to no +/// overhead moving instructions into loops. void MachineLICM::SinkIntoLoop() { MachineBasicBlock *Preheader = getCurPreheader(); if (!Preheader) @@ -835,9 +761,9 @@ static bool isOperandKill(const MachineOperand &MO, MachineRegisterInfo *MRI) { return MO.isKill() || MRI->hasOneNonDBGUse(MO.getReg()); } -/// InitRegPressure - Find all virtual register references that are liveout of -/// the preheader to initialize the starting "register pressure". Note this -/// does not count live through (livein but not used) registers. 
+/// Find all virtual register references that are liveout of the preheader to +/// initialize the starting "register pressure". Note this does not count live +/// through (livein but not used) registers. void MachineLICM::InitRegPressure(MachineBasicBlock *BB) { std::fill(RegPressure.begin(), RegPressure.end(), 0); @@ -856,8 +782,7 @@ void MachineLICM::InitRegPressure(MachineBasicBlock *BB) { UpdateRegPressure(&MI, /*ConsiderUnseenAsDef=*/true); } -/// UpdateRegPressure - Update estimate of register pressure after the -/// specified instruction. +/// Update estimate of register pressure after the specified instruction. void MachineLICM::UpdateRegPressure(const MachineInstr *MI, bool ConsiderUnseenAsDef) { auto Cost = calcRegisterCost(MI, /*ConsiderSeen=*/true, ConsiderUnseenAsDef); @@ -870,6 +795,12 @@ void MachineLICM::UpdateRegPressure(const MachineInstr *MI, } } +/// Calculate the additional register pressure that the registers used in MI +/// cause. +/// +/// If 'ConsiderSeen' is true, updates 'RegSeen' and uses the information to +/// figure out which usages are live-ins. +/// FIXME: Figure out a way to consider 'RegSeen' from all code paths. DenseMap MachineLICM::calcRegisterCost(const MachineInstr *MI, bool ConsiderSeen, bool ConsiderUnseenAsDef) { @@ -913,8 +844,8 @@ MachineLICM::calcRegisterCost(const MachineInstr *MI, bool ConsiderSeen, return Cost; } -/// isLoadFromGOTOrConstantPool - Return true if this machine instruction -/// loads from global offset table or constant pool. +/// Return true if this machine instruction loads from global offset table or +/// constant pool. static bool isLoadFromGOTOrConstantPool(MachineInstr &MI) { assert (MI.mayLoad() && "Expected MI that loads!"); for (MachineInstr::mmo_iterator I = MI.memoperands_begin(), @@ -927,9 +858,8 @@ static bool isLoadFromGOTOrConstantPool(MachineInstr &MI) { return false; } -/// IsLICMCandidate - Returns true if the instruction may be a suitable -/// candidate for LICM. e.g. If the instruction is a call, then it's obviously -/// not safe to hoist it. +/// Returns true if the instruction may be a suitable candidate for LICM. +/// e.g. If the instruction is a call, then it's obviously not safe to hoist it. bool MachineLICM::IsLICMCandidate(MachineInstr &I) { // Check if it's safe to move the instruction. bool DontMoveAcrossStore = true; @@ -949,9 +879,9 @@ bool MachineLICM::IsLICMCandidate(MachineInstr &I) { return true; } -/// IsLoopInvariantInst - Returns true if the instruction is loop -/// invariant. I.e., all virtual register operands are defined outside of the -/// loop, physical registers aren't accessed explicitly, and there are no side +/// Returns true if the instruction is loop invariant. +/// I.e., all virtual register operands are defined outside of the loop, +/// physical registers aren't accessed explicitly, and there are no side /// effects that aren't captured by the operands or other flags. /// bool MachineLICM::IsLoopInvariantInst(MachineInstr &I) { @@ -1005,8 +935,8 @@ bool MachineLICM::IsLoopInvariantInst(MachineInstr &I) { } -/// HasLoopPHIUse - Return true if the specified instruction is used by a -/// phi node and hoisting it could cause a copy to be inserted. +/// Return true if the specified instruction is used by a phi node and hoisting +/// it could cause a copy to be inserted. 
bool MachineLICM::HasLoopPHIUse(const MachineInstr *MI) const { SmallVector Work(1, MI); do { @@ -1040,9 +970,8 @@ bool MachineLICM::HasLoopPHIUse(const MachineInstr *MI) const { return false; } -/// HasHighOperandLatency - Compute operand latency between a def of 'Reg' -/// and an use in the current loop, return true if the target considered -/// it 'high'. +/// Compute operand latency between a def of 'Reg' and an use in the current +/// loop, return true if the target considered it high. bool MachineLICM::HasHighOperandLatency(MachineInstr &MI, unsigned DefIdx, unsigned Reg) const { if (MRI->use_nodbg_empty(Reg)) @@ -1072,8 +1001,8 @@ bool MachineLICM::HasHighOperandLatency(MachineInstr &MI, return false; } -/// IsCheapInstruction - Return true if the instruction is marked "cheap" or -/// the operand latency between its def and a use is one or less. +/// Return true if the instruction is marked "cheap" or the operand latency +/// between its def and a use is one or less. bool MachineLICM::IsCheapInstruction(MachineInstr &MI) const { if (TII->isAsCheapAsAMove(&MI) || MI.isCopyLike()) return true; @@ -1097,9 +1026,8 @@ bool MachineLICM::IsCheapInstruction(MachineInstr &MI) const { return isCheap; } -/// CanCauseHighRegPressure - Visit BBs from header to current BB, check -/// if hoisting an instruction of the given cost matrix can cause high -/// register pressure. +/// Visit BBs from header to current BB, check if hoisting an instruction of the +/// given cost matrix can cause high register pressure. bool MachineLICM::CanCauseHighRegPressure(const DenseMap& Cost, bool CheapInstr) { for (const auto &RPIdAndCost : Cost) { @@ -1122,9 +1050,9 @@ bool MachineLICM::CanCauseHighRegPressure(const DenseMap& Cost, return false; } -/// UpdateBackTraceRegPressure - Traverse the back trace from header to the -/// current block and update their register pressures to reflect the effect -/// of hoisting MI from the current block to the preheader. +/// Traverse the back trace from header to the current block and update their +/// register pressures to reflect the effect of hoisting MI from the current +/// block to the preheader. void MachineLICM::UpdateBackTraceRegPressure(const MachineInstr *MI) { // First compute the 'cost' of the instruction, i.e. its contribution // to register pressure. @@ -1137,8 +1065,8 @@ void MachineLICM::UpdateBackTraceRegPressure(const MachineInstr *MI) { RP[RPIdAndCost.first] += RPIdAndCost.second; } -/// IsProfitableToHoist - Return true if it is potentially profitable to hoist -/// the given loop invariant. +/// Return true if it is potentially profitable to hoist the given loop +/// invariant. bool MachineLICM::IsProfitableToHoist(MachineInstr &MI) { if (MI.isImplicitDef()) return true; @@ -1228,6 +1156,9 @@ bool MachineLICM::IsProfitableToHoist(MachineInstr &MI) { return true; } +/// Unfold a load from the given machineinstr if the load itself could be +/// hoisted. Return the unfolded and hoistable load, or null if the load +/// couldn't be unfolded or if it wouldn't be hoistable. MachineInstr *MachineLICM::ExtractHoistableLoad(MachineInstr *MI) { // Don't unfold simple loads. if (MI->canFoldAsLoad()) @@ -1285,6 +1216,9 @@ MachineInstr *MachineLICM::ExtractHoistableLoad(MachineInstr *MI) { return NewMIs[0]; } +/// Initialize the CSE map with instructions that are in the current loop +/// preheader that may become duplicates of instructions that are hoisted +/// out of the loop. 
void MachineLICM::InitCSEMap(MachineBasicBlock *BB) { for (MachineBasicBlock::iterator I = BB->begin(),E = BB->end(); I != E; ++I) { const MachineInstr *MI = &*I; @@ -1293,6 +1227,8 @@ void MachineLICM::InitCSEMap(MachineBasicBlock *BB) { } } +/// Find an instruction amount PrevMIs that is a duplicate of MI. +/// Return this instruction if it's found. const MachineInstr* MachineLICM::LookForDuplicate(const MachineInstr *MI, std::vector &PrevMIs) { @@ -1304,6 +1240,10 @@ MachineLICM::LookForDuplicate(const MachineInstr *MI, return nullptr; } +/// Given a LICM'ed instruction, look for an instruction on the preheader that +/// computes the same value. If it's found, do a RAU on with the definition of +/// the existing instruction rather than hoisting the instruction to the +/// preheader. bool MachineLICM::EliminateCSE(MachineInstr *MI, DenseMap >::iterator &CI) { // Do not CSE implicit_def so ProcessImplicitDefs can properly propagate @@ -1361,8 +1301,8 @@ bool MachineLICM::EliminateCSE(MachineInstr *MI, return false; } -/// MayCSE - Return true if the given instruction will be CSE'd if it's -/// hoisted out of the loop. +/// Return true if the given instruction will be CSE'd if it's hoisted out of +/// the loop. bool MachineLICM::MayCSE(MachineInstr *MI) { unsigned Opcode = MI->getOpcode(); DenseMap >::iterator @@ -1375,9 +1315,9 @@ bool MachineLICM::MayCSE(MachineInstr *MI) { return LookForDuplicate(MI, CI->second) != nullptr; } -/// Hoist - When an instruction is found to use only loop invariant operands +/// When an instruction is found to use only loop invariant operands /// that are safe to hoist, this instruction is called to do the dirty work. -/// +/// It returns true if the instruction is hoisted. bool MachineLICM::Hoist(MachineInstr *MI, MachineBasicBlock *Preheader) { // First check whether we should hoist this instruction. if (!IsLoopInvariantInst(*MI) || !IsProfitableToHoist(*MI)) { @@ -1439,6 +1379,7 @@ bool MachineLICM::Hoist(MachineInstr *MI, MachineBasicBlock *Preheader) { return true; } +/// Get the preheader for the current loop, splitting a critical edge if needed. MachineBasicBlock *MachineLICM::getCurPreheader() { // Determine the block to which to hoist instructions. If we can't find a // suitable loop predecessor, we can't do any hoisting. diff --git a/lib/CodeGen/MachineTraceMetrics.cpp b/lib/CodeGen/MachineTraceMetrics.cpp index d9a6b68462eb..f7edacd5ebaf 100644 --- a/lib/CodeGen/MachineTraceMetrics.cpp +++ b/lib/CodeGen/MachineTraceMetrics.cpp @@ -724,13 +724,12 @@ static void updatePhysDepsDownwards(const MachineInstr *UseMI, // Update RegUnits to reflect live registers after UseMI. // First kills. - for (unsigned i = 0, e = Kills.size(); i != e; ++i) - for (MCRegUnitIterator Units(Kills[i], TRI); Units.isValid(); ++Units) + for (unsigned Kill : Kills) + for (MCRegUnitIterator Units(Kill, TRI); Units.isValid(); ++Units) RegUnits.erase(*Units); // Second, live defs. 
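The InitCSEMap/LookForDuplicate comments above describe keeping preheader instructions keyed by opcode so a hoisted instruction can reuse an identical one instead of being duplicated. A toy model under that assumption (a simplified Inst type, not the MachineInstr API):

#include <cstdio>
#include <map>
#include <vector>

// Toy instruction: an opcode plus one operand is enough to model "identical".
struct Inst { int Opcode; int Operand; int DefReg; };

static bool isIdenticalTo(const Inst &A, const Inst &B) {
  return A.Opcode == B.Opcode && A.Operand == B.Operand;
}

// Opcode -> instructions already sitting in the loop preheader, in the spirit
// of the map that InitCSEMap populates.
static std::map<int, std::vector<Inst>> CSEMap;

// LookForDuplicate analogue: return a preheader instruction that computes the
// same value as MI, or nullptr if there is none.
static const Inst *lookForDuplicate(const Inst &MI) {
  auto It = CSEMap.find(MI.Opcode);
  if (It == CSEMap.end())
    return nullptr;
  for (const Inst &Prev : It->second)
    if (isIdenticalTo(Prev, MI))
      return &Prev;
  return nullptr;
}

int main() {
  CSEMap[7].push_back({7, 42, /*DefReg=*/1}); // already in the preheader
  Inst Hoisted = {7, 42, /*DefReg=*/2};
  if (const Inst *Dup = lookForDuplicate(Hoisted))
    std::printf("reuse vreg%d instead of hoisting a duplicate\n", Dup->DefReg);
}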
- for (unsigned i = 0, e = LiveDefOps.size(); i != e; ++i) { - unsigned DefOp = LiveDefOps[i]; + for (unsigned DefOp : LiveDefOps) { for (MCRegUnitIterator Units(UseMI->getOperand(DefOp).getReg(), TRI); Units.isValid(); ++Units) { LiveRegUnit &LRU = RegUnits[*Units]; @@ -756,8 +755,7 @@ computeCrossBlockCriticalPath(const TraceBlockInfo &TBI) { assert(TBI.HasValidInstrDepths && "Missing depth info"); assert(TBI.HasValidInstrHeights && "Missing height info"); unsigned MaxLen = 0; - for (unsigned i = 0, e = TBI.LiveIns.size(); i != e; ++i) { - const LiveInReg &LIR = TBI.LiveIns[i]; + for (const LiveInReg &LIR : TBI.LiveIns) { if (!TargetRegisterInfo::isVirtualRegister(LIR.Reg)) continue; const MachineInstr *DefMI = MTM.MRI->getVRegDef(LIR.Reg); diff --git a/lib/CodeGen/Passes.cpp b/lib/CodeGen/Passes.cpp index 9d473fdb6cd7..4d7adcbbfe77 100644 --- a/lib/CodeGen/Passes.cpp +++ b/lib/CodeGen/Passes.cpp @@ -96,10 +96,10 @@ PrintMachineInstrs("print-machineinstrs", cl::ValueOptional, // Temporary option to allow experimenting with MachineScheduler as a post-RA // scheduler. Targets can "properly" enable this with -// substitutePass(&PostRASchedulerID, &PostMachineSchedulerID); Ideally it -// wouldn't be part of the standard pass pipeline, and the target would just add -// a PostRA scheduling pass wherever it wants. -static cl::opt MISchedPostRA("misched-postra", cl::Hidden, +// substitutePass(&PostRASchedulerID, &PostMachineSchedulerID). +// Targets can return true in targetSchedulesPostRAScheduling() and +// insert a PostRA scheduling pass wherever it wants. +cl::opt MISchedPostRA("misched-postra", cl::Hidden, cl::desc("Run MachineScheduler post regalloc (independent of preRA sched)")); // Experimental option to run live interval analysis early. @@ -575,7 +575,10 @@ void TargetPassConfig::addMachinePasses() { addPass(&ImplicitNullChecksID); // Second pass scheduler. - if (getOptLevel() != CodeGenOpt::None) { + // Let Target optionally insert this pass by itself at some other + // point. + if (getOptLevel() != CodeGenOpt::None && + !TM->targetSchedulesPostRAScheduling()) { if (MISchedPostRA) addPass(&PostMachineSchedulerID); else diff --git a/lib/CodeGen/PeepholeOptimizer.cpp b/lib/CodeGen/PeepholeOptimizer.cpp index 4fd1c4bda433..f861edf7da25 100644 --- a/lib/CodeGen/PeepholeOptimizer.cpp +++ b/lib/CodeGen/PeepholeOptimizer.cpp @@ -98,6 +98,10 @@ static cl::opt DisableAdvCopyOpt("disable-adv-copy-opt", cl::Hidden, cl::init(false), cl::desc("Disable advanced copy optimization")); +static cl::opt DisableNAPhysCopyOpt( + "disable-non-allocatable-phys-copy-opt", cl::Hidden, cl::init(false), + cl::desc("Disable non-allocatable physical register copy optimization")); + // Limit the number of PHI instructions to process // in PeepholeOptimizer::getNextSource. static cl::opt RewritePHILimit( @@ -111,6 +115,7 @@ STATISTIC(NumLoadFold, "Number of loads folded"); STATISTIC(NumSelects, "Number of selects optimized"); STATISTIC(NumUncoalescableCopies, "Number of uncoalescable copies optimized"); STATISTIC(NumRewrittenCopies, "Number of copies rewritten"); +STATISTIC(NumNAPhysCopies, "Number of non-allocatable physical copies removed"); namespace { class ValueTrackerResult; @@ -162,12 +167,24 @@ namespace { DenseMap &ImmDefMIs); /// \brief If copy instruction \p MI is a virtual register copy, track it in - /// the set \p CopiedFromRegs and \p CopyMIs. If this virtual register was + /// the set \p CopySrcRegs and \p CopyMIs. 
If this virtual register was /// previously seen as a copy, replace the uses of this copy with the /// previously seen copy's destination register. bool foldRedundantCopy(MachineInstr *MI, - SmallSet &CopiedFromRegs, - DenseMap &CopyMIs); + SmallSet &CopySrcRegs, + DenseMap &CopyMIs); + + /// \brief Is the register \p Reg a non-allocatable physical register? + bool isNAPhysCopy(unsigned Reg); + + /// \brief If copy instruction \p MI is a non-allocatable virtual<->physical + /// register copy, track it in the \p NAPhysToVirtMIs map. If this + /// non-allocatable physical register was previously copied to a virtual + /// registered and hasn't been clobbered, the virt->phys copy can be + /// deleted. + bool foldRedundantNAPhysCopy( + MachineInstr *MI, + DenseMap &NAPhysToVirtMIs); bool isLoadFoldable(MachineInstr *MI, SmallSet &FoldAsLoadDefCandidates); @@ -1326,13 +1343,16 @@ bool PeepholeOptimizer::foldImmediate(MachineInstr *MI, MachineBasicBlock *MBB, MachineOperand &MO = MI->getOperand(i); if (!MO.isReg() || MO.isDef()) continue; + // Ignore dead implicit defs. + if (MO.isImplicit() && MO.isDead()) + continue; unsigned Reg = MO.getReg(); if (!TargetRegisterInfo::isVirtualRegister(Reg)) continue; if (ImmDefRegs.count(Reg) == 0) continue; DenseMap::iterator II = ImmDefMIs.find(Reg); - assert(II != ImmDefMIs.end()); + assert(II != ImmDefMIs.end() && "couldn't find immediate definition"); if (TII->FoldImmediate(MI, II->second, Reg, MRI)) { ++NumImmFold; return true; @@ -1356,10 +1376,10 @@ bool PeepholeOptimizer::foldImmediate(MachineInstr *MI, MachineBasicBlock *MBB, // // Should replace %vreg2 uses with %vreg1:sub1 bool PeepholeOptimizer::foldRedundantCopy( - MachineInstr *MI, - SmallSet &CopySrcRegs, - DenseMap &CopyMIs) { - assert(MI->isCopy()); + MachineInstr *MI, + SmallSet &CopySrcRegs, + DenseMap &CopyMIs) { + assert(MI->isCopy() && "expected a COPY machine instruction"); unsigned SrcReg = MI->getOperand(1).getReg(); if (!TargetRegisterInfo::isVirtualRegister(SrcReg)) @@ -1400,6 +1420,59 @@ bool PeepholeOptimizer::foldRedundantCopy( return true; } +bool PeepholeOptimizer::isNAPhysCopy(unsigned Reg) { + return TargetRegisterInfo::isPhysicalRegister(Reg) && + !MRI->isAllocatable(Reg); +} + +bool PeepholeOptimizer::foldRedundantNAPhysCopy( + MachineInstr *MI, DenseMap &NAPhysToVirtMIs) { + assert(MI->isCopy() && "expected a COPY machine instruction"); + + if (DisableNAPhysCopyOpt) + return false; + + unsigned DstReg = MI->getOperand(0).getReg(); + unsigned SrcReg = MI->getOperand(1).getReg(); + if (isNAPhysCopy(SrcReg) && TargetRegisterInfo::isVirtualRegister(DstReg)) { + // %vreg = COPY %PHYSREG + // Avoid using a datastructure which can track multiple live non-allocatable + // phys->virt copies since LLVM doesn't seem to do this. + NAPhysToVirtMIs.insert({SrcReg, MI}); + return false; + } + + if (!(TargetRegisterInfo::isVirtualRegister(SrcReg) && isNAPhysCopy(DstReg))) + return false; + + // %PHYSREG = COPY %vreg + auto PrevCopy = NAPhysToVirtMIs.find(DstReg); + if (PrevCopy == NAPhysToVirtMIs.end()) { + // We can't remove the copy: there was an intervening clobber of the + // non-allocatable physical register after the copy to virtual. 
+ DEBUG(dbgs() << "NAPhysCopy: intervening clobber forbids erasing " << *MI + << '\n'); + return false; + } + + unsigned PrevDstReg = PrevCopy->second->getOperand(0).getReg(); + if (PrevDstReg == SrcReg) { + // Remove the virt->phys copy: we saw the virtual register definition, and + // the non-allocatable physical register's state hasn't changed since then. + DEBUG(dbgs() << "NAPhysCopy: erasing " << *MI << '\n'); + ++NumNAPhysCopies; + return true; + } + + // Potential missed optimization opportunity: we saw a different virtual + // register get a copy of the non-allocatable physical register, and we only + // track one such copy. Avoid getting confused by this new non-allocatable + // physical register definition, and remove it from the tracked copies. + DEBUG(dbgs() << "NAPhysCopy: missed opportunity " << *MI << '\n'); + NAPhysToVirtMIs.erase(PrevCopy); + return false; +} + bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) { if (skipOptnoneFunction(*MF.getFunction())) return false; @@ -1433,6 +1506,13 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) { DenseMap ImmDefMIs; SmallSet FoldAsLoadDefCandidates; + // Track when a non-allocatable physical register is copied to a virtual + // register so that useless moves can be removed. + // + // %PHYSREG is the map index; MI is the last valid `%vreg = COPY %PHYSREG` + // without any intervening re-definition of %PHYSREG. + DenseMap NAPhysToVirtMIs; + // Set of virtual registers that are copied from. SmallSet CopySrcRegs; DenseMap CopySrcMIs; @@ -1453,11 +1533,52 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) { if (MI->isLoadFoldBarrier()) FoldAsLoadDefCandidates.clear(); - if (MI->isPosition() || MI->isPHI() || MI->isImplicitDef() || - MI->isKill() || MI->isInlineAsm() || - MI->hasUnmodeledSideEffects()) + if (MI->isPosition() || MI->isPHI()) continue; + if (!MI->isCopy()) { + for (const auto &Op : MI->operands()) { + // Visit all operands: definitions can be implicit or explicit. + if (Op.isReg()) { + unsigned Reg = Op.getReg(); + if (Op.isDef() && isNAPhysCopy(Reg)) { + const auto &Def = NAPhysToVirtMIs.find(Reg); + if (Def != NAPhysToVirtMIs.end()) { + // A new definition of the non-allocatable physical register + // invalidates previous copies. + DEBUG(dbgs() << "NAPhysCopy: invalidating because of " << *MI + << '\n'); + NAPhysToVirtMIs.erase(Def); + } + } + } else if (Op.isRegMask()) { + const uint32_t *RegMask = Op.getRegMask(); + for (auto &RegMI : NAPhysToVirtMIs) { + unsigned Def = RegMI.first; + if (MachineOperand::clobbersPhysReg(RegMask, Def)) { + DEBUG(dbgs() << "NAPhysCopy: invalidating because of " << *MI + << '\n'); + NAPhysToVirtMIs.erase(Def); + } + } + } + } + } + + if (MI->isImplicitDef() || MI->isKill()) + continue; + + if (MI->isInlineAsm() || MI->hasUnmodeledSideEffects()) { + // Blow away all non-allocatable physical registers knowledge since we + // don't know what's correct anymore. + // + // FIXME: handle explicit asm clobbers. 
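A compact standalone model of the NAPhysToVirtMIs bookkeeping that foldRedundantNAPhysCopy above relies on, with plain integers standing in for registers and instructions rather than LLVM's types:

#include <cstdio>
#include <map>

typedef unsigned Reg;
struct CopyInst { Reg Dst, Src; };

// Last seen `%vreg = COPY %physreg` per non-allocatable physical register.
static std::map<Reg, CopyInst> NAPhysToVirt;

// Record a phys->virt copy so a later virt->phys copy can be recognized.
static void recordPhysToVirt(const CopyInst &MI) { NAPhysToVirt[MI.Src] = MI; }

// A redefinition or clobber of the physical register invalidates the record.
static void invalidate(Reg PhysReg) { NAPhysToVirt.erase(PhysReg); }

// Return true if `%physreg = COPY %vreg` copies back the value we already
// know is sitting in that physical register, i.e. the copy is redundant.
static bool isRedundantVirtToPhys(const CopyInst &MI) {
  auto It = NAPhysToVirt.find(MI.Dst);
  return It != NAPhysToVirt.end() && It->second.Dst == MI.Src;
}

int main() {
  recordPhysToVirt({/*Dst=*/100, /*Src=*/1});            // %100 = COPY %phys1
  std::printf("%d\n", isRedundantVirtToPhys({1, 100}));  // 1: foldable
  invalidate(1);                           // something clobbered %phys1
  std::printf("%d\n", isRedundantVirtToPhys({1, 100}));  // 0: must keep the copy
}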
+ DEBUG(dbgs() << "NAPhysCopy: blowing away all info due to " << *MI + << '\n'); + NAPhysToVirtMIs.clear(); + continue; + } + if ((isUncoalescableCopy(*MI) && optimizeUncoalescableCopy(MI, LocalMIs)) || (MI->isCompare() && optimizeCmpInstr(MI, MBB)) || @@ -1479,7 +1600,9 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) { continue; } - if (MI->isCopy() && foldRedundantCopy(MI, CopySrcRegs, CopySrcMIs)) { + if (MI->isCopy() && + (foldRedundantCopy(MI, CopySrcRegs, CopySrcMIs) || + foldRedundantNAPhysCopy(MI, NAPhysToVirtMIs))) { LocalMIs.erase(MI); MI->eraseFromParent(); Changed = true; @@ -1583,6 +1706,9 @@ ValueTrackerResult ValueTracker::getNextSourceFromBitcast() { const MachineOperand &MO = Def->getOperand(OpIdx); if (!MO.isReg() || !MO.getReg()) continue; + // Ignore dead implicit defs. + if (MO.isImplicit() && MO.isDead()) + continue; assert(!MO.isDef() && "We should have skipped all the definitions by now"); if (SrcIdx != EndOpIdx) // Multiple sources? diff --git a/lib/CodeGen/RegAllocPBQP.cpp b/lib/CodeGen/RegAllocPBQP.cpp index afa98b26d727..fd28b05ed80a 100644 --- a/lib/CodeGen/RegAllocPBQP.cpp +++ b/lib/CodeGen/RegAllocPBQP.cpp @@ -47,6 +47,7 @@ #include "llvm/IR/Module.h" #include "llvm/Support/Debug.h" #include "llvm/Support/FileSystem.h" +#include "llvm/Support/Printable.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" @@ -805,33 +806,17 @@ bool RegAllocPBQP::runOnMachineFunction(MachineFunction &MF) { return true; } -namespace { -// A helper class for printing node and register info in a consistent way -class PrintNodeInfo { -public: - typedef PBQP::RegAlloc::PBQPRAGraph Graph; - typedef PBQP::RegAlloc::PBQPRAGraph::NodeId NodeId; - - PrintNodeInfo(NodeId NId, const Graph &G) : G(G), NId(NId) {} - - void print(raw_ostream &OS) const { +/// Create Printable object for node and register info. 
+static Printable PrintNodeInfo(PBQP::RegAlloc::PBQPRAGraph::NodeId NId, + const PBQP::RegAlloc::PBQPRAGraph &G) { + return Printable([NId, &G](raw_ostream &OS) { const MachineRegisterInfo &MRI = G.getMetadata().MF.getRegInfo(); const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo(); unsigned VReg = G.getNodeMetadata(NId).getVReg(); const char *RegClassName = TRI->getRegClassName(MRI.getRegClass(VReg)); OS << NId << " (" << RegClassName << ':' << PrintReg(VReg, TRI) << ')'; - } - -private: - const Graph &G; - NodeId NId; -}; - -inline raw_ostream &operator<<(raw_ostream &OS, const PrintNodeInfo &PR) { - PR.print(OS); - return OS; + }); } -} // anonymous namespace void PBQP::RegAlloc::PBQPRAGraph::dump(raw_ostream &OS) const { for (auto NId : nodeIds()) { diff --git a/lib/CodeGen/RegisterPressure.cpp b/lib/CodeGen/RegisterPressure.cpp index 6e7feb5178ee..8382b0912bde 100644 --- a/lib/CodeGen/RegisterPressure.cpp +++ b/lib/CodeGen/RegisterPressure.cpp @@ -171,10 +171,10 @@ void LiveRegSet::clear() { Regs.clear(); } -const LiveRange *RegPressureTracker::getLiveRange(unsigned Reg) const { +static const LiveRange *getLiveRange(const LiveIntervals &LIS, unsigned Reg) { if (TargetRegisterInfo::isVirtualRegister(Reg)) - return &LIS->getInterval(Reg); - return LIS->getCachedRegUnit(Reg); + return &LIS.getInterval(Reg); + return LIS.getCachedRegUnit(Reg); } void RegPressureTracker::reset() { @@ -323,6 +323,10 @@ class RegisterOperands { void collect(const MachineInstr &MI, const TargetRegisterInfo &TRI, const MachineRegisterInfo &MRI, bool IgnoreDead = false); + + /// Use liveness information to find dead defs not marked with a dead flag + /// and move them to the DeadDefs vector. + void detectDeadDefs(const MachineInstr &MI, const LiveIntervals &LIS); }; /// Collect this instruction's unique uses and defs into SmallVectors for @@ -393,6 +397,27 @@ void RegisterOperands::collect(const MachineInstr &MI, Collector.collectInstr(MI); } +void RegisterOperands::detectDeadDefs(const MachineInstr &MI, + const LiveIntervals &LIS) { + SlotIndex SlotIdx = LIS.getInstructionIndex(&MI); + for (SmallVectorImpl::iterator RI = Defs.begin(); + RI != Defs.end(); /*empty*/) { + unsigned Reg = *RI; + const LiveRange *LR = getLiveRange(LIS, Reg); + if (LR != nullptr) { + LiveQueryResult LRQ = LR->Query(SlotIdx); + if (LRQ.isDeadDef()) { + // LiveIntervals knows this is a dead even though it's MachineOperand is + // not flagged as such. + DeadDefs.push_back(Reg); + RI = Defs.erase(RI); + continue; + } + } + ++RI; + } +} + } // namespace /// Initialize an array of N PressureDiffs. @@ -514,8 +539,11 @@ void RegPressureTracker::recede(SmallVectorImpl *LiveUses, if (RequireIntervals && isTopClosed()) static_cast(P).openTop(SlotIdx); + const MachineInstr &MI = *CurrPos; RegisterOperands RegOpers; - RegOpers.collect(*CurrPos, *TRI, *MRI); + RegOpers.collect(MI, *TRI, *MRI); + if (RequireIntervals) + RegOpers.detectDeadDefs(MI, *LIS); if (PDiff) collectPDiff(*PDiff, RegOpers, MRI); @@ -527,26 +555,10 @@ void RegPressureTracker::recede(SmallVectorImpl *LiveUses, // Kill liveness at live defs. // TODO: consider earlyclobbers? for (unsigned Reg : RegOpers.Defs) { - bool DeadDef = false; - if (RequireIntervals) { - const LiveRange *LR = getLiveRange(Reg); - if (LR) { - LiveQueryResult LRQ = LR->Query(SlotIdx); - DeadDef = LRQ.isDeadDef(); - } - } - if (DeadDef) { - // LiveIntervals knows this is a dead even though it's MachineOperand is - // not flagged as such. 
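The RegAllocPBQP hunk above replaces a one-off printer class with a function returning a Printable that wraps a lambda. A minimal standalone analogue of that idiom (std::ostream instead of raw_ostream, not LLVM's actual Printable class):

#include <functional>
#include <iostream>

// Wrap a print callback so it can be streamed with operator<<, instead of
// defining a dedicated printer class with its own operator<<.
struct Printable {
  std::function<void(std::ostream &)> Print;
  explicit Printable(std::function<void(std::ostream &)> P)
      : Print(std::move(P)) {}
};

static std::ostream &operator<<(std::ostream &OS, const Printable &P) {
  P.Print(OS);
  return OS;
}

// Hypothetical node-info printer in the spirit of PrintNodeInfo above.
static Printable printNodeInfo(unsigned NodeId, unsigned VReg) {
  return Printable([NodeId, VReg](std::ostream &OS) {
    OS << NodeId << " (vreg" << VReg << ')';
  });
}

int main() { std::cout << printNodeInfo(3, 42) << "\n"; }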
Since this register will not be recorded as - // live-out, increase its PDiff value to avoid underflowing pressure. - if (PDiff) - PDiff->addPressureChange(Reg, false, MRI); - } else { - if (LiveRegs.erase(Reg)) - decreaseRegPressure(Reg); - else - discoverLiveOut(Reg); - } + if (LiveRegs.erase(Reg)) + decreaseRegPressure(Reg); + else + discoverLiveOut(Reg); } // Generate liveness for uses. @@ -554,7 +566,7 @@ void RegPressureTracker::recede(SmallVectorImpl *LiveUses, if (!LiveRegs.contains(Reg)) { // Adjust liveouts if LiveIntervals are available. if (RequireIntervals) { - const LiveRange *LR = getLiveRange(Reg); + const LiveRange *LR = getLiveRange(*LIS, Reg); if (LR) { LiveQueryResult LRQ = LR->Query(SlotIdx); if (!LRQ.isKill() && !LRQ.valueDefined()) @@ -606,7 +618,7 @@ void RegPressureTracker::advance() { // Kill liveness at last uses. bool lastUse = false; if (RequireIntervals) { - const LiveRange *LR = getLiveRange(Reg); + const LiveRange *LR = getLiveRange(*LIS, Reg); lastUse = LR && LR->Query(SlotIdx).isKill(); } else { // Allocatable physregs are always single-use before register rewriting. @@ -726,22 +738,13 @@ void RegPressureTracker::bumpUpwardPressure(const MachineInstr *MI) { RegisterOperands RegOpers; RegOpers.collect(*MI, *TRI, *MRI, /*IgnoreDead=*/true); assert(RegOpers.DeadDefs.size() == 0); + if (RequireIntervals) + RegOpers.detectDeadDefs(*MI, *LIS); // Kill liveness at live defs. for (unsigned Reg : RegOpers.Defs) { - bool DeadDef = false; - if (RequireIntervals) { - const LiveRange *LR = getLiveRange(Reg); - if (LR) { - SlotIndex SlotIdx = LIS->getInstructionIndex(MI); - LiveQueryResult LRQ = LR->Query(SlotIdx); - DeadDef = LRQ.isDeadDef(); - } - } - if (!DeadDef) { - if (!containsReg(RegOpers.Uses, Reg)) - decreaseRegPressure(Reg); - } + if (!containsReg(RegOpers.Uses, Reg)) + decreaseRegPressure(Reg); } // Generate liveness for uses. for (unsigned Reg : RegOpers.Uses) { @@ -926,7 +929,7 @@ void RegPressureTracker::bumpDownwardPressure(const MachineInstr *MI) { // FIXME: allow the caller to pass in the list of vreg uses that remain // to be bottom-scheduled to avoid searching uses at each query. 
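RegisterOperands::detectDeadDefs above partitions Defs into live and dead definitions using a liveness query. A standalone sketch of that partitioning, with a callback standing in for the LiveIntervals lookup:

#include <cstdio>
#include <functional>
#include <vector>

typedef unsigned Reg;

// Move registers whose definition is dead at this instruction from Defs into
// DeadDefs, in the spirit of RegisterOperands::detectDeadDefs above. The
// isDeadDef callback stands in for the LiveIntervals/LiveQueryResult check.
static void detectDeadDefs(std::vector<Reg> &Defs, std::vector<Reg> &DeadDefs,
                           const std::function<bool(Reg)> &isDeadDef) {
  for (auto I = Defs.begin(); I != Defs.end(); /*empty*/) {
    if (isDeadDef(*I)) {
      DeadDefs.push_back(*I);
      I = Defs.erase(I);
    } else {
      ++I;
    }
  }
}

int main() {
  std::vector<Reg> Defs = {1, 2, 3}, DeadDefs;
  detectDeadDefs(Defs, DeadDefs, [](Reg R) { return R == 2; });
  std::printf("live defs: %zu, dead defs: %zu\n", Defs.size(), DeadDefs.size());
}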
SlotIndex CurrIdx = getCurrSlot(); - const LiveRange *LR = getLiveRange(Reg); + const LiveRange *LR = getLiveRange(*LIS, Reg); if (LR) { LiveQueryResult LRQ = LR->Query(SlotIdx); if (LRQ.isKill() && !findUseBetween(Reg, CurrIdx, SlotIdx, *MRI, LIS)) diff --git a/lib/CodeGen/ScheduleDAGInstrs.cpp b/lib/CodeGen/ScheduleDAGInstrs.cpp index 12b2beb357b4..fb82ab7a5555 100644 --- a/lib/CodeGen/ScheduleDAGInstrs.cpp +++ b/lib/CodeGen/ScheduleDAGInstrs.cpp @@ -13,12 +13,12 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/ScheduleDAGInstrs.h" +#include "llvm/ADT/IntEqClasses.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -51,11 +51,10 @@ static cl::opt UseTBAA("use-tbaa-in-sched-mi", cl::Hidden, ScheduleDAGInstrs::ScheduleDAGInstrs(MachineFunction &mf, const MachineLoopInfo *mli, - LiveIntervals *LIS, bool RemoveKillFlags) - : ScheduleDAG(mf), MLI(mli), MFI(mf.getFrameInfo()), LIS(LIS), + : ScheduleDAG(mf), MLI(mli), MFI(mf.getFrameInfo()), RemoveKillFlags(RemoveKillFlags), CanHandleTerminators(false), - FirstDbgValue(nullptr) { + TrackLaneMasks(false), FirstDbgValue(nullptr) { DbgValues.clear(); const TargetSubtargetInfo &ST = mf.getSubtarget(); @@ -363,6 +362,20 @@ void ScheduleDAGInstrs::addPhysRegDeps(SUnit *SU, unsigned OperIdx) { } } +LaneBitmask ScheduleDAGInstrs::getLaneMaskForMO(const MachineOperand &MO) const +{ + unsigned Reg = MO.getReg(); + // No point in tracking lanemasks if we don't have interesting subregisters. + const TargetRegisterClass &RC = *MRI.getRegClass(Reg); + if (!RC.HasDisjunctSubRegs) + return ~0u; + + unsigned SubReg = MO.getSubReg(); + if (SubReg == 0) + return RC.getLaneMask(); + return TRI->getSubRegIndexLaneMask(SubReg); +} + /// addVRegDefDeps - Add register output and data dependencies from this SUnit /// to instructions that occur later in the same scheduling region if they read /// from or write to the virtual register defined at OperIdx. @@ -370,35 +383,106 @@ void ScheduleDAGInstrs::addPhysRegDeps(SUnit *SU, unsigned OperIdx) { /// TODO: Hoist loop induction variable increments. This has to be /// reevaluated. Generally, IV scheduling should be done before coalescing. void ScheduleDAGInstrs::addVRegDefDeps(SUnit *SU, unsigned OperIdx) { - const MachineInstr *MI = SU->getInstr(); - unsigned Reg = MI->getOperand(OperIdx).getReg(); + MachineInstr *MI = SU->getInstr(); + MachineOperand &MO = MI->getOperand(OperIdx); + unsigned Reg = MO.getReg(); + + LaneBitmask DefLaneMask; + LaneBitmask KillLaneMask; + if (TrackLaneMasks) { + bool IsKill = MO.getSubReg() == 0 || MO.isUndef(); + DefLaneMask = getLaneMaskForMO(MO); + // If we have a flag, none of the lane values comes from an + // earlier instruction. + KillLaneMask = IsKill ? ~0u : DefLaneMask; + + // Clear undef flag, we'll re-add it later once we know which subregister + // Def is first. + MO.setIsUndef(false); + } else { + DefLaneMask = ~0u; + KillLaneMask = ~0u; + } + + if (MO.isDead()) { + assert(CurrentVRegUses.find(Reg) == CurrentVRegUses.end() && + "Dead defs should have no uses"); + } else { + // Add data dependence to all uses we found so far. 
+ const TargetSubtargetInfo &ST = MF.getSubtarget(); + for (VReg2SUnitOperIdxMultiMap::iterator I = CurrentVRegUses.find(Reg), + E = CurrentVRegUses.end(); I != E; /*empty*/) { + LaneBitmask LaneMask = I->LaneMask; + // Ignore uses of other lanes. + if ((LaneMask & KillLaneMask) == 0) { + ++I; + continue; + } - // Singly defined vregs do not have output/anti dependencies. - // The current operand is a def, so we have at least one. - // Check here if there are any others... + if ((LaneMask & DefLaneMask) != 0) { + SUnit *UseSU = I->SU; + MachineInstr *Use = UseSU->getInstr(); + SDep Dep(SU, SDep::Data, Reg); + Dep.setLatency(SchedModel.computeOperandLatency(MI, OperIdx, Use, + I->OperandIndex)); + ST.adjustSchedDependency(SU, UseSU, Dep); + UseSU->addPred(Dep); + } + + LaneMask &= ~KillLaneMask; + // If we found a Def for all lanes of this use, remove it from the list. + if (LaneMask != 0) { + I->LaneMask = LaneMask; + ++I; + } else + I = CurrentVRegUses.erase(I); + } + } + + // Shortcut: Singly defined vregs do not have output/anti dependencies. if (MRI.hasOneDef(Reg)) return; - // Add output dependence to the next nearest def of this vreg. + // Add output dependence to the next nearest defs of this vreg. // // Unless this definition is dead, the output dependence should be // transitively redundant with antidependencies from this definition's // uses. We're conservative for now until we have a way to guarantee the uses // are not eliminated sometime during scheduling. The output dependence edge // is also useful if output latency exceeds def-use latency. - VReg2SUnitMap::iterator DefI = VRegDefs.find(Reg); - if (DefI == VRegDefs.end()) - VRegDefs.insert(VReg2SUnit(Reg, SU)); - else { - SUnit *DefSU = DefI->SU; - if (DefSU != SU && DefSU != &ExitSU) { - SDep Dep(SU, SDep::Output, Reg); - Dep.setLatency( - SchedModel.computeOutputLatency(MI, OperIdx, DefSU->getInstr())); - DefSU->addPred(Dep); - } - DefI->SU = SU; + LaneBitmask LaneMask = DefLaneMask; + for (VReg2SUnit &V2SU : make_range(CurrentVRegDefs.find(Reg), + CurrentVRegDefs.end())) { + // Ignore defs for other lanes. + if ((V2SU.LaneMask & LaneMask) == 0) + continue; + // Add an output dependence. + SUnit *DefSU = V2SU.SU; + // Ignore additional defs of the same lanes in one instruction. This can + // happen because lanemasks are shared for targets with too many + // subregisters. We also use some representration tricks/hacks where we + // add super-register defs/uses, to imply that although we only access parts + // of the reg we care about the full one. + if (DefSU == SU) + continue; + SDep Dep(SU, SDep::Output, Reg); + Dep.setLatency( + SchedModel.computeOutputLatency(MI, OperIdx, DefSU->getInstr())); + DefSU->addPred(Dep); + + // Update current definition. This can get tricky if the def was about a + // bigger lanemask before. We then have to shrink it and create a new + // VReg2SUnit for the non-overlapping part. + LaneBitmask OverlapMask = V2SU.LaneMask & LaneMask; + LaneBitmask NonOverlapMask = V2SU.LaneMask & ~LaneMask; + if (NonOverlapMask != 0) + CurrentVRegDefs.insert(VReg2SUnit(Reg, NonOverlapMask, V2SU.SU)); + V2SU.SU = SU; + V2SU.LaneMask = OverlapMask; } + // If there was no CurrentVRegDefs entry for some lanes yet, create one. + if (LaneMask != 0) + CurrentVRegDefs.insert(VReg2SUnit(Reg, LaneMask, SU)); } /// addVRegUseDeps - Add a register data dependency if the instruction that @@ -408,49 +492,26 @@ void ScheduleDAGInstrs::addVRegDefDeps(SUnit *SU, unsigned OperIdx) { /// /// TODO: Handle ExitSU "uses" properly. 
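The lane-mask handling in addVRegDefDeps above matches each pending use against the lanes a def writes and trims the use's mask until it is fully covered. A greatly simplified standalone sketch of that matching (a single mask per def, no separate KillLaneMask):

#include <cstdint>
#include <cstdio>
#include <vector>

typedef uint32_t LaneBitmask; // one bit per (group of) subregister lanes

struct PendingUse { unsigned UseId; LaneBitmask Lanes; };

// When a def covering DefMask is seen, report a data dependence to every
// pending use that reads one of those lanes, trim the lanes the def covered,
// and drop uses whose lanes are now all accounted for.
static void processDef(LaneBitmask DefMask, std::vector<PendingUse> &Uses) {
  for (auto I = Uses.begin(); I != Uses.end(); /*empty*/) {
    if (I->Lanes & DefMask)
      std::printf("data dep: def -> use %u\n", I->UseId);
    I->Lanes &= ~DefMask;
    if (I->Lanes == 0)
      I = Uses.erase(I);
    else
      ++I;
  }
}

int main() {
  std::vector<PendingUse> Uses = {{0, 0x3}, {1, 0x4}};
  processDef(0x1, Uses); // covers the low lane of use 0 only
  processDef(0x6, Uses); // covers the rest of use 0 and all of use 1
  std::printf("remaining uses: %zu\n", Uses.size());
}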
void ScheduleDAGInstrs::addVRegUseDeps(SUnit *SU, unsigned OperIdx) { - MachineInstr *MI = SU->getInstr(); - unsigned Reg = MI->getOperand(OperIdx).getReg(); + const MachineInstr *MI = SU->getInstr(); + const MachineOperand &MO = MI->getOperand(OperIdx); + unsigned Reg = MO.getReg(); + + // Remember the use. Data dependencies will be added when we find the def. + LaneBitmask LaneMask = TrackLaneMasks ? getLaneMaskForMO(MO) : ~0u; + CurrentVRegUses.insert(VReg2SUnitOperIdx(Reg, LaneMask, OperIdx, SU)); + + // Add antidependences to the following defs of the vreg. + for (VReg2SUnit &V2SU : make_range(CurrentVRegDefs.find(Reg), + CurrentVRegDefs.end())) { + // Ignore defs for unrelated lanes. + LaneBitmask PrevDefLaneMask = V2SU.LaneMask; + if ((PrevDefLaneMask & LaneMask) == 0) + continue; + if (V2SU.SU == SU) + continue; - // Record this local VReg use. - VReg2UseMap::iterator UI = VRegUses.find(Reg); - for (; UI != VRegUses.end(); ++UI) { - if (UI->SU == SU) - break; - } - if (UI == VRegUses.end()) - VRegUses.insert(VReg2SUnit(Reg, SU)); - - // Lookup this operand's reaching definition. - assert(LIS && "vreg dependencies requires LiveIntervals"); - LiveQueryResult LRQ - = LIS->getInterval(Reg).Query(LIS->getInstructionIndex(MI)); - VNInfo *VNI = LRQ.valueIn(); - - // VNI will be valid because MachineOperand::readsReg() is checked by caller. - assert(VNI && "No value to read by operand"); - MachineInstr *Def = LIS->getInstructionFromIndex(VNI->def); - // Phis and other noninstructions (after coalescing) have a NULL Def. - if (Def) { - SUnit *DefSU = getSUnit(Def); - if (DefSU) { - // The reaching Def lives within this scheduling region. - // Create a data dependence. - SDep dep(DefSU, SDep::Data, Reg); - // Adjust the dependence latency using operand def/use information, then - // allow the target to perform its own adjustments. - int DefOp = Def->findRegisterDefOperandIdx(Reg); - dep.setLatency(SchedModel.computeOperandLatency(Def, DefOp, MI, OperIdx)); - - const TargetSubtargetInfo &ST = MF.getSubtarget(); - ST.adjustSchedDependency(DefSU, SU, const_cast(dep)); - SU->addPred(dep); - } + V2SU.SU->addPred(SDep(SU, SDep::Anti, Reg)); } - - // Add antidependence to the following def of the vreg it uses. - VReg2SUnitMap::iterator DefI = VRegDefs.find(Reg); - if (DefI != VRegDefs.end() && DefI->SU != SU) - DefI->SU->addPred(SDep(SU, SDep::Anti, Reg)); } /// Return true if MI is an instruction we are unable to reason about @@ -733,17 +794,44 @@ void ScheduleDAGInstrs::initSUnits() { } } +void ScheduleDAGInstrs::collectVRegUses(SUnit *SU) { + const MachineInstr *MI = SU->getInstr(); + for (const MachineOperand &MO : MI->operands()) { + if (!MO.isReg()) + continue; + if (!MO.readsReg()) + continue; + if (TrackLaneMasks && !MO.isUse()) + continue; + + unsigned Reg = MO.getReg(); + if (!TargetRegisterInfo::isVirtualRegister(Reg)) + continue; + + // Record this local VReg use. + VReg2SUnitMultiMap::iterator UI = VRegUses.find(Reg); + for (; UI != VRegUses.end(); ++UI) { + if (UI->SU == SU) + break; + } + if (UI == VRegUses.end()) + VRegUses.insert(VReg2SUnit(Reg, 0, SU)); + } +} + /// If RegPressure is non-null, compute register pressure as a side effect. The /// DAG builder is an efficient place to do it because it already visits /// operands. 
void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA, RegPressureTracker *RPTracker, - PressureDiffs *PDiffs) { + PressureDiffs *PDiffs, + bool TrackLaneMasks) { const TargetSubtargetInfo &ST = MF.getSubtarget(); bool UseAA = EnableAASchedMI.getNumOccurrences() > 0 ? EnableAASchedMI : ST.useAA(); AliasAnalysis *AAForDep = UseAA ? AA : nullptr; + this->TrackLaneMasks = TrackLaneMasks; MISUnitMap.clear(); ScheduleDAG::clearDAG(); @@ -777,10 +865,14 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA, Defs.setUniverse(TRI->getNumRegs()); Uses.setUniverse(TRI->getNumRegs()); - assert(VRegDefs.empty() && "Only BuildSchedGraph may access VRegDefs"); + assert(CurrentVRegDefs.empty() && "nobody else should use CurrentVRegDefs"); + assert(CurrentVRegUses.empty() && "nobody else should use CurrentVRegUses"); + unsigned NumVirtRegs = MRI.getNumVirtRegs(); + CurrentVRegDefs.setUniverse(NumVirtRegs); + CurrentVRegUses.setUniverse(NumVirtRegs); + VRegUses.clear(); - VRegDefs.setUniverse(MRI.getNumVirtRegs()); - VRegUses.setUniverse(MRI.getNumVirtRegs()); + VRegUses.setUniverse(NumVirtRegs); // Model data dependencies between instructions being scheduled and the // ExitSU. @@ -808,6 +900,7 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA, RPTracker->recede(/*LiveUses=*/nullptr, PDiff); assert(RPTracker->getPos() == std::prev(MII) && "RPTracker can't find MI"); + collectVRegUses(SU); } assert( @@ -1057,7 +1150,8 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA, Defs.clear(); Uses.clear(); - VRegDefs.clear(); + CurrentVRegDefs.clear(); + CurrentVRegUses.clear(); PendingLoads.clear(); } diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 2941a7eb3aed..0872d7a9a228 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -3888,10 +3888,10 @@ static bool matchRotateSub(SDValue Pos, SDValue Neg, unsigned EltSize) { // Check whether Neg has the form (sub NegC, NegOp1) for some NegC and NegOp1. if (Neg.getOpcode() != ISD::SUB) - return 0; + return false; ConstantSDNode *NegC = isConstOrConstSplat(Neg.getOperand(0)); if (!NegC) - return 0; + return false; SDValue NegOp1 = Neg.getOperand(1); // On the RHS of [A], if Pos is Pos' & (EltSize - 1), just replace Pos with @@ -7244,6 +7244,12 @@ SDValue DAGCombiner::CombineConsecutiveLoads(SDNode *N, EVT VT) { return SDValue(); } +static unsigned getPPCf128HiElementSelector(const SelectionDAG &DAG) { + // On little-endian machines, bitcasting from ppcf128 to i128 does swap the Hi + // and Lo parts; on big-endian machines it doesn't. + return DAG.getDataLayout().isBigEndian() ? 1 : 0; +} + SDValue DAGCombiner::visitBITCAST(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); @@ -7310,6 +7316,14 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) { // fold (bitconvert (fneg x)) -> (xor (bitconvert x), signbit) // fold (bitconvert (fabs x)) -> (and (bitconvert x), (not signbit)) + // + // For ppc_fp128: + // fold (bitcast (fneg x)) -> + // flipbit = signbit + // (xor (bitcast x) (build_pair flipbit, flipbit)) + // fold (bitcast (fabs x)) -> + // flipbit = (and (extract_element (bitcast x), 0), signbit) + // (xor (bitcast x) (build_pair flipbit, flipbit)) // This often reduces constant pool loads. 
if (((N0.getOpcode() == ISD::FNEG && !TLI.isFNegFree(N0.getValueType())) || (N0.getOpcode() == ISD::FABS && !TLI.isFAbsFree(N0.getValueType()))) && @@ -7320,6 +7334,29 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) { AddToWorklist(NewConv.getNode()); SDLoc DL(N); + if (N0.getValueType() == MVT::ppcf128 && !LegalTypes) { + assert(VT.getSizeInBits() == 128); + SDValue SignBit = DAG.getConstant( + APInt::getSignBit(VT.getSizeInBits() / 2), SDLoc(N0), MVT::i64); + SDValue FlipBit; + if (N0.getOpcode() == ISD::FNEG) { + FlipBit = SignBit; + AddToWorklist(FlipBit.getNode()); + } else { + assert(N0.getOpcode() == ISD::FABS); + SDValue Hi = + DAG.getNode(ISD::EXTRACT_ELEMENT, SDLoc(NewConv), MVT::i64, NewConv, + DAG.getIntPtrConstant(getPPCf128HiElementSelector(DAG), + SDLoc(NewConv))); + AddToWorklist(Hi.getNode()); + FlipBit = DAG.getNode(ISD::AND, SDLoc(N0), MVT::i64, Hi, SignBit); + AddToWorklist(FlipBit.getNode()); + } + SDValue FlipBits = + DAG.getNode(ISD::BUILD_PAIR, SDLoc(N0), VT, FlipBit, FlipBit); + AddToWorklist(FlipBits.getNode()); + return DAG.getNode(ISD::XOR, DL, VT, NewConv, FlipBits); + } APInt SignBit = APInt::getSignBit(VT.getSizeInBits()); if (N0.getOpcode() == ISD::FNEG) return DAG.getNode(ISD::XOR, DL, VT, @@ -7333,6 +7370,13 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) { // (or (and (bitconvert x), sign), (and cst, (not sign))) // Note that we don't handle (copysign x, cst) because this can always be // folded to an fneg or fabs. + // + // For ppc_fp128: + // fold (bitcast (fcopysign cst, x)) -> + // flipbit = (and (extract_element + // (xor (bitcast cst), (bitcast x)), 0), + // signbit) + // (xor (bitcast cst) (build_pair flipbit, flipbit)) if (N0.getOpcode() == ISD::FCOPYSIGN && N0.getNode()->hasOneUse() && isa(N0.getOperand(0)) && VT.isInteger() && !VT.isVector()) { @@ -7361,6 +7405,30 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) { AddToWorklist(X.getNode()); } + if (N0.getValueType() == MVT::ppcf128 && !LegalTypes) { + APInt SignBit = APInt::getSignBit(VT.getSizeInBits() / 2); + SDValue Cst = DAG.getNode(ISD::BITCAST, SDLoc(N0.getOperand(0)), VT, + N0.getOperand(0)); + AddToWorklist(Cst.getNode()); + SDValue X = DAG.getNode(ISD::BITCAST, SDLoc(N0.getOperand(1)), VT, + N0.getOperand(1)); + AddToWorklist(X.getNode()); + SDValue XorResult = DAG.getNode(ISD::XOR, SDLoc(N0), VT, Cst, X); + AddToWorklist(XorResult.getNode()); + SDValue XorResult64 = DAG.getNode( + ISD::EXTRACT_ELEMENT, SDLoc(XorResult), MVT::i64, XorResult, + DAG.getIntPtrConstant(getPPCf128HiElementSelector(DAG), + SDLoc(XorResult))); + AddToWorklist(XorResult64.getNode()); + SDValue FlipBit = + DAG.getNode(ISD::AND, SDLoc(XorResult64), MVT::i64, XorResult64, + DAG.getConstant(SignBit, SDLoc(XorResult64), MVT::i64)); + AddToWorklist(FlipBit.getNode()); + SDValue FlipBits = + DAG.getNode(ISD::BUILD_PAIR, SDLoc(N0), VT, FlipBit, FlipBit); + AddToWorklist(FlipBits.getNode()); + return DAG.getNode(ISD::XOR, SDLoc(N), VT, Cst, FlipBits); + } APInt SignBit = APInt::getSignBit(VT.getSizeInBits()); X = DAG.getNode(ISD::AND, SDLoc(X), VT, X, DAG.getConstant(SignBit, SDLoc(X), VT)); @@ -8722,6 +8790,22 @@ SDValue DAGCombiner::visitFSQRT(SDNode *N) { ZeroCmp, Zero, RV); } +static inline bool CanCombineFCOPYSIGN_EXTEND_ROUND(SDNode *N) { + // copysign(x, fp_extend(y)) -> copysign(x, y) + // copysign(x, fp_round(y)) -> copysign(x, y) + // Do not optimize out type conversion of f128 type yet. 
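The ppc_fp128 folds in visitBITCAST above implement fneg and fabs by XORing a sign bit into both 64-bit halves of the double-double value. A standalone model of that bit manipulation on a plain two-double struct (illustrative only, not the DAG nodes themselves):

#include <cstdint>
#include <cstdio>
#include <cstring>

// Model of the ppc_fp128 "double-double" format: the value is Hi + Lo.
struct PPCF128 { double Hi, Lo; };

static uint64_t bits(double D) {
  uint64_t B;
  std::memcpy(&B, &D, sizeof(B));
  return B;
}
static double fromBits(uint64_t B) {
  double D;
  std::memcpy(&D, &B, sizeof(D));
  return D;
}

// fneg: flip the sign bit of both halves, matching the
// (xor (bitcast x) (build_pair signbit, signbit)) fold above.
static PPCF128 negatePPCF128(PPCF128 X) {
  const uint64_t SignBit = 1ULL << 63;
  return {fromBits(bits(X.Hi) ^ SignBit), fromBits(bits(X.Lo) ^ SignBit)};
}

// fabs: flip both halves only when Hi is negative, i.e. XOR with
// flipbit = (sign bit of Hi), as in the FABS fold above.
static PPCF128 absPPCF128(PPCF128 X) {
  const uint64_t SignBit = 1ULL << 63;
  uint64_t FlipBit = bits(X.Hi) & SignBit;
  return {fromBits(bits(X.Hi) ^ FlipBit), fromBits(bits(X.Lo) ^ FlipBit)};
}

int main() {
  PPCF128 X = {-3.0, 0.125}; // represents -3.0 + 0.125
  PPCF128 N = negatePPCF128(X), A = absPPCF128(X);
  std::printf("neg: %g %g\n", N.Hi, N.Lo); // 3 -0.125
  std::printf("abs: %g %g\n", A.Hi, A.Lo); // 3 -0.125
}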
+ // For some target like x86_64, configuration is changed + // to keep one f128 value in one SSE register, but + // instruction selection cannot handle FCOPYSIGN on + // SSE registers yet. + SDValue N1 = N->getOperand(1); + EVT N1VT = N1->getValueType(0); + EVT N1Op0VT = N1->getOperand(0)->getValueType(0); + return (N1.getOpcode() == ISD::FP_EXTEND || + N1.getOpcode() == ISD::FP_ROUND) && + (N1VT == N1Op0VT || N1Op0VT != MVT::f128); +} + SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -8765,7 +8849,7 @@ SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) { // copysign(x, fp_extend(y)) -> copysign(x, y) // copysign(x, fp_round(y)) -> copysign(x, y) - if (N1.getOpcode() == ISD::FP_EXTEND || N1.getOpcode() == ISD::FP_ROUND) + if (CanCombineFCOPYSIGN_EXTEND_ROUND(N)) return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, N0, N1.getOperand(0)); diff --git a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp index 5ec10308dc28..a1e2d410ab00 100644 --- a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp +++ b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp @@ -139,7 +139,7 @@ EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone, bool IsCloned, UseRC = RC; else if (RC) { const TargetRegisterClass *ComRC = - TRI->getCommonSubClass(UseRC, RC); + TRI->getCommonSubClass(UseRC, RC, VT.SimpleTy); // If multiple uses expect disjoint register classes, we emit // copies in AddRegisterOperand. if (ComRC) diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 8238cdeb59ca..f46767f6c4a1 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -154,6 +154,7 @@ class SelectionDAGLegalize { SDValue ExpandVectorBuildThroughStack(SDNode* Node); SDValue ExpandConstantFP(ConstantFPSDNode *CFP, bool UseCP); + SDValue ExpandConstant(ConstantSDNode *CP); // if ExpandNode returns false, LegalizeOp falls back to ConvertNodeToLibcall bool ExpandNode(SDNode *Node); @@ -294,6 +295,20 @@ SelectionDAGLegalize::ExpandConstantFP(ConstantFPSDNode *CFP, bool UseCP) { return Result; } +/// Expands the Constant node to a load from the constant pool. +SDValue SelectionDAGLegalize::ExpandConstant(ConstantSDNode *CP) { + SDLoc dl(CP); + EVT VT = CP->getValueType(0); + SDValue CPIdx = DAG.getConstantPool(CP->getConstantIntValue(), + TLI.getPointerTy(DAG.getDataLayout())); + unsigned Alignment = cast(CPIdx)->getAlignment(); + SDValue Result = + DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), + false, false, false, Alignment); + return Result; +} + /// Expands an unaligned store to 2 half-size stores. 
static void ExpandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG, const TargetLowering &TLI, @@ -1192,15 +1207,17 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) { #ifndef NDEBUG for (unsigned i = 0, e = Node->getNumValues(); i != e; ++i) - assert(TLI.getTypeAction(*DAG.getContext(), Node->getValueType(i)) == - TargetLowering::TypeLegal && + assert((TLI.getTypeAction(*DAG.getContext(), Node->getValueType(i)) == + TargetLowering::TypeLegal || + TLI.isTypeLegal(Node->getValueType(i))) && "Unexpected illegal type!"); for (const SDValue &Op : Node->op_values()) - assert((TLI.getTypeAction(*DAG.getContext(), - Op.getValueType()) == TargetLowering::TypeLegal || - Op.getOpcode() == ISD::TargetConstant) && - "Unexpected illegal type!"); + assert((TLI.getTypeAction(*DAG.getContext(), Op.getValueType()) == + TargetLowering::TypeLegal || + TLI.isTypeLegal(Op.getValueType()) || + Op.getOpcode() == ISD::TargetConstant) && + "Unexpected illegal type!"); #endif // Figure out the correct action; the way to query this varies by opcode @@ -1446,6 +1463,11 @@ SDValue SelectionDAGLegalize::ExpandExtractFromVectorThroughStack(SDValue Op) { // series of EXTRACT_VECTOR_ELT nodes are generated, one for each element in // the vector. If all are expanded here, we don't want one store per vector // element. + + // Caches for hasPredecessorHelper + SmallPtrSet Visited; + SmallVector Worklist; + SDValue StackPtr, Ch; for (SDNode::use_iterator UI = Vec.getNode()->use_begin(), UE = Vec.getNode()->use_end(); UI != UE; ++UI) { @@ -1460,6 +1482,12 @@ SDValue SelectionDAGLegalize::ExpandExtractFromVectorThroughStack(SDValue Op) { if (!ST->getChain().reachesChainWithoutSideEffects(DAG.getEntryNode())) continue; + // If the index is dependent on the store we will introduce a cycle when + // creating the load (the load uses the index, and by replacing the chain + // we will make the index dependent on the load). + if (Idx.getNode()->hasPredecessorHelper(ST, Visited, Worklist)) + continue; + StackPtr = ST->getBasePtr(); Ch = SDValue(ST, 0); break; @@ -3390,6 +3418,11 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { Results.push_back(ExpandConstantFP(CFP, true)); break; } + case ISD::Constant: { + ConstantSDNode *CP = cast(Node); + Results.push_back(ExpandConstant(CP)); + break; + } case ISD::FSUB: { EVT VT = Node->getValueType(0); if (TLI.isOperationLegalOrCustom(ISD::FADD, VT) && diff --git a/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp index 97e88bf84a70..bb150f726c23 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp @@ -43,10 +43,10 @@ static RTLIB::Libcall GetFPLibCall(EVT VT, } //===----------------------------------------------------------------------===// -// Result Float to Integer Conversion. +// Convert Float Results to Integer for Non-HW-supported Operations. 
//===----------------------------------------------------------------------===// -void DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) { +bool DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) { DEBUG(dbgs() << "Soften float result " << ResNo << ": "; N->dump(&DAG); dbgs() << "\n"); SDValue R = SDValue(); @@ -59,20 +59,26 @@ void DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) { #endif llvm_unreachable("Do not know how to soften the result of this operator!"); + case ISD::Register: + case ISD::CopyFromReg: + case ISD::CopyToReg: + assert(isLegalInHWReg(N->getValueType(ResNo)) && + "Unsupported SoftenFloatRes opcode!"); + // Only when isLegalInHWReg, we can skip check of the operands. + R = SDValue(N, ResNo); + break; case ISD::MERGE_VALUES:R = SoftenFloatRes_MERGE_VALUES(N, ResNo); break; - case ISD::BITCAST: R = SoftenFloatRes_BITCAST(N); break; + case ISD::BITCAST: R = SoftenFloatRes_BITCAST(N, ResNo); break; case ISD::BUILD_PAIR: R = SoftenFloatRes_BUILD_PAIR(N); break; - case ISD::ConstantFP: - R = SoftenFloatRes_ConstantFP(cast(N)); - break; + case ISD::ConstantFP: R = SoftenFloatRes_ConstantFP(N, ResNo); break; case ISD::EXTRACT_VECTOR_ELT: R = SoftenFloatRes_EXTRACT_VECTOR_ELT(N); break; - case ISD::FABS: R = SoftenFloatRes_FABS(N); break; + case ISD::FABS: R = SoftenFloatRes_FABS(N, ResNo); break; case ISD::FMINNUM: R = SoftenFloatRes_FMINNUM(N); break; case ISD::FMAXNUM: R = SoftenFloatRes_FMAXNUM(N); break; case ISD::FADD: R = SoftenFloatRes_FADD(N); break; case ISD::FCEIL: R = SoftenFloatRes_FCEIL(N); break; - case ISD::FCOPYSIGN: R = SoftenFloatRes_FCOPYSIGN(N); break; + case ISD::FCOPYSIGN: R = SoftenFloatRes_FCOPYSIGN(N, ResNo); break; case ISD::FCOS: R = SoftenFloatRes_FCOS(N); break; case ISD::FDIV: R = SoftenFloatRes_FDIV(N); break; case ISD::FEXP: R = SoftenFloatRes_FEXP(N); break; @@ -84,7 +90,7 @@ void DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) { case ISD::FMA: R = SoftenFloatRes_FMA(N); break; case ISD::FMUL: R = SoftenFloatRes_FMUL(N); break; case ISD::FNEARBYINT: R = SoftenFloatRes_FNEARBYINT(N); break; - case ISD::FNEG: R = SoftenFloatRes_FNEG(N); break; + case ISD::FNEG: R = SoftenFloatRes_FNEG(N, ResNo); break; case ISD::FP_EXTEND: R = SoftenFloatRes_FP_EXTEND(N); break; case ISD::FP_ROUND: R = SoftenFloatRes_FP_ROUND(N); break; case ISD::FP16_TO_FP: R = SoftenFloatRes_FP16_TO_FP(N); break; @@ -97,9 +103,9 @@ void DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) { case ISD::FSQRT: R = SoftenFloatRes_FSQRT(N); break; case ISD::FSUB: R = SoftenFloatRes_FSUB(N); break; case ISD::FTRUNC: R = SoftenFloatRes_FTRUNC(N); break; - case ISD::LOAD: R = SoftenFloatRes_LOAD(N); break; - case ISD::SELECT: R = SoftenFloatRes_SELECT(N); break; - case ISD::SELECT_CC: R = SoftenFloatRes_SELECT_CC(N); break; + case ISD::LOAD: R = SoftenFloatRes_LOAD(N, ResNo); break; + case ISD::SELECT: R = SoftenFloatRes_SELECT(N, ResNo); break; + case ISD::SELECT_CC: R = SoftenFloatRes_SELECT_CC(N, ResNo); break; case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: R = SoftenFloatRes_XINT_TO_FP(N); break; case ISD::UNDEF: R = SoftenFloatRes_UNDEF(N); break; @@ -107,11 +113,19 @@ void DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) { } // If R is null, the sub-method took care of registering the result. 
- if (R.getNode()) + if (R.getNode()) { SetSoftenedFloat(SDValue(N, ResNo), R); + ReplaceSoftenFloatResult(N, ResNo, R); + } + // Return true only if the node is changed, + // assuming that the operands are also converted when necessary. + // Otherwise, return false to tell caller to scan operands. + return R.getNode() && R.getNode() != N; } -SDValue DAGTypeLegalizer::SoftenFloatRes_BITCAST(SDNode *N) { +SDValue DAGTypeLegalizer::SoftenFloatRes_BITCAST(SDNode *N, unsigned ResNo) { + if (isLegalInHWReg(N->getValueType(ResNo))) + return SDValue(N, ResNo); return BitConvertToInteger(N->getOperand(0)); } @@ -130,10 +144,14 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_BUILD_PAIR(SDNode *N) { BitConvertToInteger(N->getOperand(1))); } -SDValue DAGTypeLegalizer::SoftenFloatRes_ConstantFP(ConstantFPSDNode *N) { - return DAG.getConstant(N->getValueAPF().bitcastToAPInt(), SDLoc(N), +SDValue DAGTypeLegalizer::SoftenFloatRes_ConstantFP(SDNode *N, unsigned ResNo) { + // When LegalInHWReg, we can load better from the constant pool. + if (isLegalInHWReg(N->getValueType(ResNo))) + return SDValue(N, ResNo); + ConstantFPSDNode *CN = cast(N); + return DAG.getConstant(CN->getValueAPF().bitcastToAPInt(), SDLoc(CN), TLI.getTypeToTransformTo(*DAG.getContext(), - N->getValueType(0))); + CN->getValueType(0))); } SDValue DAGTypeLegalizer::SoftenFloatRes_EXTRACT_VECTOR_ELT(SDNode *N) { @@ -143,7 +161,10 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_EXTRACT_VECTOR_ELT(SDNode *N) { NewOp, N->getOperand(1)); } -SDValue DAGTypeLegalizer::SoftenFloatRes_FABS(SDNode *N) { +SDValue DAGTypeLegalizer::SoftenFloatRes_FABS(SDNode *N, unsigned ResNo) { + // When LegalInHWReg, FABS can be implemented as native bitwise operations. + if (isLegalInHWReg(N->getValueType(ResNo))) + return SDValue(N, ResNo); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); unsigned Size = NVT.getSizeInBits(); @@ -206,7 +227,10 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FCEIL(SDNode *N) { NVT, Op, false, SDLoc(N)).first; } -SDValue DAGTypeLegalizer::SoftenFloatRes_FCOPYSIGN(SDNode *N) { +SDValue DAGTypeLegalizer::SoftenFloatRes_FCOPYSIGN(SDNode *N, unsigned ResNo) { + // When LegalInHWReg, FCOPYSIGN can be implemented as native bitwise operations. + if (isLegalInHWReg(N->getValueType(ResNo))) + return SDValue(N, ResNo); SDValue LHS = GetSoftenedFloat(N->getOperand(0)); SDValue RHS = BitConvertToInteger(N->getOperand(1)); SDLoc dl(N); @@ -390,7 +414,10 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FNEARBYINT(SDNode *N) { NVT, Op, false, SDLoc(N)).first; } -SDValue DAGTypeLegalizer::SoftenFloatRes_FNEG(SDNode *N) { +SDValue DAGTypeLegalizer::SoftenFloatRes_FNEG(SDNode *N, unsigned ResNo) { + // When LegalInHWReg, FNEG can be implemented as native bitwise operations. 
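The early-return paths above lean on the fact that fabs, fneg and copysign reduce to sign-bit manipulation on the value's integer image, so a type that can stay in a hardware register can lower them without a libcall. A minimal standalone f32 sketch of that property (plain C++, no LLVM APIs; the helper names below are invented for illustration):

#include <cstdint>
#include <cstdio>
#include <cstring>

// View a float's bits as an integer and back, the scalar analogue of what
// BitConvertToInteger does for a softened f32.
static uint32_t toBits(float F) { uint32_t B; std::memcpy(&B, &F, sizeof B); return B; }
static float fromBits(uint32_t B) { float F; std::memcpy(&F, &B, sizeof F); return F; }

static float fabsBits(float X) { return fromBits(toBits(X) & 0x7fffffffu); }  // clear sign bit
static float fnegBits(float X) { return fromBits(toBits(X) ^ 0x80000000u); }  // flip sign bit
static float fcopysignBits(float Mag, float Sgn) {                            // magnitude of Mag, sign of Sgn
  return fromBits((toBits(Mag) & 0x7fffffffu) | (toBits(Sgn) & 0x80000000u));
}

int main() {
  std::printf("%g %g %g\n", fabsBits(-2.5f), fnegBits(2.5f),
              fcopysignBits(3.0f, -1.0f)); // prints: 2.5 -2.5 -3
  return 0;
}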
+ if (isLegalInHWReg(N->getValueType(ResNo))) + return SDValue(N, ResNo); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDLoc dl(N); // Expand Y = FNEG(X) -> Y = SUB -0.0, X @@ -580,7 +607,8 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FTRUNC(SDNode *N) { NVT, Op, false, SDLoc(N)).first; } -SDValue DAGTypeLegalizer::SoftenFloatRes_LOAD(SDNode *N) { +SDValue DAGTypeLegalizer::SoftenFloatRes_LOAD(SDNode *N, unsigned ResNo) { + bool LegalInHWReg = isLegalInHWReg(N->getValueType(ResNo)); LoadSDNode *L = cast(N); EVT VT = N->getValueType(0); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); @@ -595,7 +623,8 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_LOAD(SDNode *N) { L->getAAInfo()); // Legalized the chain result - switch anything that used the old chain to // use the new one. - ReplaceValueWith(SDValue(N, 1), NewL.getValue(1)); + if (N != NewL.getValue(1).getNode()) + ReplaceValueWith(SDValue(N, 1), NewL.getValue(1)); return NewL; } @@ -609,17 +638,24 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_LOAD(SDNode *N) { // Legalized the chain result - switch anything that used the old chain to // use the new one. ReplaceValueWith(SDValue(N, 1), NewL.getValue(1)); - return BitConvertToInteger(DAG.getNode(ISD::FP_EXTEND, dl, VT, NewL)); + auto ExtendNode = DAG.getNode(ISD::FP_EXTEND, dl, VT, NewL); + if (LegalInHWReg) + return ExtendNode; + return BitConvertToInteger(ExtendNode); } -SDValue DAGTypeLegalizer::SoftenFloatRes_SELECT(SDNode *N) { +SDValue DAGTypeLegalizer::SoftenFloatRes_SELECT(SDNode *N, unsigned ResNo) { + if (isLegalInHWReg(N->getValueType(ResNo))) + return SDValue(N, ResNo); SDValue LHS = GetSoftenedFloat(N->getOperand(1)); SDValue RHS = GetSoftenedFloat(N->getOperand(2)); return DAG.getSelect(SDLoc(N), LHS.getValueType(), N->getOperand(0), LHS, RHS); } -SDValue DAGTypeLegalizer::SoftenFloatRes_SELECT_CC(SDNode *N) { +SDValue DAGTypeLegalizer::SoftenFloatRes_SELECT_CC(SDNode *N, unsigned ResNo) { + if (isLegalInHWReg(N->getValueType(ResNo))) + return SDValue(N, ResNo); SDValue LHS = GetSoftenedFloat(N->getOperand(2)); SDValue RHS = GetSoftenedFloat(N->getOperand(3)); return DAG.getNode(ISD::SELECT_CC, SDLoc(N), @@ -645,7 +681,8 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_VAARG(SDNode *N) { // Legalized the chain result - switch anything that used the old chain to // use the new one. - ReplaceValueWith(SDValue(N, 1), NewVAARG.getValue(1)); + if (N != NewVAARG.getValue(1).getNode()) + ReplaceValueWith(SDValue(N, 1), NewVAARG.getValue(1)); return NewVAARG; } @@ -679,7 +716,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_XINT_TO_FP(SDNode *N) { //===----------------------------------------------------------------------===// -// Operand Float to Integer Conversion.. +// Convert Float Operand to Integer for Non-HW-supported Operations. 
//===----------------------------------------------------------------------===// bool DAGTypeLegalizer::SoftenFloatOperand(SDNode *N, unsigned OpNo) { @@ -689,6 +726,8 @@ bool DAGTypeLegalizer::SoftenFloatOperand(SDNode *N, unsigned OpNo) { switch (N->getOpcode()) { default: + if (CanSkipSoftenFloatOperand(N, OpNo)) + return false; #ifndef NDEBUG dbgs() << "SoftenFloatOperand Op #" << OpNo << ": "; N->dump(&DAG); dbgs() << "\n"; @@ -704,14 +743,23 @@ bool DAGTypeLegalizer::SoftenFloatOperand(SDNode *N, unsigned OpNo) { case ISD::FP_TO_UINT: Res = SoftenFloatOp_FP_TO_UINT(N); break; case ISD::SELECT_CC: Res = SoftenFloatOp_SELECT_CC(N); break; case ISD::SETCC: Res = SoftenFloatOp_SETCC(N); break; - case ISD::STORE: Res = SoftenFloatOp_STORE(N, OpNo); break; + case ISD::STORE: + Res = SoftenFloatOp_STORE(N, OpNo); + // Do not try to analyze or soften this node again if the value is + // or can be held in a register. In that case, Res.getNode() should + // be equal to N. + if (Res.getNode() == N && + isLegalInHWReg(N->getOperand(OpNo).getValueType())) + return false; + // Otherwise, we need to reanalyze and lower the new Res nodes. + break; } // If the result is null, the sub-method took care of registering results etc. if (!Res.getNode()) return false; // If the result is N, the sub-method updated N in place. Tell the legalizer - // core about this. + // core about this to re-analyze. if (Res.getNode() == N) return true; @@ -722,6 +770,41 @@ bool DAGTypeLegalizer::SoftenFloatOperand(SDNode *N, unsigned OpNo) { return false; } +bool DAGTypeLegalizer::CanSkipSoftenFloatOperand(SDNode *N, unsigned OpNo) { + if (!isLegalInHWReg(N->getOperand(OpNo).getValueType())) + return false; + // When the operand type can be kept in registers, SoftenFloatResult + // will call ReplaceValueWith to replace all references and we can + // skip softening this operand. + switch (N->getOperand(OpNo).getOpcode()) { + case ISD::BITCAST: + case ISD::ConstantFP: + case ISD::CopyFromReg: + case ISD::CopyToReg: + case ISD::FABS: + case ISD::FCOPYSIGN: + case ISD::FNEG: + case ISD::Register: + case ISD::SELECT: + case ISD::SELECT_CC: + return true; + } + // For some opcodes, SoftenFloatResult handles all conversion of softening + // and replacing operands, so that there is no need to soften operands + // again, although such opcode could be scanned for other illegal operands. + switch (N->getOpcode()) { + case ISD::ConstantFP: + case ISD::CopyFromReg: + case ISD::CopyToReg: + case ISD::FABS: + case ISD::FCOPYSIGN: + case ISD::FNEG: + case ISD::Register: + return true; + } + return false; +} + SDValue DAGTypeLegalizer::SoftenFloatOp_BITCAST(SDNode *N) { return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getValueType(0), GetSoftenedFloat(N->getOperand(0))); diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp index 337cbe7fc598..d6b4f7921f2b 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp @@ -238,9 +238,13 @@ bool DAGTypeLegalizer::run() { Changed = true; goto NodeDone; case TargetLowering::TypeSoftenFloat: - SoftenFloatResult(N, i); - Changed = true; - goto NodeDone; + Changed = SoftenFloatResult(N, i); + if (Changed) + goto NodeDone; + // If not changed, the result type should be legally in register. 
+ assert(isLegalInHWReg(ResultVT) && + "Unchanged SoftenFloatResult should be legal in register!"); + goto ScanOperands; case TargetLowering::TypeExpandFloat: ExpandFloatResult(N, i); Changed = true; @@ -411,18 +415,27 @@ bool DAGTypeLegalizer::run() { bool Failed = false; // Check that all result types are legal. + // A value type is illegal if its TypeAction is not TypeLegal, + // and TLI.RegClassForVT does not have a register class for this type. + // For example, the x86_64 target has f128 that is not TypeLegal, + // to have softened operators, but it also has FR128 register class to + // pass and return f128 values. Hence a legalized node can have f128 type. if (!IgnoreNodeResults(&Node)) for (unsigned i = 0, NumVals = Node.getNumValues(); i < NumVals; ++i) - if (!isTypeLegal(Node.getValueType(i))) { - dbgs() << "Result type " << i << " illegal!\n"; + if (!isTypeLegal(Node.getValueType(i)) && + !TLI.isTypeLegal(Node.getValueType(i))) { + dbgs() << "Result type " << i << " illegal: "; + Node.dump(); Failed = true; } // Check that all operand types are legal. for (unsigned i = 0, NumOps = Node.getNumOperands(); i < NumOps; ++i) if (!IgnoreNodeResults(Node.getOperand(i).getNode()) && - !isTypeLegal(Node.getOperand(i).getValueType())) { - dbgs() << "Operand type " << i << " illegal!\n"; + !isTypeLegal(Node.getOperand(i).getValueType()) && + !TLI.isTypeLegal(Node.getOperand(i).getValueType())) { + dbgs() << "Operand type " << i << " illegal: "; + Node.getOperand(i).dump(); Failed = true; } @@ -748,13 +761,23 @@ void DAGTypeLegalizer::SetPromotedInteger(SDValue Op, SDValue Result) { } void DAGTypeLegalizer::SetSoftenedFloat(SDValue Op, SDValue Result) { - assert(Result.getValueType() == - TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType()) && + // f128 of x86_64 could be kept in SSE registers, + // but sometimes softened to i128. + assert((Result.getValueType() == + TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType()) || + Op.getValueType() == + TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType())) && "Invalid type for softened float"); AnalyzeNewValue(Result); SDValue &OpEntry = SoftenedFloats[Op]; - assert(!OpEntry.getNode() && "Node is already converted to integer!"); + // Allow repeated calls to save f128 type nodes + // or any node with type that transforms to itself. + // Many operations on these types are not softened. + assert((!OpEntry.getNode()|| + Op.getValueType() == + TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType())) && + "Node is already converted to integer!"); OpEntry = Result; } diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/lib/CodeGen/SelectionDAG/LegalizeTypes.h index 4e4740f1f9cb..84ea374345e9 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -72,6 +72,20 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { return TLI.getTypeAction(*DAG.getContext(), VT) == TargetLowering::TypeLegal; } + /// isSimpleLegalType - Return true if this is a simple legal type. + bool isSimpleLegalType(EVT VT) const { + return VT.isSimple() && TLI.isTypeLegal(VT); + } + + /// isLegalInHWReg - Return true if this type can be passed in registers. + /// For example, x86_64's f128, should to be legally in registers + /// and only some operations converted to library calls or integer + /// bitwise operations. 
+ bool isLegalInHWReg(EVT VT) const { + EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); + return VT == NVT && isSimpleLegalType(VT); + } + EVT getSetCCResultType(EVT VT) const { return TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); } @@ -372,32 +386,48 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { // Float to Integer Conversion Support: LegalizeFloatTypes.cpp //===--------------------------------------------------------------------===// - /// GetSoftenedFloat - Given a processed operand Op which was converted to an - /// integer of the same size, this returns the integer. The integer contains - /// exactly the same bits as Op - only the type changed. For example, if Op - /// is an f32 which was softened to an i32, then this method returns an i32, - /// the bits of which coincide with those of Op. + /// GetSoftenedFloat - Given an operand Op of Float type, returns the integer + /// if the Op is not supported in target HW and converted to the integer. + /// The integer contains exactly the same bits as Op - only the type changed. + /// For example, if Op is an f32 which was softened to an i32, then this method + /// returns an i32, the bits of which coincide with those of Op. + /// If the Op can be efficiently supported in target HW or the operand must + /// stay in a register, the Op is not converted to an integer. + /// In that case, the given op is returned. SDValue GetSoftenedFloat(SDValue Op) { SDValue &SoftenedOp = SoftenedFloats[Op]; + if (!SoftenedOp.getNode() && + isSimpleLegalType(Op.getValueType())) + return Op; RemapValue(SoftenedOp); assert(SoftenedOp.getNode() && "Operand wasn't converted to integer?"); return SoftenedOp; } void SetSoftenedFloat(SDValue Op, SDValue Result); - // Result Float to Integer Conversion. - void SoftenFloatResult(SDNode *N, unsigned OpNo); + // Call ReplaceValueWith(SDValue(N, ResNo), Res) if necessary. + void ReplaceSoftenFloatResult(SDNode *N, unsigned ResNo, SDValue &NewRes) { + // When the result type can be kept in HW registers, the converted + // NewRes node could have the same type. We can save the effort in + // cloning every user of N in SoftenFloatOperand or other legalization functions, + // by calling ReplaceValueWith here to update all users. + if (NewRes.getNode() != N && isLegalInHWReg(N->getValueType(ResNo))) + ReplaceValueWith(SDValue(N, ResNo), NewRes); + } + + // Convert Float Results to Integer for Non-HW-supported Operations. 
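As a rough standalone model of the isLegalInHWReg predicate above: a type stays in a hardware register exactly when the legalizer's transform-to mapping sends it to itself, otherwise it is softened to the mapped integer type. The table in this sketch is purely illustrative and is not queried from any real target:

#include <cstdio>
#include <map>
#include <string>

int main() {
  // Hypothetical "type to transform to" entries: the f128 row models a target
  // with a 128-bit FP register class, the f32 row a soft-float configuration.
  std::map<std::string, std::string> TypeToTransformTo = {
      {"f32", "i32"}, {"f128", "f128"}};
  for (const auto &Entry : TypeToTransformTo) {
    bool LegalInHWReg = (Entry.first == Entry.second);
    std::printf("%-5s -> %-5s : %s\n", Entry.first.c_str(),
                Entry.second.c_str(),
                LegalInHWReg ? "kept in a register" : "softened to integer");
  }
  return 0;
}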
+ bool SoftenFloatResult(SDNode *N, unsigned ResNo); SDValue SoftenFloatRes_MERGE_VALUES(SDNode *N, unsigned ResNo); - SDValue SoftenFloatRes_BITCAST(SDNode *N); + SDValue SoftenFloatRes_BITCAST(SDNode *N, unsigned ResNo); SDValue SoftenFloatRes_BUILD_PAIR(SDNode *N); - SDValue SoftenFloatRes_ConstantFP(ConstantFPSDNode *N); + SDValue SoftenFloatRes_ConstantFP(SDNode *N, unsigned ResNo); SDValue SoftenFloatRes_EXTRACT_VECTOR_ELT(SDNode *N); - SDValue SoftenFloatRes_FABS(SDNode *N); + SDValue SoftenFloatRes_FABS(SDNode *N, unsigned ResNo); SDValue SoftenFloatRes_FMINNUM(SDNode *N); SDValue SoftenFloatRes_FMAXNUM(SDNode *N); SDValue SoftenFloatRes_FADD(SDNode *N); SDValue SoftenFloatRes_FCEIL(SDNode *N); - SDValue SoftenFloatRes_FCOPYSIGN(SDNode *N); + SDValue SoftenFloatRes_FCOPYSIGN(SDNode *N, unsigned ResNo); SDValue SoftenFloatRes_FCOS(SDNode *N); SDValue SoftenFloatRes_FDIV(SDNode *N); SDValue SoftenFloatRes_FEXP(SDNode *N); @@ -409,7 +439,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue SoftenFloatRes_FMA(SDNode *N); SDValue SoftenFloatRes_FMUL(SDNode *N); SDValue SoftenFloatRes_FNEARBYINT(SDNode *N); - SDValue SoftenFloatRes_FNEG(SDNode *N); + SDValue SoftenFloatRes_FNEG(SDNode *N, unsigned ResNo); SDValue SoftenFloatRes_FP_EXTEND(SDNode *N); SDValue SoftenFloatRes_FP16_TO_FP(SDNode *N); SDValue SoftenFloatRes_FP_ROUND(SDNode *N); @@ -422,14 +452,19 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue SoftenFloatRes_FSQRT(SDNode *N); SDValue SoftenFloatRes_FSUB(SDNode *N); SDValue SoftenFloatRes_FTRUNC(SDNode *N); - SDValue SoftenFloatRes_LOAD(SDNode *N); - SDValue SoftenFloatRes_SELECT(SDNode *N); - SDValue SoftenFloatRes_SELECT_CC(SDNode *N); + SDValue SoftenFloatRes_LOAD(SDNode *N, unsigned ResNo); + SDValue SoftenFloatRes_SELECT(SDNode *N, unsigned ResNo); + SDValue SoftenFloatRes_SELECT_CC(SDNode *N, unsigned ResNo); SDValue SoftenFloatRes_UNDEF(SDNode *N); SDValue SoftenFloatRes_VAARG(SDNode *N); SDValue SoftenFloatRes_XINT_TO_FP(SDNode *N); - // Operand Float to Integer Conversion. + // Return true if we can skip softening the given operand or SDNode because + // it was soften before by SoftenFloatResult and references to the operand + // were replaced by ReplaceValueWith. + bool CanSkipSoftenFloatOperand(SDNode *N, unsigned OpNo); + + // Convert Float Operand to Integer for Non-HW-supported Operations. bool SoftenFloatOperand(SDNode *N, unsigned OpNo); SDValue SoftenFloatOp_BITCAST(SDNode *N); SDValue SoftenFloatOp_BR_CC(SDNode *N); diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp index eb545982ed02..593c346df770 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp @@ -53,12 +53,17 @@ void DAGTypeLegalizer::ExpandRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi) { case TargetLowering::TypePromoteFloat: llvm_unreachable("Bitcast of a promotion-needing float should never need" "expansion"); - case TargetLowering::TypeSoftenFloat: - // Convert the integer operand instead. - SplitInteger(GetSoftenedFloat(InOp), Lo, Hi); + case TargetLowering::TypeSoftenFloat: { + // Expand the floating point operand only if it was converted to integers. + // Otherwise, it is a legal type like f128 that can be saved in a register. 
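When the softened operand in the TypeSoftenFloat case above did get converted to an integer, the code that follows splits it into low and high halves before bitcasting each half. Roughly what SplitInteger amounts to for a 64-bit value, as a standalone sketch:

#include <cinttypes>
#include <cstdint>
#include <cstdio>

int main() {
  uint64_t Bits = 0x3FF8000000000000ull;            // bit pattern of the double 1.5
  uint32_t Lo = static_cast<uint32_t>(Bits);        // low half
  uint32_t Hi = static_cast<uint32_t>(Bits >> 32);  // high half
  std::printf("lo=0x%08" PRIX32 " hi=0x%08" PRIX32 "\n", Lo, Hi); // lo=0x00000000 hi=0x3FF80000
  return 0;
}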
+ auto SoftenedOp = GetSoftenedFloat(InOp); + if (SoftenedOp == InOp) + break; + SplitInteger(SoftenedOp, Lo, Hi); Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo); Hi = DAG.getNode(ISD::BITCAST, dl, NOutVT, Hi); return; + } case TargetLowering::TypeExpandInteger: case TargetLowering::TypeExpandFloat: { auto &DL = DAG.getDataLayout(); diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index 1fb7b160a671..8295b2a19dd2 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -244,7 +244,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { Changed = true; return LegalizeOp(ExpandStore(Op)); } - } else if (Op.getOpcode() == ISD::MSCATTER) + } else if (Op.getOpcode() == ISD::MSCATTER || Op.getOpcode() == ISD::MSTORE) HasVectorValue = true; for (SDNode::value_iterator J = Node->value_begin(), E = Node->value_end(); @@ -344,6 +344,9 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { case ISD::MSCATTER: QueryType = cast(Node)->getValue().getValueType(); break; + case ISD::MSTORE: + QueryType = cast(Node)->getValue().getValueType(); + break; } switch (TLI.getOperationAction(Node->getOpcode(), QueryType)) { diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp index 34e1a7001082..62e7733ecd2b 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp @@ -440,7 +440,7 @@ static MVT getPhysicalRegisterVT(SDNode *N, unsigned Reg, const MCInstrDesc &MCID = TII->get(N->getMachineOpcode()); assert(MCID.ImplicitDefs && "Physical reg def must be in implicit def list!"); NumRes = MCID.getNumDefs(); - for (const uint16_t *ImpDef = MCID.getImplicitDefs(); *ImpDef; ++ImpDef) { + for (const MCPhysReg *ImpDef = MCID.getImplicitDefs(); *ImpDef; ++ImpDef) { if (Reg == *ImpDef) break; ++NumRes; @@ -519,7 +519,7 @@ bool ScheduleDAGFast::DelayForLiveRegsBottomUp(SUnit *SU, const MCInstrDesc &MCID = TII->get(Node->getMachineOpcode()); if (!MCID.ImplicitDefs) continue; - for (const uint16_t *Reg = MCID.getImplicitDefs(); *Reg; ++Reg) { + for (const MCPhysReg *Reg = MCID.getImplicitDefs(); *Reg; ++Reg) { CheckForLiveRegDef(SU, *Reg, LiveRegDefs, RegAdded, LRegs, TRI); } } diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp index 78985e01ef9a..91024e672f9c 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp @@ -1206,7 +1206,7 @@ static MVT getPhysicalRegisterVT(SDNode *N, unsigned Reg, const MCInstrDesc &MCID = TII->get(N->getMachineOpcode()); assert(MCID.ImplicitDefs && "Physical reg def must be in implicit def list!"); NumRes = MCID.getNumDefs(); - for (const uint16_t *ImpDef = MCID.getImplicitDefs(); *ImpDef; ++ImpDef) { + for (const MCPhysReg *ImpDef = MCID.getImplicitDefs(); *ImpDef; ++ImpDef) { if (Reg == *ImpDef) break; ++NumRes; @@ -1335,7 +1335,7 @@ DelayForLiveRegsBottomUp(SUnit *SU, SmallVectorImpl &LRegs) { const MCInstrDesc &MCID = TII->get(Node->getMachineOpcode()); if (!MCID.ImplicitDefs) continue; - for (const uint16_t *Reg = MCID.getImplicitDefs(); *Reg; ++Reg) + for (const MCPhysReg *Reg = MCID.getImplicitDefs(); *Reg; ++Reg) CheckForLiveRegDef(SU, *Reg, LiveRegDefs.get(), RegAdded, LRegs, TRI); } @@ -2720,7 +2720,7 @@ static bool canClobberReachingPhysRegUse(const SUnit *DepSU, const SUnit *SU, ScheduleDAGRRList *scheduleDAG, const TargetInstrInfo *TII, const 
TargetRegisterInfo *TRI) { - const uint16_t *ImpDefs + const MCPhysReg *ImpDefs = TII->get(SU->getNode()->getMachineOpcode()).getImplicitDefs(); const uint32_t *RegMask = getNodeRegMask(SU->getNode()); if(!ImpDefs && !RegMask) @@ -2739,7 +2739,7 @@ static bool canClobberReachingPhysRegUse(const SUnit *DepSU, const SUnit *SU, return true; if (ImpDefs) - for (const uint16_t *ImpDef = ImpDefs; *ImpDef; ++ImpDef) + for (const MCPhysReg *ImpDef = ImpDefs; *ImpDef; ++ImpDef) // Return true if SU clobbers this physical register use and the // definition of the register reaches from DepSU. IsReachable queries // a topological forward sort of the DAG (following the successors). @@ -2758,13 +2758,13 @@ static bool canClobberPhysRegDefs(const SUnit *SuccSU, const SUnit *SU, const TargetRegisterInfo *TRI) { SDNode *N = SuccSU->getNode(); unsigned NumDefs = TII->get(N->getMachineOpcode()).getNumDefs(); - const uint16_t *ImpDefs = TII->get(N->getMachineOpcode()).getImplicitDefs(); + const MCPhysReg *ImpDefs = TII->get(N->getMachineOpcode()).getImplicitDefs(); assert(ImpDefs && "Caller should check hasPhysRegDefs"); for (const SDNode *SUNode = SU->getNode(); SUNode; SUNode = SUNode->getGluedNode()) { if (!SUNode->isMachineOpcode()) continue; - const uint16_t *SUImpDefs = + const MCPhysReg *SUImpDefs = TII->get(SUNode->getMachineOpcode()).getImplicitDefs(); const uint32_t *SURegMask = getNodeRegMask(SUNode); if (!SUImpDefs && !SURegMask) diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 4b7887b26afe..4596b8eba1a0 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -2893,8 +2893,10 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, return getConstantFP(APFloat(APFloat::IEEEhalf, Val), DL, VT); if (VT == MVT::f32 && C->getValueType(0) == MVT::i32) return getConstantFP(APFloat(APFloat::IEEEsingle, Val), DL, VT); - else if (VT == MVT::f64 && C->getValueType(0) == MVT::i64) + if (VT == MVT::f64 && C->getValueType(0) == MVT::i64) return getConstantFP(APFloat(APFloat::IEEEdouble, Val), DL, VT); + if (VT == MVT::f128 && C->getValueType(0) == MVT::i128) + return getConstantFP(APFloat(APFloat::IEEEquad, Val), DL, VT); break; case ISD::BSWAP: return getConstant(Val.byteSwap(), DL, VT, C->isTargetOpcode(), @@ -3336,12 +3338,15 @@ SDValue SelectionDAG::FoldConstantVectorArithmetic(unsigned Opcode, SDLoc DL, !std::all_of(Ops.begin(), Ops.end(), IsScalarOrSameVectorSize)) return SDValue(); + // If we are comparing vectors, then the result needs to be a i1 boolean + // that is then sign-extended back to the legal result type. + EVT SVT = (Opcode == ISD::SETCC ? MVT::i1 : VT.getScalarType()); + // Find legal integer scalar type for constant promotion and // ensure that its scalar size is at least as large as source. - EVT SVT = VT.getScalarType(); - EVT LegalSVT = SVT; - if (SVT.isInteger()) { - LegalSVT = TLI->getTypeToTransformTo(*getContext(), SVT); + EVT LegalSVT = VT.getScalarType(); + if (LegalSVT.isInteger()) { + LegalSVT = TLI->getTypeToTransformTo(*getContext(), LegalSVT); if (LegalSVT.bitsLT(SVT)) return SDValue(); } @@ -3378,7 +3383,7 @@ SDValue SelectionDAG::FoldConstantVectorArithmetic(unsigned Opcode, SDLoc DL, // Legalize the (integer) scalar constant if necessary. 
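The i1-plus-sign-extension convention described above, and applied in the extension change right below, exists because vector compares fold to per-element i1 results that must widen to all-ones/all-zeros lane masks. A standalone sketch of why zero-extension would produce the wrong mask:

#include <cinttypes>
#include <cstdint>
#include <cstdio>

int main() {
  int8_t LaneTrue = 1;                                  // i1 "true" from a per-lane compare
  int32_t SextMask = -static_cast<int32_t>(LaneTrue);   // 0xFFFFFFFF, a usable lane mask
  int32_t ZextMask = static_cast<int32_t>(LaneTrue);    // 0x00000001, not a mask
  int32_t A = 42, B = 7;
  int32_t Blend = (A & SextMask) | (B & ~SextMask);     // mask-based select, yields 42
  std::printf("sext=0x%08" PRIX32 " zext=0x%08" PRIX32 " blend=%" PRId32 "\n",
              static_cast<uint32_t>(SextMask), static_cast<uint32_t>(ZextMask),
              Blend);
  return 0;
}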
if (LegalSVT != SVT) - ScalarResult = getNode(ISD::ANY_EXTEND, DL, LegalSVT, ScalarResult); + ScalarResult = getNode(ISD::SIGN_EXTEND, DL, LegalSVT, ScalarResult); // Scalar folding only succeeded if the result is a constant or UNDEF. if (ScalarResult.getOpcode() != ISD::UNDEF && diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 85e7e3c1bc8c..867b9562019b 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -8259,12 +8259,14 @@ void SelectionDAGBuilder::lowerWorkItem(SwitchWorkListItem W, Value *Cond, JumpProb += DefaultProb / 2; FallthroughProb -= DefaultProb / 2; JumpMBB->setSuccProbability(SI, DefaultProb / 2); + JumpMBB->normalizeSuccProbs(); break; } } addSuccessorWithProb(CurMBB, Fallthrough, FallthroughProb); addSuccessorWithProb(CurMBB, JumpMBB, JumpProb); + CurMBB->normalizeSuccProbs(); // The jump table header will be inserted in our current block, do the // range check, and fall through to our fallthrough block. diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index a6f9699bb29c..7c5492b554c2 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -22,6 +22,7 @@ #include "llvm/IR/Intrinsics.h" #include "llvm/Support/Debug.h" #include "llvm/Support/GraphWriter.h" +#include "llvm/Support/Printable.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetIntrinsicInfo.h" @@ -369,25 +370,14 @@ const char *SDNode::getIndexedModeName(ISD::MemIndexedMode AM) { } } -namespace { -class PrintNodeId { - const SDNode &Node; -public: - explicit PrintNodeId(const SDNode &Node) - : Node(Node) {} - void print(raw_ostream &OS) const { +static Printable PrintNodeId(const SDNode &Node) { + return Printable([&Node](raw_ostream &OS) { #ifndef NDEBUG OS << 't' << Node.PersistentId; #else OS << (const void*)&Node; #endif - } -}; - -static inline raw_ostream &operator<<(raw_ostream &OS, const PrintNodeId &P) { - P.print(OS); - return OS; -} + }); } void SDNode::dump() const { dump(nullptr); } @@ -626,7 +616,10 @@ void SDNode::printr(raw_ostream &OS, const SelectionDAG *G) const { static bool printOperand(raw_ostream &OS, const SelectionDAG *G, const SDValue Value) { - if (shouldPrintInline(*Value.getNode())) { + if (!Value.getNode()) { + OS << ""; + return false; + } else if (shouldPrintInline(*Value.getNode())) { OS << Value->getOperationName(G) << ':'; Value->print_types(OS, G); Value->print_details(OS, G); diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 21935cdd4699..bb31231f4e1a 100644 --- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -1072,7 +1072,9 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, Op.getOperand(0).getValueType().isFloatingPoint()) { bool OpVTLegal = isOperationLegalOrCustom(ISD::FGETSIGN, Op.getValueType()); bool i32Legal = isOperationLegalOrCustom(ISD::FGETSIGN, MVT::i32); - if ((OpVTLegal || i32Legal) && Op.getValueType().isSimple()) { + if ((OpVTLegal || i32Legal) && Op.getValueType().isSimple() && + Op.getOperand(0).getValueType() != MVT::f128) { + // Cannot eliminate/lower SHL for f128 yet. EVT Ty = OpVTLegal ? Op.getValueType() : MVT::i32; // Make a FGETSIGN + SHL to move the sign bit into the appropriate // place. 
We expect the SHL to be eliminated by other optimizations. diff --git a/lib/CodeGen/TargetLoweringBase.cpp b/lib/CodeGen/TargetLoweringBase.cpp index 69c130809bb8..68bca2e70369 100644 --- a/lib/CodeGen/TargetLoweringBase.cpp +++ b/lib/CodeGen/TargetLoweringBase.cpp @@ -1654,6 +1654,10 @@ TargetLoweringBase::getTypeLegalizationCost(const DataLayout &DL, if (LK.first == TypeSplitVector || LK.first == TypeExpandInteger) Cost *= 2; + // Do not loop with f128 type. + if (MTy == LK.second) + return std::make_pair(Cost, MTy.getSimpleVT()); + // Keep legalizing the type. MTy = LK.second; } diff --git a/lib/CodeGen/TargetRegisterInfo.cpp b/lib/CodeGen/TargetRegisterInfo.cpp index fe91c86b71f8..0a7042ac3db5 100644 --- a/lib/CodeGen/TargetRegisterInfo.cpp +++ b/lib/CodeGen/TargetRegisterInfo.cpp @@ -40,58 +40,71 @@ TargetRegisterInfo::TargetRegisterInfo(const TargetRegisterInfoDesc *ID, TargetRegisterInfo::~TargetRegisterInfo() {} -void PrintReg::print(raw_ostream &OS) const { - if (!Reg) - OS << "%noreg"; - else if (TargetRegisterInfo::isStackSlot(Reg)) - OS << "SS#" << TargetRegisterInfo::stackSlot2Index(Reg); - else if (TargetRegisterInfo::isVirtualRegister(Reg)) - OS << "%vreg" << TargetRegisterInfo::virtReg2Index(Reg); - else if (TRI && Reg < TRI->getNumRegs()) - OS << '%' << TRI->getName(Reg); - else - OS << "%physreg" << Reg; - if (SubIdx) { - if (TRI) - OS << ':' << TRI->getSubRegIndexName(SubIdx); +namespace llvm { + +Printable PrintReg(unsigned Reg, const TargetRegisterInfo *TRI, + unsigned SubIdx) { + return Printable([Reg, TRI, SubIdx](raw_ostream &OS) { + if (!Reg) + OS << "%noreg"; + else if (TargetRegisterInfo::isStackSlot(Reg)) + OS << "SS#" << TargetRegisterInfo::stackSlot2Index(Reg); + else if (TargetRegisterInfo::isVirtualRegister(Reg)) + OS << "%vreg" << TargetRegisterInfo::virtReg2Index(Reg); + else if (TRI && Reg < TRI->getNumRegs()) + OS << '%' << TRI->getName(Reg); else - OS << ":sub(" << SubIdx << ')'; - } + OS << "%physreg" << Reg; + if (SubIdx) { + if (TRI) + OS << ':' << TRI->getSubRegIndexName(SubIdx); + else + OS << ":sub(" << SubIdx << ')'; + } + }); } -void PrintRegUnit::print(raw_ostream &OS) const { - // Generic printout when TRI is missing. - if (!TRI) { - OS << "Unit~" << Unit; - return; - } +Printable PrintRegUnit(unsigned Unit, const TargetRegisterInfo *TRI) { + return Printable([Unit, TRI](raw_ostream &OS) { + // Generic printout when TRI is missing. + if (!TRI) { + OS << "Unit~" << Unit; + return; + } - // Check for invalid register units. - if (Unit >= TRI->getNumRegUnits()) { - OS << "BadUnit~" << Unit; - return; - } + // Check for invalid register units. + if (Unit >= TRI->getNumRegUnits()) { + OS << "BadUnit~" << Unit; + return; + } - // Normal units have at least one root. - MCRegUnitRootIterator Roots(Unit, TRI); - assert(Roots.isValid() && "Unit has no roots."); - OS << TRI->getName(*Roots); - for (++Roots; Roots.isValid(); ++Roots) - OS << '~' << TRI->getName(*Roots); + // Normal units have at least one root. 
+ MCRegUnitRootIterator Roots(Unit, TRI); + assert(Roots.isValid() && "Unit has no roots."); + OS << TRI->getName(*Roots); + for (++Roots; Roots.isValid(); ++Roots) + OS << '~' << TRI->getName(*Roots); + }); } -void PrintVRegOrUnit::print(raw_ostream &OS) const { - if (TRI && TRI->isVirtualRegister(Unit)) { - OS << "%vreg" << TargetRegisterInfo::virtReg2Index(Unit); - return; - } - PrintRegUnit::print(OS); +Printable PrintVRegOrUnit(unsigned Unit, const TargetRegisterInfo *TRI) { + return Printable([Unit, TRI](raw_ostream &OS) { + if (TRI && TRI->isVirtualRegister(Unit)) { + OS << "%vreg" << TargetRegisterInfo::virtReg2Index(Unit); + } else { + OS << PrintRegUnit(Unit, TRI); + } + }); } -void PrintLaneMask::print(raw_ostream &OS) const { - OS << format("%08X", LaneMask); +Printable PrintLaneMask(LaneBitmask LaneMask) { + return Printable([LaneMask](raw_ostream &OS) { + OS << format("%08X", LaneMask); + }); } +} // End of llvm namespace + /// getAllocatableClass - Return the maximal subclass of the given register /// class that is alloctable, or NULL. const TargetRegisterClass * @@ -171,16 +184,24 @@ BitVector TargetRegisterInfo::getAllocatableSet(const MachineFunction &MF, static inline const TargetRegisterClass *firstCommonClass(const uint32_t *A, const uint32_t *B, - const TargetRegisterInfo *TRI) { + const TargetRegisterInfo *TRI, + const MVT::SimpleValueType SVT = + MVT::SimpleValueType::Any) { + const MVT VT(SVT); for (unsigned I = 0, E = TRI->getNumRegClasses(); I < E; I += 32) - if (unsigned Common = *A++ & *B++) - return TRI->getRegClass(I + countTrailingZeros(Common)); + if (unsigned Common = *A++ & *B++) { + const TargetRegisterClass *RC = + TRI->getRegClass(I + countTrailingZeros(Common)); + if (SVT == MVT::SimpleValueType::Any || RC->hasType(VT)) + return RC; + } return nullptr; } const TargetRegisterClass * TargetRegisterInfo::getCommonSubClass(const TargetRegisterClass *A, - const TargetRegisterClass *B) const { + const TargetRegisterClass *B, + const MVT::SimpleValueType SVT) const { // First take care of the trivial cases. if (A == B) return A; @@ -189,7 +210,7 @@ TargetRegisterInfo::getCommonSubClass(const TargetRegisterClass *A, // Register classes are ordered topologically, so the largest common // sub-class it the common sub-class with the smallest ID. 
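The PrintReg/PrintRegUnit/PrintVRegOrUnit/PrintLaneMask rewrite above replaces one printer class per helper with factories that return a Printable wrapping a lambda. A self-contained sketch of that pattern using std::ostream instead of raw_ostream; printReg and its output format here are illustrative only:

#include <functional>
#include <iostream>
#include <utility>

// A lightweight object carrying a printing closure, so call sites can write
// `OS << printReg(R)` without defining a dedicated class per printer.
struct Printable {
  std::function<void(std::ostream &)> Print;
  explicit Printable(std::function<void(std::ostream &)> P)
      : Print(std::move(P)) {}
};
inline std::ostream &operator<<(std::ostream &OS, const Printable &P) {
  P.Print(OS);
  return OS;
}

static Printable printReg(unsigned Reg) {
  return Printable([Reg](std::ostream &OS) {
    if (!Reg)
      OS << "%noreg";
    else
      OS << "%physreg" << Reg;
  });
}

int main() {
  std::cout << printReg(0) << ' ' << printReg(5) << '\n'; // %noreg %physreg5
  return 0;
}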
- return firstCommonClass(A->getSubClassMask(), B->getSubClassMask(), this); + return firstCommonClass(A->getSubClassMask(), B->getSubClassMask(), this, SVT); } const TargetRegisterClass * diff --git a/lib/DebugInfo/DWARF/DWARFFormValue.cpp b/lib/DebugInfo/DWARF/DWARFFormValue.cpp index a11b00a926d9..3dc58423df68 100644 --- a/lib/DebugInfo/DWARF/DWARFFormValue.cpp +++ b/lib/DebugInfo/DWARF/DWARFFormValue.cpp @@ -261,6 +261,12 @@ DWARFFormValue::skipValue(DataExtractor debug_info_data, uint32_t* offset_ptr, bool DWARFFormValue::skipValue(uint16_t form, DataExtractor debug_info_data, uint32_t *offset_ptr, const DWARFUnit *cu) { + return skipValue(form, debug_info_data, offset_ptr, cu->getVersion(), + cu->getAddressByteSize()); +} +bool DWARFFormValue::skipValue(uint16_t form, DataExtractor debug_info_data, + uint32_t *offset_ptr, uint16_t Version, + uint8_t AddrSize) { bool indirect = false; do { switch (form) { @@ -295,10 +301,10 @@ DWARFFormValue::skipValue(uint16_t form, DataExtractor debug_info_data, // Compile unit address sized values case DW_FORM_addr: - *offset_ptr += cu->getAddressByteSize(); + *offset_ptr += AddrSize; return true; case DW_FORM_ref_addr: - *offset_ptr += getRefAddrSize(cu->getAddressByteSize(), cu->getVersion()); + *offset_ptr += getRefAddrSize(AddrSize, Version); return true; // 0 byte values - implied from the form. diff --git a/lib/DebugInfo/DWARF/DWARFUnitIndex.cpp b/lib/DebugInfo/DWARF/DWARFUnitIndex.cpp index 1f1921649b57..96b316957dfd 100644 --- a/lib/DebugInfo/DWARF/DWARFUnitIndex.cpp +++ b/lib/DebugInfo/DWARF/DWARFUnitIndex.cpp @@ -133,7 +133,7 @@ void DWARFUnitIndex::dump(raw_ostream &OS) const { for (unsigned i = 0; i != Header.NumBuckets; ++i) { auto &Row = Rows[i]; if (auto *Contribs = Row.Contributions.get()) { - OS << format("%5u 0x%016" PRIx64 " ", i, Row.Signature); + OS << format("%5u 0x%016" PRIx64 " ", i + 1, Row.Signature); for (unsigned i = 0; i != Header.NumColumns; ++i) { auto &Contrib = Contribs[i]; OS << format("[0x%08x, 0x%08x) ", Contrib.Offset, diff --git a/lib/ExecutionEngine/MCJIT/MCJIT.h b/lib/ExecutionEngine/MCJIT/MCJIT.h index f27aa39f2d5d..3c9d2fd50336 100644 --- a/lib/ExecutionEngine/MCJIT/MCJIT.h +++ b/lib/ExecutionEngine/MCJIT/MCJIT.h @@ -86,7 +86,7 @@ class MCJIT : public ExecutionEngine { ModulePtrSet::iterator begin_added() { return AddedModules.begin(); } ModulePtrSet::iterator end_added() { return AddedModules.end(); } iterator_range added() { - return iterator_range(begin_added(), end_added()); + return make_range(begin_added(), end_added()); } ModulePtrSet::iterator begin_loaded() { return LoadedModules.begin(); } diff --git a/lib/ExecutionEngine/Orc/IndirectionUtils.cpp b/lib/ExecutionEngine/Orc/IndirectionUtils.cpp index 30dbe0383771..34564e42b10f 100644 --- a/lib/ExecutionEngine/Orc/IndirectionUtils.cpp +++ b/lib/ExecutionEngine/Orc/IndirectionUtils.cpp @@ -19,8 +19,8 @@ namespace llvm { namespace orc { -void JITCompileCallbackManagerBase::anchor() {} -void IndirectStubsManagerBase::anchor() {} +void JITCompileCallbackManager::anchor() {} +void IndirectStubsManager::anchor() {} Constant* createIRTypedAddress(FunctionType &FT, TargetAddress Addr) { Constant *AddrIntVal = diff --git a/lib/ExecutionEngine/Orc/OrcCBindingsStack.cpp b/lib/ExecutionEngine/Orc/OrcCBindingsStack.cpp index 7326fa7e2f8f..e519c7f30920 100644 --- a/lib/ExecutionEngine/Orc/OrcCBindingsStack.cpp +++ b/lib/ExecutionEngine/Orc/OrcCBindingsStack.cpp @@ -23,7 +23,7 @@ OrcCBindingsStack::createCompileCallbackMgr(Triple T) { default: return nullptr; case 
Triple::x86_64: { - typedef orc::JITCompileCallbackManager CCMgrT; + typedef orc::LocalJITCompileCallbackManager CCMgrT; return llvm::make_unique(0); } } @@ -36,7 +36,8 @@ OrcCBindingsStack::createIndirectStubsMgrBuilder(Triple T) { case Triple::x86_64: return [](){ - return llvm::make_unique>(); + return llvm::make_unique< + orc::LocalIndirectStubsManager>(); }; } } diff --git a/lib/ExecutionEngine/Orc/OrcCBindingsStack.h b/lib/ExecutionEngine/Orc/OrcCBindingsStack.h index c62210112c35..2e17624ff474 100644 --- a/lib/ExecutionEngine/Orc/OrcCBindingsStack.h +++ b/lib/ExecutionEngine/Orc/OrcCBindingsStack.h @@ -29,7 +29,7 @@ DEFINE_SIMPLE_CONVERSION_FUNCTIONS(TargetMachine, LLVMTargetMachineRef) class OrcCBindingsStack { public: - typedef orc::JITCompileCallbackManagerBase CompileCallbackMgr; + typedef orc::JITCompileCallbackManager CompileCallbackMgr; typedef orc::ObjectLinkingLayer<> ObjLayerT; typedef orc::IRCompileLayer CompileLayerT; typedef orc::CompileOnDemandLayer CODLayerT; @@ -268,7 +268,7 @@ class OrcCBindingsStack { CompileLayerT CompileLayer; CODLayerT CODLayer; - std::unique_ptr IndirectStubsMgr; + std::unique_ptr IndirectStubsMgr; std::vector> GenericHandles; std::vector FreeHandleIndexes; diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp index dd02ece3a9f1..a95f3bbe4179 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp @@ -97,11 +97,11 @@ void RuntimeDyldImpl::resolveRelocations() { // The Section here (Sections[i]) refers to the section in which the // symbol for the relocation is located. The SectionID in the relocation // entry provides the section to which the relocation will be applied. - int Idx = it->getFirst(); + int Idx = it->first; uint64_t Addr = Sections[Idx].getLoadAddress(); DEBUG(dbgs() << "Resolving relocations Section #" << Idx << "\t" << format("%p", (uintptr_t)Addr) << "\n"); - resolveRelocationList(it->getSecond(), Addr); + resolveRelocationList(it->second, Addr); } Relocations.clear(); diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h index 914efd24660a..dafd3c8793c3 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h @@ -30,6 +30,7 @@ #include "llvm/Support/SwapByteOrder.h" #include "llvm/Support/raw_ostream.h" #include +#include #include using namespace llvm; @@ -264,7 +265,7 @@ class RuntimeDyldImpl { // Relocations to sections already loaded. Indexed by SectionID which is the // source of the address. The target where the address will be written is // SectionID/Offset in the relocation itself. - DenseMap Relocations; + std::unordered_map Relocations; // Relocations to external symbols that are not yet resolved. 
Symbols are // external when they aren't found in the global symbol table of all loaded diff --git a/lib/Fuzzer/FuzzerDriver.cpp b/lib/Fuzzer/FuzzerDriver.cpp index fa473811068f..872d2c88e6f2 100644 --- a/lib/Fuzzer/FuzzerDriver.cpp +++ b/lib/Fuzzer/FuzzerDriver.cpp @@ -32,23 +32,30 @@ struct FlagDescription { int Default; int *IntFlag; const char **StrFlag; + unsigned int *UIntFlag; }; struct { #define FUZZER_FLAG_INT(Name, Default, Description) int Name; +#define FUZZER_FLAG_UNSIGNED(Name, Default, Description) unsigned int Name; #define FUZZER_FLAG_STRING(Name, Description) const char *Name; #include "FuzzerFlags.def" #undef FUZZER_FLAG_INT +#undef FUZZER_FLAG_UNSIGNED #undef FUZZER_FLAG_STRING } Flags; static const FlagDescription FlagDescriptions [] { #define FUZZER_FLAG_INT(Name, Default, Description) \ - { #Name, Description, Default, &Flags.Name, nullptr}, + {#Name, Description, Default, &Flags.Name, nullptr, nullptr}, +#define FUZZER_FLAG_UNSIGNED(Name, Default, Description) \ + {#Name, Description, static_cast(Default), \ + nullptr, nullptr, &Flags.Name}, #define FUZZER_FLAG_STRING(Name, Description) \ - { #Name, Description, 0, nullptr, &Flags.Name }, + {#Name, Description, 0, nullptr, &Flags.Name, nullptr}, #include "FuzzerFlags.def" #undef FUZZER_FLAG_INT +#undef FUZZER_FLAG_UNSIGNED #undef FUZZER_FLAG_STRING }; @@ -106,6 +113,12 @@ static bool ParseOneFlag(const char *Param) { if (Flags.verbosity >= 2) Printf("Flag: %s %d\n", Name, Val);; return true; + } else if (FlagDescriptions[F].UIntFlag) { + unsigned int Val = std::stoul(Str); + *FlagDescriptions[F].UIntFlag = Val; + if (Flags.verbosity >= 2) + Printf("Flag: %s %u\n", Name, Val); + return true; } else if (FlagDescriptions[F].StrFlag) { *FlagDescriptions[F].StrFlag = Str; if (Flags.verbosity >= 2) @@ -123,6 +136,9 @@ static void ParseFlags(const std::vector &Args) { for (size_t F = 0; F < kNumFlags; F++) { if (FlagDescriptions[F].IntFlag) *FlagDescriptions[F].IntFlag = FlagDescriptions[F].Default; + if (FlagDescriptions[F].UIntFlag) + *FlagDescriptions[F].UIntFlag = + static_cast(FlagDescriptions[F].Default); if (FlagDescriptions[F].StrFlag) *FlagDescriptions[F].StrFlag = nullptr; } diff --git a/lib/Fuzzer/FuzzerFlags.def b/lib/Fuzzer/FuzzerFlags.def index 7aea5bf641c3..da7a2d3f5c77 100644 --- a/lib/Fuzzer/FuzzerFlags.def +++ b/lib/Fuzzer/FuzzerFlags.def @@ -11,7 +11,7 @@ // portability and independence. //===----------------------------------------------------------------------===// FUZZER_FLAG_INT(verbosity, 1, "Verbosity level.") -FUZZER_FLAG_INT(seed, 0, "Random seed. If 0, seed is generated.") +FUZZER_FLAG_UNSIGNED(seed, 0, "Random seed. If 0, seed is generated.") FUZZER_FLAG_INT(runs, -1, "Number of individual test runs (-1 for infinite runs).") FUZZER_FLAG_INT(max_len, 64, "Maximum length of the test input.") diff --git a/lib/Fuzzer/FuzzerIO.cpp b/lib/Fuzzer/FuzzerIO.cpp index abc444a3d471..043fad396d51 100644 --- a/lib/Fuzzer/FuzzerIO.cpp +++ b/lib/Fuzzer/FuzzerIO.cpp @@ -91,11 +91,6 @@ std::string DirPlusFile(const std::string &DirPath, return DirPath + "/" + FileName; } -void PrintFileAsBase64(const std::string &Path) { - std::string Cmd = "base64 -w 0 < " + Path + "; echo"; - ExecuteCommand(Cmd); -} - void Printf(const char *Fmt, ...) 
{ va_list ap; va_start(ap, Fmt); diff --git a/lib/Fuzzer/FuzzerInterface.cpp b/lib/Fuzzer/FuzzerInterface.cpp index 79cb0e40f0e2..bcd726fc08e4 100644 --- a/lib/Fuzzer/FuzzerInterface.cpp +++ b/lib/Fuzzer/FuzzerInterface.cpp @@ -15,7 +15,7 @@ namespace fuzzer { -void FuzzerRandomLibc::ResetSeed(int seed) { srand(seed); } +void FuzzerRandomLibc::ResetSeed(unsigned int seed) { srand(seed); } size_t FuzzerRandomLibc::Rand() { return rand(); } diff --git a/lib/Fuzzer/FuzzerInterface.h b/lib/Fuzzer/FuzzerInterface.h index c96343404f42..c409c06eca19 100644 --- a/lib/Fuzzer/FuzzerInterface.h +++ b/lib/Fuzzer/FuzzerInterface.h @@ -50,7 +50,7 @@ class FuzzerRandomBase { public: FuzzerRandomBase(){} virtual ~FuzzerRandomBase(){}; - virtual void ResetSeed(int seed) = 0; + virtual void ResetSeed(unsigned int seed) = 0; // Return a random number. virtual size_t Rand() = 0; // Return a random number in range [0,n). @@ -60,8 +60,8 @@ class FuzzerRandomBase { class FuzzerRandomLibc : public FuzzerRandomBase { public: - FuzzerRandomLibc(int seed) { ResetSeed(seed); } - void ResetSeed(int seed) override; + FuzzerRandomLibc(unsigned int seed) { ResetSeed(seed); } + void ResetSeed(unsigned int seed) override; ~FuzzerRandomLibc() override {} size_t Rand() override; }; diff --git a/lib/Fuzzer/FuzzerInternal.h b/lib/Fuzzer/FuzzerInternal.h index 2c382b2ef314..bc6bec7473d3 100644 --- a/lib/Fuzzer/FuzzerInternal.h +++ b/lib/Fuzzer/FuzzerInternal.h @@ -42,7 +42,7 @@ void Print(const Unit &U, const char *PrintAfter = ""); void PrintASCII(const Unit &U, const char *PrintAfter = ""); std::string Hash(const Unit &U); void SetTimer(int Seconds); -void PrintFileAsBase64(const std::string &Path); +std::string Base64(const Unit &U); int ExecuteCommand(const std::string &Command); // Private copy of SHA1 implementation. 
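The FUZZER_FLAG_UNSIGNED additions above follow the usual X-macro scheme: one flag list is expanded once into the storage struct and once into the descriptor table that points back at it. A compilable stand-in for that scheme; note the real driver #includes FuzzerFlags.def for each expansion, while MY_FLAGS and the two sample flags below are invented for this sketch:

#include <cstdio>

// Stand-in for the .def file: every flag is listed exactly once.
#define MY_FLAGS(FLAG_INT, FLAG_UNSIGNED)      \
  FLAG_INT(runs, -1, "Number of runs")         \
  FLAG_UNSIGNED(seed, 0, "Random seed")

// Expansion 1: the flag storage struct.
#define DECL_INT(Name, Default, Desc) int Name;
#define DECL_UNS(Name, Default, Desc) unsigned int Name;
static struct { MY_FLAGS(DECL_INT, DECL_UNS) } Flags;

// Expansion 2: the descriptor table, pointing back into the struct.
struct FlagDesc {
  const char *Name, *Desc;
  int Default;
  int *IntFlag;
  unsigned int *UIntFlag;
};
#define DESC_INT(Name, Default, Desc) {#Name, Desc, Default, &Flags.Name, nullptr},
#define DESC_UNS(Name, Default, Desc) \
  {#Name, Desc, static_cast<int>(Default), nullptr, &Flags.Name},
static const FlagDesc Descs[] = {MY_FLAGS(DESC_INT, DESC_UNS)};

int main() {
  for (const FlagDesc &D : Descs) {
    // Seed defaults through whichever slot the flag uses, as ParseFlags does.
    if (D.IntFlag)
      *D.IntFlag = D.Default;
    if (D.UIntFlag)
      *D.UIntFlag = static_cast<unsigned int>(D.Default);
    std::printf("%-5s default=%d  %s\n", D.Name, D.Default, D.Desc);
  }
  return 0;
}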
diff --git a/lib/Fuzzer/FuzzerLoop.cpp b/lib/Fuzzer/FuzzerLoop.cpp index 9c52a4dbe774..889c30c87489 100644 --- a/lib/Fuzzer/FuzzerLoop.cpp +++ b/lib/Fuzzer/FuzzerLoop.cpp @@ -302,10 +302,8 @@ void Fuzzer::WriteUnitToFileWithPrefix(const Unit &U, const char *Prefix) { WriteToFile(U, Path); Printf("artifact_prefix='%s'; Test unit written to %s\n", Options.ArtifactPrefix.c_str(), Path.c_str()); - if (U.size() <= kMaxUnitSizeToPrint) { - Printf("Base64: "); - PrintFileAsBase64(Path); - } + if (U.size() <= kMaxUnitSizeToPrint) + Printf("Base64: %s\n", Base64(U).c_str()); } void Fuzzer::SaveCorpus() { @@ -470,10 +468,15 @@ void Fuzzer::Drill() { } void Fuzzer::Loop() { + system_clock::time_point LastCorpusReload = system_clock::now(); while (true) { size_t J1 = ChooseUnitIdxToMutate();; SyncCorpus(); - RereadOutputCorpus(); + auto Now = system_clock::now(); + if (duration_cast(Now - LastCorpusReload).count()) { + RereadOutputCorpus(); + LastCorpusReload = Now; + } if (TotalNumberOfRuns >= Options.MaxNumberOfRuns) break; if (Options.MaxTotalTimeSec > 0 && diff --git a/lib/Fuzzer/FuzzerUtil.cpp b/lib/Fuzzer/FuzzerUtil.cpp index 20a41e0d4fbb..6c1133fffd37 100644 --- a/lib/Fuzzer/FuzzerUtil.cpp +++ b/lib/Fuzzer/FuzzerUtil.cpp @@ -167,4 +167,33 @@ bool ParseDictionaryFile(const std::string &Text, std::vector *Units) { int GetPid() { return getpid(); } + +std::string Base64(const Unit &U) { + static const char Table[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "abcdefghijklmnopqrstuvwxyz" + "0123456789+/"; + std::string Res; + size_t i; + for (i = 0; i + 2 < U.size(); i += 3) { + uint32_t x = (U[i] << 16) + (U[i + 1] << 8) + U[i + 2]; + Res += Table[(x >> 18) & 63]; + Res += Table[(x >> 12) & 63]; + Res += Table[(x >> 6) & 63]; + Res += Table[x & 63]; + } + if (i + 1 == U.size()) { + uint32_t x = (U[i] << 16); + Res += Table[(x >> 18) & 63]; + Res += Table[(x >> 12) & 63]; + Res += "=="; + } else if (i + 2 == U.size()) { + uint32_t x = (U[i] << 16) + (U[i + 1] << 8); + Res += Table[(x >> 18) & 63]; + Res += Table[(x >> 12) & 63]; + Res += Table[(x >> 6) & 63]; + Res += "="; + } + return Res; +} + } // namespace fuzzer diff --git a/lib/Fuzzer/test/FuzzerUnittest.cpp b/lib/Fuzzer/test/FuzzerUnittest.cpp index 4a96468f8d7a..b92e61877c6c 100644 --- a/lib/Fuzzer/test/FuzzerUnittest.cpp +++ b/lib/Fuzzer/test/FuzzerUnittest.cpp @@ -360,3 +360,16 @@ TEST(FuzzerDictionary, ParseDictionaryFile) { EXPECT_EQ(Units, std::vector({Unit({'a', 'a'}), Unit({'a', 'b', 'c'})})); } + +TEST(FuzzerUtil, Base64) { + EXPECT_EQ("", Base64({})); + EXPECT_EQ("YQ==", Base64({'a'})); + EXPECT_EQ("eA==", Base64({'x'})); + EXPECT_EQ("YWI=", Base64({'a', 'b'})); + EXPECT_EQ("eHk=", Base64({'x', 'y'})); + EXPECT_EQ("YWJj", Base64({'a', 'b', 'c'})); + EXPECT_EQ("eHl6", Base64({'x', 'y', 'z'})); + EXPECT_EQ("YWJjeA==", Base64({'a', 'b', 'c', 'x'})); + EXPECT_EQ("YWJjeHk=", Base64({'a', 'b', 'c', 'x', 'y'})); + EXPECT_EQ("YWJjeHl6", Base64({'a', 'b', 'c', 'x', 'y', 'z'})); +} diff --git a/lib/Fuzzer/test/fuzzer.test b/lib/Fuzzer/test/fuzzer.test index 3b7045d8de8d..6515628d1086 100644 --- a/lib/Fuzzer/test/fuzzer.test +++ b/lib/Fuzzer/test/fuzzer.test @@ -48,8 +48,8 @@ RUN: not LLVMFuzzer-SimpleCmpTest -use_traces=1 -seed=1 -runs=1000000 -timeout=5 RUN: not LLVMFuzzer-UserSuppliedFuzzerTest -seed=1 -timeout=15 2>&1 | FileCheck %s -RUN: not LLVMFuzzer-MemcmpTest -use_traces=1 -seed=1 -runs=100000 2>&1 | FileCheck %s -RUN: LLVMFuzzer-MemcmpTest -seed=1 -runs=1000000 2>&1 | FileCheck %s --check-prefix=Done1000000 +RUN: not LLVMFuzzer-MemcmpTest 
-use_traces=1 -seed=4294967295 -runs=100000 2>&1 | FileCheck %s +RUN: LLVMFuzzer-MemcmpTest -seed=4294967295 -runs=1000000 2>&1 | FileCheck %s --check-prefix=Done1000000 Done1000000: Done 1000000 runs in RUN: not LLVMFuzzer-StrncmpTest -use_traces=1 -seed=1 -runs=100000 2>&1 | FileCheck %s diff --git a/lib/IR/AsmWriter.cpp b/lib/IR/AsmWriter.cpp index 759c5a8001c0..e41815aafa80 100644 --- a/lib/IR/AsmWriter.cpp +++ b/lib/IR/AsmWriter.cpp @@ -39,6 +39,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/Dwarf.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/Format.h" #include "llvm/Support/FormattedStream.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" @@ -303,6 +304,7 @@ static void PrintCallingConv(unsigned cc, raw_ostream &Out) { case CallingConv::AnyReg: Out << "anyregcc"; break; case CallingConv::PreserveMost: Out << "preserve_mostcc"; break; case CallingConv::PreserveAll: Out << "preserve_allcc"; break; + case CallingConv::CXX_FAST_TLS: Out << "cxx_fast_tlscc"; break; case CallingConv::GHC: Out << "ghccc"; break; case CallingConv::X86_StdCall: Out << "x86_stdcallcc"; break; case CallingConv::X86_FastCall: Out << "x86_fastcallcc"; break; @@ -1108,11 +1110,10 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV, // the value back and get the same value. // bool ignored; - bool isHalf = &CFP->getValueAPF().getSemantics()==&APFloat::IEEEhalf; bool isDouble = &CFP->getValueAPF().getSemantics()==&APFloat::IEEEdouble; bool isInf = CFP->getValueAPF().isInfinity(); bool isNaN = CFP->getValueAPF().isNaN(); - if (!isHalf && !isInf && !isNaN) { + if (!isInf && !isNaN) { double Val = isDouble ? CFP->getValueAPF().convertToDouble() : CFP->getValueAPF().convertToFloat(); SmallString<128> StrVal; @@ -1138,15 +1139,12 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV, // x86, so we must not use these types. static_assert(sizeof(double) == sizeof(uint64_t), "assuming that double is 64 bits!"); - char Buffer[40]; APFloat apf = CFP->getValueAPF(); - // Halves and floats are represented in ASCII IR as double, convert. + // Floats are represented in ASCII IR as double, convert. if (!isDouble) apf.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, &ignored); - Out << "0x" << - utohex_buffer(uint64_t(apf.bitcastToAPInt().getZExtValue()), - Buffer+40); + Out << format_hex(apf.bitcastToAPInt().getZExtValue(), 0, /*Upper=*/true); return; } @@ -1154,60 +1152,32 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV, // These appear as a magic letter identifying the type, then a // fixed number of hex digits. Out << "0x"; - // Bit position, in the current word, of the next nibble to print. 
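The AsmWriter change above drops the hand-rolled nibble loop in favour of formatting the constant's bit pattern as uppercase hex in a single call. The standalone equivalent for a double, using plain printf rather than LLVM's format_hex:

#include <cinttypes>
#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  double Val = 1.5;
  uint64_t Bits;
  std::memcpy(&Bits, &Val, sizeof Bits);  // bitcast, as bitcastToAPInt does
  std::printf("0x%" PRIX64 "\n", Bits);   // 0x3FF8000000000000
  return 0;
}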
- int shiftcount; - + APInt API = CFP->getValueAPF().bitcastToAPInt(); if (&CFP->getValueAPF().getSemantics() == &APFloat::x87DoubleExtended) { Out << 'K'; - // api needed to prevent premature destruction - APInt api = CFP->getValueAPF().bitcastToAPInt(); - const uint64_t* p = api.getRawData(); - uint64_t word = p[1]; - shiftcount = 12; - int width = api.getBitWidth(); - for (int j=0; j>shiftcount) & 15; - if (nibble < 10) - Out << (unsigned char)(nibble + '0'); - else - Out << (unsigned char)(nibble - 10 + 'A'); - if (shiftcount == 0 && j+4 < width) { - word = *p; - shiftcount = 64; - if (width-j-4 < 64) - shiftcount = width-j-4; - } - } + Out << format_hex_no_prefix(API.getHiBits(16).getZExtValue(), 4, + /*Upper=*/true); + Out << format_hex_no_prefix(API.getLoBits(64).getZExtValue(), 16, + /*Upper=*/true); return; } else if (&CFP->getValueAPF().getSemantics() == &APFloat::IEEEquad) { - shiftcount = 60; Out << 'L'; + Out << format_hex_no_prefix(API.getLoBits(64).getZExtValue(), 16, + /*Upper=*/true); + Out << format_hex_no_prefix(API.getHiBits(64).getZExtValue(), 16, + /*Upper=*/true); } else if (&CFP->getValueAPF().getSemantics() == &APFloat::PPCDoubleDouble) { - shiftcount = 60; Out << 'M'; + Out << format_hex_no_prefix(API.getLoBits(64).getZExtValue(), 16, + /*Upper=*/true); + Out << format_hex_no_prefix(API.getHiBits(64).getZExtValue(), 16, + /*Upper=*/true); } else if (&CFP->getValueAPF().getSemantics() == &APFloat::IEEEhalf) { - shiftcount = 12; Out << 'H'; + Out << format_hex_no_prefix(API.getZExtValue(), 4, + /*Upper=*/true); } else llvm_unreachable("Unsupported floating point type"); - // api needed to prevent premature destruction - APInt api = CFP->getValueAPF().bitcastToAPInt(); - const uint64_t* p = api.getRawData(); - uint64_t word = *p; - int width = api.getBitWidth(); - for (int j=0; j>shiftcount) & 15; - if (nibble < 10) - Out << (unsigned char)(nibble + '0'); - else - Out << (unsigned char)(nibble - 10 + 'A'); - if (shiftcount == 0 && j+4 < width) { - word = *(++p); - shiftcount = 64; - if (width-j-4 < 64) - shiftcount = width-j-4; - } - } return; } @@ -1426,6 +1396,7 @@ struct MDFieldPrinter { : Out(Out), TypePrinter(TypePrinter), Machine(Machine), Context(Context) { } void printTag(const DINode *N); + void printMacinfoType(const DIMacroNode *N); void printString(StringRef Name, StringRef Value, bool ShouldSkipEmpty = true); void printMetadata(StringRef Name, const Metadata *MD, @@ -1448,6 +1419,14 @@ void MDFieldPrinter::printTag(const DINode *N) { Out << N->getTag(); } +void MDFieldPrinter::printMacinfoType(const DIMacroNode *N) { + Out << FS << "type: "; + if (const char *Type = dwarf::MacinfoString(N->getMacinfoType())) + Out << Type; + else + Out << N->getMacinfoType(); +} + void MDFieldPrinter::printString(StringRef Name, StringRef Value, bool ShouldSkipEmpty) { if (ShouldSkipEmpty && Value.empty()) @@ -1673,6 +1652,7 @@ static void writeDICompileUnit(raw_ostream &Out, const DICompileUnit *N, Printer.printMetadata("subprograms", N->getRawSubprograms()); Printer.printMetadata("globals", N->getRawGlobalVariables()); Printer.printMetadata("imports", N->getRawImportedEntities()); + Printer.printMetadata("macros", N->getRawMacros()); Printer.printInt("dwoId", N->getDWOId()); Out << ")"; } @@ -1741,6 +1721,29 @@ static void writeDINamespace(raw_ostream &Out, const DINamespace *N, Out << ")"; } +static void writeDIMacro(raw_ostream &Out, const DIMacro *N, + TypePrinting *TypePrinter, SlotTracker *Machine, + const Module *Context) { + Out << "!DIMacro("; + MDFieldPrinter 
Printer(Out, TypePrinter, Machine, Context); + Printer.printMacinfoType(N); + Printer.printInt("line", N->getLine()); + Printer.printString("name", N->getName()); + Printer.printString("value", N->getValue()); + Out << ")"; +} + +static void writeDIMacroFile(raw_ostream &Out, const DIMacroFile *N, + TypePrinting *TypePrinter, SlotTracker *Machine, + const Module *Context) { + Out << "!DIMacroFile("; + MDFieldPrinter Printer(Out, TypePrinter, Machine, Context); + Printer.printInt("line", N->getLine()); + Printer.printMetadata("file", N->getRawFile(), /* ShouldSkipNull */ false); + Printer.printMetadata("nodes", N->getRawElements()); + Out << ")"; +} + static void writeDIModule(raw_ostream &Out, const DIModule *N, TypePrinting *TypePrinter, SlotTracker *Machine, const Module *Context) { diff --git a/lib/IR/Constants.cpp b/lib/IR/Constants.cpp index b4a07a1b6b4a..36282c164293 100644 --- a/lib/IR/Constants.cpp +++ b/lib/IR/Constants.cpp @@ -899,7 +899,9 @@ static Constant *getSequenceIfElementsMatch(Constant *C, else if (CI->getType()->isIntegerTy(64)) return getIntSequenceIfElementsMatch(V); } else if (ConstantFP *CFP = dyn_cast(C)) { - if (CFP->getType()->isFloatTy()) + if (CFP->getType()->isHalfTy()) + return getFPSequenceIfElementsMatch(V); + else if (CFP->getType()->isFloatTy()) return getFPSequenceIfElementsMatch(V); else if (CFP->getType()->isDoubleTy()) return getFPSequenceIfElementsMatch(V); @@ -2365,7 +2367,7 @@ StringRef ConstantDataSequential::getRawDataValues() const { /// ConstantDataArray only works with normal float and int types that are /// stored densely in memory, not with things like i42 or x86_f80. bool ConstantDataSequential::isElementTypeCompatible(Type *Ty) { - if (Ty->isFloatTy() || Ty->isDoubleTy()) return true; + if (Ty->isHalfTy() || Ty->isFloatTy() || Ty->isDoubleTy()) return true; if (auto *IT = dyn_cast(Ty)) { switch (IT->getBitWidth()) { case 8: @@ -2521,7 +2523,7 @@ Constant *ConstantDataArray::get(LLVMContext &Context, ArrayRef Elts) { /// object. Constant *ConstantDataArray::getFP(LLVMContext &Context, ArrayRef Elts) { - Type *Ty = VectorType::get(Type::getHalfTy(Context), Elts.size()); + Type *Ty = ArrayType::get(Type::getHalfTy(Context), Elts.size()); const char *Data = reinterpret_cast(Elts.data()); return getImpl(StringRef(const_cast(Data), Elts.size() * 2), Ty); } @@ -2637,6 +2639,11 @@ Constant *ConstantDataVector::getSplat(unsigned NumElts, Constant *V) { } if (ConstantFP *CFP = dyn_cast(V)) { + if (CFP->getType()->isHalfTy()) { + SmallVector Elts( + NumElts, CFP->getValueAPF().bitcastToAPInt().getLimitedValue()); + return getFP(V->getContext(), Elts); + } if (CFP->getType()->isFloatTy()) { SmallVector Elts( NumElts, CFP->getValueAPF().bitcastToAPInt().getLimitedValue()); @@ -2682,6 +2689,10 @@ APFloat ConstantDataSequential::getElementAsAPFloat(unsigned Elt) const { switch (getElementType()->getTypeID()) { default: llvm_unreachable("Accessor can only be used when element is float/double!"); + case Type::HalfTyID: { + auto EltVal = *reinterpret_cast(EltPtr); + return APFloat(APFloat::IEEEhalf, APInt(16, EltVal)); + } case Type::FloatTyID: { auto EltVal = *reinterpret_cast(EltPtr); return APFloat(APFloat::IEEEsingle, APInt(32, EltVal)); @@ -2716,7 +2727,8 @@ double ConstantDataSequential::getElementAsDouble(unsigned Elt) const { /// Note that this has to compute a new constant to return, so it isn't as /// efficient as getElementAsInteger/Float/Double. 
Constant *ConstantDataSequential::getElementAsConstant(unsigned Elt) const { - if (getElementType()->isFloatTy() || getElementType()->isDoubleTy()) + if (getElementType()->isHalfTy() || getElementType()->isFloatTy() || + getElementType()->isDoubleTy()) return ConstantFP::get(getContext(), getElementAsAPFloat(Elt)); return ConstantInt::get(getElementType(), getElementAsInteger(Elt)); diff --git a/lib/IR/DIBuilder.cpp b/lib/IR/DIBuilder.cpp index 09b540350c28..b7841fe2b85c 100644 --- a/lib/IR/DIBuilder.cpp +++ b/lib/IR/DIBuilder.cpp @@ -148,7 +148,7 @@ DICompileUnit *DIBuilder::createCompileUnit( CUNode = DICompileUnit::getDistinct( VMContext, Lang, DIFile::get(VMContext, Filename, Directory), Producer, isOptimized, Flags, RunTimeVer, SplitName, Kind, nullptr, - nullptr, nullptr, nullptr, nullptr, DWOId); + nullptr, nullptr, nullptr, nullptr, nullptr, DWOId); // Create a named metadata so that it is easier to find cu in a module. // Note that we only generate this when the caller wants to actually diff --git a/lib/IR/DebugInfoMetadata.cpp b/lib/IR/DebugInfoMetadata.cpp index cead10652e07..58e0abdd577c 100644 --- a/lib/IR/DebugInfoMetadata.cpp +++ b/lib/IR/DebugInfoMetadata.cpp @@ -315,7 +315,7 @@ DICompileUnit *DICompileUnit::getImpl( unsigned RuntimeVersion, MDString *SplitDebugFilename, unsigned EmissionKind, Metadata *EnumTypes, Metadata *RetainedTypes, Metadata *Subprograms, Metadata *GlobalVariables, - Metadata *ImportedEntities, uint64_t DWOId, + Metadata *ImportedEntities, Metadata *Macros, uint64_t DWOId, StorageType Storage, bool ShouldCreate) { assert(Storage != Uniqued && "Cannot unique DICompileUnit"); assert(isCanonical(Producer) && "Expected canonical MDString"); @@ -324,7 +324,7 @@ DICompileUnit *DICompileUnit::getImpl( Metadata *Ops[] = {File, Producer, Flags, SplitDebugFilename, EnumTypes, RetainedTypes, Subprograms, GlobalVariables, - ImportedEntities}; + ImportedEntities, Macros}; return storeImpl(new (ArrayRef(Ops).size()) DICompileUnit( Context, Storage, SourceLanguage, IsOptimized, RuntimeVersion, EmissionKind, DWOId, Ops), @@ -557,3 +557,24 @@ DIImportedEntity *DIImportedEntity::getImpl(LLVMContext &Context, unsigned Tag, Metadata *Ops[] = {Scope, Entity, Name}; DEFINE_GETIMPL_STORE(DIImportedEntity, (Tag, Line), Ops); } + +DIMacro *DIMacro::getImpl(LLVMContext &Context, unsigned MIType, + unsigned Line, MDString *Name, MDString *Value, + StorageType Storage, bool ShouldCreate) { + assert(isCanonical(Name) && "Expected canonical MDString"); + DEFINE_GETIMPL_LOOKUP(DIMacro, + (MIType, Line, getString(Name), getString(Value))); + Metadata *Ops[] = { Name, Value }; + DEFINE_GETIMPL_STORE(DIMacro, (MIType, Line), Ops); +} + +DIMacroFile *DIMacroFile::getImpl(LLVMContext &Context, unsigned MIType, + unsigned Line, Metadata *File, + Metadata *Elements, StorageType Storage, + bool ShouldCreate) { + DEFINE_GETIMPL_LOOKUP(DIMacroFile, + (MIType, Line, File, Elements)); + Metadata *Ops[] = { File, Elements }; + DEFINE_GETIMPL_STORE(DIMacroFile, (MIType, Line), Ops); +} + diff --git a/lib/IR/DiagnosticInfo.cpp b/lib/IR/DiagnosticInfo.cpp index 4753789d9c13..6426f76bbaa6 100644 --- a/lib/IR/DiagnosticInfo.cpp +++ b/lib/IR/DiagnosticInfo.cpp @@ -132,6 +132,12 @@ void DiagnosticInfoSampleProfile::print(DiagnosticPrinter &DP) const { DP << getMsg(); } +void DiagnosticInfoPGOProfile::print(DiagnosticPrinter &DP) const { + if (getFileName()) + DP << getFileName() << ": "; + DP << getMsg(); +} + bool DiagnosticInfoOptimizationBase::isLocationAvailable() const { return getDebugLoc(); } 
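The Constants.cpp changes above extend ConstantDataArray/ConstantDataVector to the half type and fix ConstantDataArray::getFP to build an array type rather than a vector type. A minimal sketch of exercising the new path follows; the function name and bit patterns are illustrative, not taken from the patch:

#include "llvm/IR/Constants.h"
#include "llvm/IR/LLVMContext.h"
#include <cstdint>

// Builds a [2 x half] constant from raw IEEE-754 fp16 bit patterns.
llvm::Constant *makeHalfArray(llvm::LLVMContext &Ctx) {
  uint16_t Bits[] = {0x3C00, 0x4000}; // 1.0 and 2.0 in fp16
  // With this patch the element type is half and the result type is the array
  // type [2 x half] (previously a vector type was created here by mistake).
  return llvm::ConstantDataArray::getFP(Ctx, Bits);
}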
diff --git a/lib/IR/Instruction.cpp b/lib/IR/Instruction.cpp index b5a30a4969b3..7bd50328b126 100644 --- a/lib/IR/Instruction.cpp +++ b/lib/IR/Instruction.cpp @@ -62,6 +62,11 @@ Module *Instruction::getModule() { return getParent()->getModule(); } +Function *Instruction::getFunction() { return getParent()->getParent(); } + +const Function *Instruction::getFunction() const { + return getParent()->getParent(); +} void Instruction::removeFromParent() { getParent()->getInstList().remove(getIterator()); diff --git a/lib/IR/Instructions.cpp b/lib/IR/Instructions.cpp index b8c72dd7e39d..f185caacdf6c 100644 --- a/lib/IR/Instructions.cpp +++ b/lib/IR/Instructions.cpp @@ -299,14 +299,14 @@ CallInst::CallInst(const CallInst &CI) CallInst *CallInst::Create(CallInst *CI, ArrayRef OpB, Instruction *InsertPt) { - CallSite CS(CI); - std::vector Args(CS.arg_begin(), CS.arg_end()); + std::vector Args(CI->arg_begin(), CI->arg_end()); auto *NewCI = CallInst::Create(CI->getCalledValue(), Args, OpB, CI->getName(), InsertPt); NewCI->setTailCallKind(CI->getTailCallKind()); NewCI->setCallingConv(CI->getCallingConv()); NewCI->SubclassOptionalData = CI->SubclassOptionalData; + NewCI->setAttributes(CI->getAttributes()); return NewCI; } @@ -369,7 +369,7 @@ bool CallInst::dataOperandHasImpliedAttr(unsigned i, assert(hasOperandBundles() && i >= (getBundleOperandsStartIndex() + 1) && "Must be either a call argument or an operand bundle!"); - return getOperandBundleForOperand(i - 1).operandsHaveAttr(A); + return bundleOperandHasAttr(i - 1, A); } /// IsConstantOne - Return true only if val is constant int 1 @@ -586,14 +586,14 @@ InvokeInst::InvokeInst(const InvokeInst &II) InvokeInst *InvokeInst::Create(InvokeInst *II, ArrayRef OpB, Instruction *InsertPt) { - CallSite CS(II); - std::vector Args(CS.arg_begin(), CS.arg_end()); + std::vector Args(II->arg_begin(), II->arg_end()); auto *NewII = InvokeInst::Create(II->getCalledValue(), II->getNormalDest(), II->getUnwindDest(), Args, OpB, II->getName(), InsertPt); NewII->setCallingConv(II->getCallingConv()); NewII->SubclassOptionalData = II->SubclassOptionalData; + NewII->setAttributes(II->getAttributes()); return NewII; } @@ -646,7 +646,7 @@ bool InvokeInst::dataOperandHasImpliedAttr(unsigned i, assert(hasOperandBundles() && i >= (getBundleOperandsStartIndex() + 1) && "Must be either an invoke argument or an operand bundle!"); - return getOperandBundleForOperand(i - 1).operandsHaveAttr(A); + return bundleOperandHasAttr(i - 1, A); } void InvokeInst::addAttribute(unsigned i, Attribute::AttrKind attr) { diff --git a/lib/IR/LLVMContextImpl.h b/lib/IR/LLVMContextImpl.h index 7e89b582cbdd..ae987e65bcb1 100644 --- a/lib/IR/LLVMContextImpl.h +++ b/lib/IR/LLVMContextImpl.h @@ -792,6 +792,49 @@ template <> struct MDNodeKeyImpl { } }; +template <> struct MDNodeKeyImpl { + unsigned MIType; + unsigned Line; + StringRef Name; + StringRef Value; + + MDNodeKeyImpl(unsigned MIType, unsigned Line, StringRef Name, StringRef Value) + : MIType(MIType), Line(Line), Name(Name), Value(Value) {} + MDNodeKeyImpl(const DIMacro *N) + : MIType(N->getMacinfoType()), Line(N->getLine()), Name(N->getName()), + Value(N->getValue()) {} + + bool isKeyOf(const DIMacro *RHS) const { + return MIType == RHS->getMacinfoType() && Line == RHS->getLine() && + Name == RHS->getName() && Value == RHS->getValue(); + } + unsigned getHashValue() const { + return hash_combine(MIType, Line, Name, Value); + } +}; + +template <> struct MDNodeKeyImpl { + unsigned MIType; + unsigned Line; + Metadata *File; + Metadata *Elements; + + 
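// Illustrative aside on what the MDNodeKeyImpl specializations above buy us:
// DIMacro and DIMacroFile nodes are uniqued in the context, so two get() calls
// with identical operands return the same node. The name, line, and value below
// are made up for the sketch, and it assumes the DIMacro::get() wrapper that
// DEFINE_MDNODE_GET generates in DebugInfoMetadata.h.

#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/Support/Dwarf.h"
#include <cassert>

static void checkMacroUniquing(llvm::LLVMContext &Ctx) {
  using namespace llvm;
  DIMacro *A = DIMacro::get(Ctx, dwarf::DW_MACINFO_define, /*Line=*/0, "PI", "3.14");
  DIMacro *B = DIMacro::get(Ctx, dwarf::DW_MACINFO_define, /*Line=*/0, "PI", "3.14");
  assert(A == B && "uniqued on (macinfo type, line, name, value)");
  (void)A; (void)B;
}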
MDNodeKeyImpl(unsigned MIType, unsigned Line, Metadata *File, + Metadata *Elements) + : MIType(MIType), Line(Line), File(File), Elements(Elements) {} + MDNodeKeyImpl(const DIMacroFile *N) + : MIType(N->getMacinfoType()), Line(N->getLine()), File(N->getRawFile()), + Elements(N->getRawElements()) {} + + bool isKeyOf(const DIMacroFile *RHS) const { + return MIType == RHS->getMacinfoType() && Line == RHS->getLine() && + File == RHS->getRawFile() && File == RHS->getRawElements(); + } + unsigned getHashValue() const { + return hash_combine(MIType, Line, File, Elements); + } +}; + /// \brief DenseMapInfo for MDNode subclasses. template struct MDNodeInfo { typedef MDNodeKeyImpl KeyTy; diff --git a/lib/IR/LegacyPassManager.cpp b/lib/IR/LegacyPassManager.cpp index 69f402029c81..f2e0c7d32c02 100644 --- a/lib/IR/LegacyPassManager.cpp +++ b/lib/IR/LegacyPassManager.cpp @@ -569,13 +569,33 @@ void PMTopLevelManager::collectLastUses(SmallVectorImpl &LastUses, AnalysisUsage *PMTopLevelManager::findAnalysisUsage(Pass *P) { AnalysisUsage *AnUsage = nullptr; - DenseMap::iterator DMI = AnUsageMap.find(P); + auto DMI = AnUsageMap.find(P); if (DMI != AnUsageMap.end()) AnUsage = DMI->second; else { - AnUsage = new AnalysisUsage(); - P->getAnalysisUsage(*AnUsage); - AnUsageMap[P] = AnUsage; + // Look up the analysis usage from the pass instance (different instances + // of the same pass can produce different results), but unique the + // resulting object to reduce memory usage. This helps to greatly reduce + // memory usage when we have many instances of only a few pass types + // (e.g. instcombine, simplifycfg, etc...) which tend to share a fixed set + // of dependencies. + AnalysisUsage AU; + P->getAnalysisUsage(AU); + + AUFoldingSetNode* Node = nullptr; + FoldingSetNodeID ID; + AUFoldingSetNode::Profile(ID, AU); + void *IP = nullptr; + if (auto *N = UniqueAnalysisUsages.FindNodeOrInsertPos(ID, IP)) + Node = N; + else { + Node = new (AUFoldingSetNodeAllocator.Allocate()) AUFoldingSetNode(AU); + UniqueAnalysisUsages.InsertNode(Node, IP); + } + assert(Node && "cached analysis usage must be non null"); + + AnUsageMap[P] = &Node->AU; + AnUsage = &Node->AU;; } return AnUsage; } @@ -798,10 +818,6 @@ PMTopLevelManager::~PMTopLevelManager() { for (SmallVectorImpl::iterator I = ImmutablePasses.begin(), E = ImmutablePasses.end(); I != E; ++I) delete *I; - - for (DenseMap::iterator DMI = AnUsageMap.begin(), - DME = AnUsageMap.end(); DMI != DME; ++DMI) - delete DMI->second; } //===----------------------------------------------------------------------===// diff --git a/lib/IR/Module.cpp b/lib/IR/Module.cpp index 2b9adad44ba7..2acd9db210db 100644 --- a/lib/IR/Module.cpp +++ b/lib/IR/Module.cpp @@ -491,3 +491,15 @@ PICLevel::Level Module::getPICLevel() const { void Module::setPICLevel(PICLevel::Level PL) { addModuleFlag(ModFlagBehavior::Error, "PIC Level", PL); } + +void Module::setMaximumFunctionCount(uint64_t Count) { + addModuleFlag(ModFlagBehavior::Error, "MaxFunctionCount", Count); +} + +Optional Module::getMaximumFunctionCount() { + auto *Val = + cast_or_null(getModuleFlag("MaxFunctionCount")); + if (!Val) + return None; + return cast(Val->getValue())->getZExtValue(); +} diff --git a/lib/IR/Verifier.cpp b/lib/IR/Verifier.cpp index 5cbb597ca269..58f9c5388bf5 100644 --- a/lib/IR/Verifier.cpp +++ b/lib/IR/Verifier.cpp @@ -204,6 +204,9 @@ class Verifier : public InstVisitor, VerifierSupport { /// given function and the largest index passed to llvm.localrecover. 
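// Usage sketch (the count is illustrative) for the Module.cpp accessors added
// above: the maximum function entry count is stored as an Error-behavior module
// flag named "MaxFunctionCount" and read back as an Optional.

#include "llvm/ADT/Optional.h"
#include "llvm/IR/Module.h"
#include <cassert>

static void recordMaxFunctionCount(llvm::Module &M) {
  M.setMaximumFunctionCount(12345);
  llvm::Optional<uint64_t> Count = M.getMaximumFunctionCount();
  assert(Count && *Count == 12345 && "round-trips through the module flag");
  (void)Count;
}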
DenseMap> FrameEscapeInfo; + /// Cache of constants visited in search of ConstantExprs. + SmallPtrSet ConstantExprVisited; + public: explicit Verifier(raw_ostream &OS) : VerifierSupport(OS), Context(nullptr), LandingPadResultTy(nullptr), @@ -420,7 +423,8 @@ class Verifier : public InstVisitor, VerifierSupport { void VerifyFunctionMetadata( const SmallVector, 4> MDs); - void VerifyConstantExprBitcastType(const ConstantExpr *CE); + void visitConstantExprsRecursively(const Constant *EntryC); + void visitConstantExpr(const ConstantExpr *CE); void VerifyStatepoint(ImmutableCallSite CS); void verifyFrameRecoverIndices(); @@ -545,25 +549,7 @@ void Verifier::visitGlobalVariable(const GlobalVariable &GV) { } // Walk any aggregate initializers looking for bitcasts between address spaces - SmallPtrSet Visited; - SmallVector WorkStack; - WorkStack.push_back(cast(GV.getInitializer())); - - while (!WorkStack.empty()) { - const Value *V = WorkStack.pop_back_val(); - if (!Visited.insert(V).second) - continue; - - if (const User *U = dyn_cast(V)) { - WorkStack.append(U->op_begin(), U->op_end()); - } - - if (const ConstantExpr *CE = dyn_cast(V)) { - VerifyConstantExprBitcastType(CE); - if (Broken) - return; - } - } + visitConstantExprsRecursively(GV.getInitializer()); visitGlobalValue(GV); } @@ -593,7 +579,7 @@ void Verifier::visitAliaseeSubExpr(SmallPtrSetImpl &Visited, } if (const auto *CE = dyn_cast(&C)) - VerifyConstantExprBitcastType(CE); + visitConstantExprsRecursively(CE); for (const Use &U : C.operands()) { Value *V = &*U; @@ -860,8 +846,6 @@ void Verifier::visitDICompositeType(const DICompositeType &N) { "invalid composite elements", &N, N.getRawElements()); Assert(isTypeRef(N, N.getRawVTableHolder()), "invalid vtable holder", &N, N.getRawVTableHolder()); - Assert(!N.getRawElements() || isa(N.getRawElements()), - "invalid composite elements", &N, N.getRawElements()); Assert(!hasConflictingReferenceFlags(N.getFlags()), "invalid reference flags", &N); if (auto *Params = N.getRawTemplateParams()) @@ -935,6 +919,12 @@ void Verifier::visitDICompileUnit(const DICompileUnit &N) { Op); } } + if (auto *Array = N.getRawMacros()) { + Assert(isa(Array), "invalid macro list", &N, Array); + for (Metadata *Op : N.getMacros()->operands()) { + Assert(Op && isa(Op), "invalid macro ref", &N, Op); + } + } } void Verifier::visitDISubprogram(const DISubprogram &N) { @@ -988,6 +978,27 @@ void Verifier::visitDINamespace(const DINamespace &N) { Assert(isa(S), "invalid scope ref", &N, S); } +void Verifier::visitDIMacro(const DIMacro &N) { + Assert(N.getMacinfoType() == dwarf::DW_MACINFO_define || + N.getMacinfoType() == dwarf::DW_MACINFO_undef, + "invalid macinfo type", &N); + Assert(!N.getName().empty(), "anonymous macro", &N); +} + +void Verifier::visitDIMacroFile(const DIMacroFile &N) { + Assert(N.getMacinfoType() == dwarf::DW_MACINFO_start_file, + "invalid macinfo type", &N); + if (auto *F = N.getRawFile()) + Assert(isa(F), "invalid file", &N, F); + + if (auto *Array = N.getRawElements()) { + Assert(isa(Array), "invalid macro list", &N, Array); + for (Metadata *Op : N.getElements()->operands()) { + Assert(Op && isa(Op), "invalid macro ref", &N, Op); + } + } +} + void Verifier::visitDIModule(const DIModule &N) { Assert(N.getTag() == dwarf::DW_TAG_module, "invalid tag", &N); Assert(!N.getName().empty(), "anonymous module", &N); @@ -1468,7 +1479,35 @@ void Verifier::VerifyFunctionMetadata( } } -void Verifier::VerifyConstantExprBitcastType(const ConstantExpr *CE) { +void Verifier::visitConstantExprsRecursively(const 
Constant *EntryC) { + if (!ConstantExprVisited.insert(EntryC).second) + return; + + SmallVector Stack; + Stack.push_back(EntryC); + + while (!Stack.empty()) { + const Constant *C = Stack.pop_back_val(); + + // Check this constant expression. + if (const auto *CE = dyn_cast(C)) + visitConstantExpr(CE); + + // Visit all sub-expressions. + for (const Use &U : C->operands()) { + const auto *OpC = dyn_cast(U); + if (!OpC) + continue; + if (isa(OpC)) + continue; // Global values get visited separately. + if (!ConstantExprVisited.insert(OpC).second) + continue; + Stack.push_back(OpC); + } + } +} + +void Verifier::visitConstantExpr(const ConstantExpr *CE) { if (CE->getOpcode() != Instruction::BitCast) return; @@ -1814,7 +1853,10 @@ void Verifier::visitFunction(const Function &F) { continue; DISubprogram *SP = Scope ? Scope->getSubprogram() : nullptr; - if (SP && !Seen.insert(SP).second) + + // Scope and SP could be the same MDNode and we don't want to skip + // validation in that case + if (SP && ((Scope != SP) && !Seen.insert(SP).second)) continue; // FIXME: Once N is canonical, check "SP == &N". @@ -3191,22 +3233,7 @@ void Verifier::visitInstruction(Instruction &I) { if (CE->getType()->isPtrOrPtrVectorTy()) { // If we have a ConstantExpr pointer, we need to see if it came from an // illegal bitcast (inttoptr ) - SmallVector Stack; - SmallPtrSet Visited; - Stack.push_back(CE); - - while (!Stack.empty()) { - const ConstantExpr *V = Stack.pop_back_val(); - if (!Visited.insert(V).second) - continue; - - VerifyConstantExprBitcastType(V); - - for (unsigned I = 0, N = V->getNumOperands(); I != N; ++I) { - if (ConstantExpr *Op = dyn_cast(V->getOperand(I))) - Stack.push_back(Op); - } - } + visitConstantExprsRecursively(CE); } } } diff --git a/lib/LTO/LTOCodeGenerator.cpp b/lib/LTO/LTOCodeGenerator.cpp index 468ec24e3a06..bf3cde59443d 100644 --- a/lib/LTO/LTOCodeGenerator.cpp +++ b/lib/LTO/LTOCodeGenerator.cpp @@ -64,17 +64,11 @@ const char* LTOCodeGenerator::getVersionString() { #endif } -LTOCodeGenerator::LTOCodeGenerator() - : Context(getGlobalContext()), - MergedModule(new Module("ld-temp.o", Context)), - IRLinker(new Linker(*MergedModule)) { - initializeLTOPasses(); -} - -LTOCodeGenerator::LTOCodeGenerator(std::unique_ptr Context) - : OwnedContext(std::move(Context)), Context(*OwnedContext), - MergedModule(new Module("ld-temp.o", *OwnedContext)), - IRLinker(new Linker(*MergedModule)) { +LTOCodeGenerator::LTOCodeGenerator(LLVMContext &Context) + : Context(Context), MergedModule(new Module("ld-temp.o", Context)), + IRLinker(new Linker(*MergedModule, [this](const DiagnosticInfo &DI) { + MergedModule->getContext().diagnose(DI); + })) { initializeLTOPasses(); } @@ -130,7 +124,8 @@ void LTOCodeGenerator::setModule(std::unique_ptr Mod) { AsmUndefinedRefs.clear(); MergedModule = Mod->takeModule(); - IRLinker = make_unique(*MergedModule); + IRLinker = llvm::make_unique(*MergedModule, + IRLinker->getDiagnosticHandler()); const std::vector &Undefs = Mod->getAsmUndefinedRefs(); for (int I = 0, E = Undefs.size(); I != E; ++I) diff --git a/lib/LTO/LTOModule.cpp b/lib/LTO/LTOModule.cpp index d28563c23b80..a6a3002e457d 100644 --- a/lib/LTO/LTOModule.cpp +++ b/lib/LTO/LTOModule.cpp @@ -100,89 +100,72 @@ std::string LTOModule::getProducerString(MemoryBuffer *Buffer) { return getBitcodeProducerString(*BCOrErr, Context); } -LTOModule *LTOModule::createFromFile(const char *path, TargetOptions options, - std::string &errMsg) { +ErrorOr> +LTOModule::createFromFile(LLVMContext &Context, const char *path, + TargetOptions 
options) { ErrorOr> BufferOrErr = MemoryBuffer::getFile(path); - if (std::error_code EC = BufferOrErr.getError()) { - errMsg = EC.message(); - return nullptr; - } + if (std::error_code EC = BufferOrErr.getError()) + return EC; std::unique_ptr Buffer = std::move(BufferOrErr.get()); - return makeLTOModule(Buffer->getMemBufferRef(), options, errMsg, - &getGlobalContext()); + return makeLTOModule(Buffer->getMemBufferRef(), options, &Context); } -LTOModule *LTOModule::createFromOpenFile(int fd, const char *path, size_t size, - TargetOptions options, - std::string &errMsg) { - return createFromOpenFileSlice(fd, path, size, 0, options, errMsg); +ErrorOr> +LTOModule::createFromOpenFile(LLVMContext &Context, int fd, const char *path, + size_t size, TargetOptions options) { + return createFromOpenFileSlice(Context, fd, path, size, 0, options); } -LTOModule *LTOModule::createFromOpenFileSlice(int fd, const char *path, - size_t map_size, off_t offset, - TargetOptions options, - std::string &errMsg) { +ErrorOr> +LTOModule::createFromOpenFileSlice(LLVMContext &Context, int fd, + const char *path, size_t map_size, + off_t offset, TargetOptions options) { ErrorOr> BufferOrErr = MemoryBuffer::getOpenFileSlice(fd, path, map_size, offset); - if (std::error_code EC = BufferOrErr.getError()) { - errMsg = EC.message(); - return nullptr; - } + if (std::error_code EC = BufferOrErr.getError()) + return EC; std::unique_ptr Buffer = std::move(BufferOrErr.get()); - return makeLTOModule(Buffer->getMemBufferRef(), options, errMsg, - &getGlobalContext()); + return makeLTOModule(Buffer->getMemBufferRef(), options, &Context); } -LTOModule *LTOModule::createFromBuffer(const void *mem, size_t length, - TargetOptions options, - std::string &errMsg, StringRef path) { - return createInContext(mem, length, options, errMsg, path, - &getGlobalContext()); +ErrorOr> +LTOModule::createFromBuffer(LLVMContext &Context, const void *mem, + size_t length, TargetOptions options, + StringRef path) { + return createInContext(mem, length, options, path, &Context); } -LTOModule *LTOModule::createInLocalContext(const void *mem, size_t length, - TargetOptions options, - std::string &errMsg, - StringRef path) { - return createInContext(mem, length, options, errMsg, path, nullptr); +ErrorOr> +LTOModule::createInLocalContext(const void *mem, size_t length, + TargetOptions options, StringRef path) { + return createInContext(mem, length, options, path, nullptr); } -LTOModule *LTOModule::createInContext(const void *mem, size_t length, - TargetOptions options, - std::string &errMsg, StringRef path, - LLVMContext *Context) { +ErrorOr> +LTOModule::createInContext(const void *mem, size_t length, + TargetOptions options, StringRef path, + LLVMContext *Context) { StringRef Data((const char *)mem, length); MemoryBufferRef Buffer(Data, path); - return makeLTOModule(Buffer, options, errMsg, Context); + return makeLTOModule(Buffer, options, Context); } -static std::unique_ptr parseBitcodeFileImpl(MemoryBufferRef Buffer, - LLVMContext &Context, - bool ShouldBeLazy, - std::string &ErrMsg) { +static ErrorOr> +parseBitcodeFileImpl(MemoryBufferRef Buffer, LLVMContext &Context, + bool ShouldBeLazy) { // Find the buffer. 
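// Caller-side sketch for the reworked factory functions above: errors now come
// back as a std::error_code through ErrorOr instead of the old errMsg string,
// and parse-time diagnostics go to the LLVMContext's diagnostic handler. The
// header paths and the input file name are assumptions made for the sketch.

#include "llvm/IR/LLVMContext.h"
#include "llvm/LTO/LTOModule.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetOptions.h"
#include <memory>

static std::unique_ptr<llvm::LTOModule> loadForLTO(llvm::LLVMContext &Ctx) {
  llvm::TargetOptions Options;
  llvm::ErrorOr<std::unique_ptr<llvm::LTOModule>> ModOrErr =
      llvm::LTOModule::createFromFile(Ctx, "input.o", Options);
  if (std::error_code EC = ModOrErr.getError()) {
    llvm::errs() << "input.o: " << EC.message() << "\n";
    return nullptr;
  }
  return std::move(*ModOrErr);
}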
ErrorOr MBOrErr = IRObjectFile::findBitcodeInMemBuffer(Buffer); - if (std::error_code EC = MBOrErr.getError()) { - ErrMsg = EC.message(); - return nullptr; - } - - std::function DiagnosticHandler = - [&ErrMsg](const DiagnosticInfo &DI) { - raw_string_ostream Stream(ErrMsg); - DiagnosticPrinterRawOStream DP(Stream); - DI.print(DP); - }; + if (std::error_code EC = MBOrErr.getError()) + return EC; if (!ShouldBeLazy) { // Parse the full file. - ErrorOr> M = - parseBitcodeFile(*MBOrErr, Context, DiagnosticHandler); - if (!M) - return nullptr; + ErrorOr> M = parseBitcodeFile(*MBOrErr, Context); + if (std::error_code EC = M.getError()) + return EC; return std::move(*M); } @@ -190,16 +173,16 @@ static std::unique_ptr parseBitcodeFileImpl(MemoryBufferRef Buffer, std::unique_ptr LightweightBuf = MemoryBuffer::getMemBuffer(*MBOrErr, false); ErrorOr> M = - getLazyBitcodeModule(std::move(LightweightBuf), Context, - DiagnosticHandler, true /*ShouldLazyLoadMetadata*/); - if (!M) - return nullptr; + getLazyBitcodeModule(std::move(LightweightBuf), Context, nullptr, + true /*ShouldLazyLoadMetadata*/); + if (std::error_code EC = M.getError()) + return EC; return std::move(*M); } -LTOModule *LTOModule::makeLTOModule(MemoryBufferRef Buffer, - TargetOptions options, std::string &errMsg, - LLVMContext *Context) { +ErrorOr> +LTOModule::makeLTOModule(MemoryBufferRef Buffer, TargetOptions options, + LLVMContext *Context) { std::unique_ptr OwnedContext; if (!Context) { OwnedContext = llvm::make_unique(); @@ -208,11 +191,12 @@ LTOModule *LTOModule::makeLTOModule(MemoryBufferRef Buffer, // If we own a context, we know this is being used only for symbol // extraction, not linking. Be lazy in that case. - std::unique_ptr M = parseBitcodeFileImpl( - Buffer, *Context, - /* ShouldBeLazy */ static_cast(OwnedContext), errMsg); - if (!M) - return nullptr; + ErrorOr> MOrErr = + parseBitcodeFileImpl(Buffer, *Context, + /* ShouldBeLazy */ static_cast(OwnedContext)); + if (std::error_code EC = MOrErr.getError()) + return EC; + std::unique_ptr &M = *MOrErr; std::string TripleStr = M->getTargetTriple(); if (TripleStr.empty()) @@ -220,9 +204,10 @@ LTOModule *LTOModule::makeLTOModule(MemoryBufferRef Buffer, llvm::Triple Triple(TripleStr); // find machine architecture for this module + std::string errMsg; const Target *march = TargetRegistry::lookupTarget(TripleStr, errMsg); if (!march) - return nullptr; + return std::unique_ptr(nullptr); // construct LTOModule, hand over ownership of module and target SubtargetFeatures Features; @@ -246,20 +231,16 @@ LTOModule *LTOModule::makeLTOModule(MemoryBufferRef Buffer, std::unique_ptr IRObj( new object::IRObjectFile(Buffer, std::move(M))); - LTOModule *Ret; + std::unique_ptr Ret; if (OwnedContext) - Ret = new LTOModule(std::move(IRObj), target, std::move(OwnedContext)); + Ret.reset(new LTOModule(std::move(IRObj), target, std::move(OwnedContext))); else - Ret = new LTOModule(std::move(IRObj), target); - - if (Ret->parseSymbols(errMsg)) { - delete Ret; - return nullptr; - } + Ret.reset(new LTOModule(std::move(IRObj), target)); + Ret->parseSymbols(); Ret->parseMetadata(); - return Ret; + return std::move(Ret); } /// Create a MemoryBuffer from a memory range with an optional name. @@ -592,9 +573,7 @@ void LTOModule::addPotentialUndefinedSymbol(const object::BasicSymbolRef &Sym, info.symbol = decl; } -/// parseSymbols - Parse the symbols from the module and model-level ASM and add -/// them to either the defined or undefined lists. 
-bool LTOModule::parseSymbols(std::string &errMsg) { +void LTOModule::parseSymbols() { for (auto &Sym : IRFile->symbols()) { const GlobalValue *GV = IRFile->getSymbolGV(Sym.getRawDataRefImpl()); uint32_t Flags = Sym.getFlags(); @@ -649,8 +628,6 @@ bool LTOModule::parseSymbols(std::string &errMsg) { NameAndAttributes info = u->getValue(); _symbols.push_back(info); } - - return false; } /// parseMetadata - Parse metadata from the module diff --git a/lib/Linker/CMakeLists.txt b/lib/Linker/CMakeLists.txt index f9d8e0925ae3..8916fb3f7251 100644 --- a/lib/Linker/CMakeLists.txt +++ b/lib/Linker/CMakeLists.txt @@ -1,4 +1,5 @@ add_llvm_library(LLVMLinker + IRMover.cpp LinkModules.cpp ADDITIONAL_HEADER_DIRS diff --git a/lib/Linker/IRMover.cpp b/lib/Linker/IRMover.cpp new file mode 100644 index 000000000000..251cfb71894b --- /dev/null +++ b/lib/Linker/IRMover.cpp @@ -0,0 +1,1398 @@ +//===- lib/Linker/IRMover.cpp ---------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Linker/IRMover.h" +#include "LinkDiagnosticInfo.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/Triple.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DiagnosticPrinter.h" +#include "llvm/IR/TypeFinder.h" +#include "llvm/Transforms/Utils/Cloning.h" +using namespace llvm; + +//===----------------------------------------------------------------------===// +// TypeMap implementation. +//===----------------------------------------------------------------------===// + +namespace { +class TypeMapTy : public ValueMapTypeRemapper { + /// This is a mapping from a source type to a destination type to use. + DenseMap MappedTypes; + + /// When checking to see if two subgraphs are isomorphic, we speculatively + /// add types to MappedTypes, but keep track of them here in case we need to + /// roll back. + SmallVector SpeculativeTypes; + + SmallVector SpeculativeDstOpaqueTypes; + + /// This is a list of non-opaque structs in the source module that are mapped + /// to an opaque struct in the destination module. + SmallVector SrcDefinitionsToResolve; + + /// This is the set of opaque types in the destination modules who are + /// getting a body from the source module. + SmallPtrSet DstResolvedOpaqueTypes; + +public: + TypeMapTy(IRMover::IdentifiedStructTypeSet &DstStructTypesSet) + : DstStructTypesSet(DstStructTypesSet) {} + + IRMover::IdentifiedStructTypeSet &DstStructTypesSet; + /// Indicate that the specified type in the destination module is conceptually + /// equivalent to the specified type in the source module. + void addTypeMapping(Type *DstTy, Type *SrcTy); + + /// Produce a body for an opaque type in the dest module from a type + /// definition in the source module. + void linkDefinedTypeBodies(); + + /// Return the mapped type to use for the specified input type from the + /// source module. 
+ Type *get(Type *SrcTy); + Type *get(Type *SrcTy, SmallPtrSet &Visited); + + void finishType(StructType *DTy, StructType *STy, ArrayRef ETypes); + + FunctionType *get(FunctionType *T) { + return cast(get((Type *)T)); + } + +private: + Type *remapType(Type *SrcTy) override { return get(SrcTy); } + + bool areTypesIsomorphic(Type *DstTy, Type *SrcTy); +}; +} + +void TypeMapTy::addTypeMapping(Type *DstTy, Type *SrcTy) { + assert(SpeculativeTypes.empty()); + assert(SpeculativeDstOpaqueTypes.empty()); + + // Check to see if these types are recursively isomorphic and establish a + // mapping between them if so. + if (!areTypesIsomorphic(DstTy, SrcTy)) { + // Oops, they aren't isomorphic. Just discard this request by rolling out + // any speculative mappings we've established. + for (Type *Ty : SpeculativeTypes) + MappedTypes.erase(Ty); + + SrcDefinitionsToResolve.resize(SrcDefinitionsToResolve.size() - + SpeculativeDstOpaqueTypes.size()); + for (StructType *Ty : SpeculativeDstOpaqueTypes) + DstResolvedOpaqueTypes.erase(Ty); + } else { + for (Type *Ty : SpeculativeTypes) + if (auto *STy = dyn_cast(Ty)) + if (STy->hasName()) + STy->setName(""); + } + SpeculativeTypes.clear(); + SpeculativeDstOpaqueTypes.clear(); +} + +/// Recursively walk this pair of types, returning true if they are isomorphic, +/// false if they are not. +bool TypeMapTy::areTypesIsomorphic(Type *DstTy, Type *SrcTy) { + // Two types with differing kinds are clearly not isomorphic. + if (DstTy->getTypeID() != SrcTy->getTypeID()) + return false; + + // If we have an entry in the MappedTypes table, then we have our answer. + Type *&Entry = MappedTypes[SrcTy]; + if (Entry) + return Entry == DstTy; + + // Two identical types are clearly isomorphic. Remember this + // non-speculatively. + if (DstTy == SrcTy) { + Entry = DstTy; + return true; + } + + // Okay, we have two types with identical kinds that we haven't seen before. + + // If this is an opaque struct type, special case it. + if (StructType *SSTy = dyn_cast(SrcTy)) { + // Mapping an opaque type to any struct, just keep the dest struct. + if (SSTy->isOpaque()) { + Entry = DstTy; + SpeculativeTypes.push_back(SrcTy); + return true; + } + + // Mapping a non-opaque source type to an opaque dest. If this is the first + // type that we're mapping onto this destination type then we succeed. Keep + // the dest, but fill it in later. If this is the second (different) type + // that we're trying to map onto the same opaque type then we fail. + if (cast(DstTy)->isOpaque()) { + // We can only map one source type onto the opaque destination type. + if (!DstResolvedOpaqueTypes.insert(cast(DstTy)).second) + return false; + SrcDefinitionsToResolve.push_back(SSTy); + SpeculativeTypes.push_back(SrcTy); + SpeculativeDstOpaqueTypes.push_back(cast(DstTy)); + Entry = DstTy; + return true; + } + } + + // If the number of subtypes disagree between the two types, then we fail. + if (SrcTy->getNumContainedTypes() != DstTy->getNumContainedTypes()) + return false; + + // Fail if any of the extra properties (e.g. array size) of the type disagree. + if (isa(DstTy)) + return false; // bitwidth disagrees. 
+ if (PointerType *PT = dyn_cast(DstTy)) { + if (PT->getAddressSpace() != cast(SrcTy)->getAddressSpace()) + return false; + + } else if (FunctionType *FT = dyn_cast(DstTy)) { + if (FT->isVarArg() != cast(SrcTy)->isVarArg()) + return false; + } else if (StructType *DSTy = dyn_cast(DstTy)) { + StructType *SSTy = cast(SrcTy); + if (DSTy->isLiteral() != SSTy->isLiteral() || + DSTy->isPacked() != SSTy->isPacked()) + return false; + } else if (ArrayType *DATy = dyn_cast(DstTy)) { + if (DATy->getNumElements() != cast(SrcTy)->getNumElements()) + return false; + } else if (VectorType *DVTy = dyn_cast(DstTy)) { + if (DVTy->getNumElements() != cast(SrcTy)->getNumElements()) + return false; + } + + // Otherwise, we speculate that these two types will line up and recursively + // check the subelements. + Entry = DstTy; + SpeculativeTypes.push_back(SrcTy); + + for (unsigned I = 0, E = SrcTy->getNumContainedTypes(); I != E; ++I) + if (!areTypesIsomorphic(DstTy->getContainedType(I), + SrcTy->getContainedType(I))) + return false; + + // If everything seems to have lined up, then everything is great. + return true; +} + +void TypeMapTy::linkDefinedTypeBodies() { + SmallVector Elements; + for (StructType *SrcSTy : SrcDefinitionsToResolve) { + StructType *DstSTy = cast(MappedTypes[SrcSTy]); + assert(DstSTy->isOpaque()); + + // Map the body of the source type over to a new body for the dest type. + Elements.resize(SrcSTy->getNumElements()); + for (unsigned I = 0, E = Elements.size(); I != E; ++I) + Elements[I] = get(SrcSTy->getElementType(I)); + + DstSTy->setBody(Elements, SrcSTy->isPacked()); + DstStructTypesSet.switchToNonOpaque(DstSTy); + } + SrcDefinitionsToResolve.clear(); + DstResolvedOpaqueTypes.clear(); +} + +void TypeMapTy::finishType(StructType *DTy, StructType *STy, + ArrayRef ETypes) { + DTy->setBody(ETypes, STy->isPacked()); + + // Steal STy's name. + if (STy->hasName()) { + SmallString<16> TmpName = STy->getName(); + STy->setName(""); + DTy->setName(TmpName); + } + + DstStructTypesSet.addNonOpaque(DTy); +} + +Type *TypeMapTy::get(Type *Ty) { + SmallPtrSet Visited; + return get(Ty, Visited); +} + +Type *TypeMapTy::get(Type *Ty, SmallPtrSet &Visited) { + // If we already have an entry for this type, return it. + Type **Entry = &MappedTypes[Ty]; + if (*Entry) + return *Entry; + + // These are types that LLVM itself will unique. + bool IsUniqued = !isa(Ty) || cast(Ty)->isLiteral(); + +#ifndef NDEBUG + if (!IsUniqued) { + for (auto &Pair : MappedTypes) { + assert(!(Pair.first != Ty && Pair.second == Ty) && + "mapping to a source type"); + } + } +#endif + + if (!IsUniqued && !Visited.insert(cast(Ty)).second) { + StructType *DTy = StructType::create(Ty->getContext()); + return *Entry = DTy; + } + + // If this is not a recursive type, then just map all of the elements and + // then rebuild the type from inside out. + SmallVector ElementTypes; + + // If there are no element types to map, then the type is itself. This is + // true for the anonymous {} struct, things like 'float', integers, etc. + if (Ty->getNumContainedTypes() == 0 && IsUniqued) + return *Entry = Ty; + + // Remap all of the elements, keeping track of whether any of them change. + bool AnyChange = false; + ElementTypes.resize(Ty->getNumContainedTypes()); + for (unsigned I = 0, E = Ty->getNumContainedTypes(); I != E; ++I) { + ElementTypes[I] = get(Ty->getContainedType(I), Visited); + AnyChange |= ElementTypes[I] != Ty->getContainedType(I); + } + + // If we found our type while recursively processing stuff, just use it. 
+ Entry = &MappedTypes[Ty]; + if (*Entry) { + if (auto *DTy = dyn_cast(*Entry)) { + if (DTy->isOpaque()) { + auto *STy = cast(Ty); + finishType(DTy, STy, ElementTypes); + } + } + return *Entry; + } + + // If all of the element types mapped directly over and the type is not + // a nomed struct, then the type is usable as-is. + if (!AnyChange && IsUniqued) + return *Entry = Ty; + + // Otherwise, rebuild a modified type. + switch (Ty->getTypeID()) { + default: + llvm_unreachable("unknown derived type to remap"); + case Type::ArrayTyID: + return *Entry = ArrayType::get(ElementTypes[0], + cast(Ty)->getNumElements()); + case Type::VectorTyID: + return *Entry = VectorType::get(ElementTypes[0], + cast(Ty)->getNumElements()); + case Type::PointerTyID: + return *Entry = PointerType::get(ElementTypes[0], + cast(Ty)->getAddressSpace()); + case Type::FunctionTyID: + return *Entry = FunctionType::get(ElementTypes[0], + makeArrayRef(ElementTypes).slice(1), + cast(Ty)->isVarArg()); + case Type::StructTyID: { + auto *STy = cast(Ty); + bool IsPacked = STy->isPacked(); + if (IsUniqued) + return *Entry = StructType::get(Ty->getContext(), ElementTypes, IsPacked); + + // If the type is opaque, we can just use it directly. + if (STy->isOpaque()) { + DstStructTypesSet.addOpaque(STy); + return *Entry = Ty; + } + + if (StructType *OldT = + DstStructTypesSet.findNonOpaque(ElementTypes, IsPacked)) { + STy->setName(""); + return *Entry = OldT; + } + + if (!AnyChange) { + DstStructTypesSet.addNonOpaque(STy); + return *Entry = Ty; + } + + StructType *DTy = StructType::create(Ty->getContext()); + finishType(DTy, STy, ElementTypes); + return *Entry = DTy; + } + } +} + +LinkDiagnosticInfo::LinkDiagnosticInfo(DiagnosticSeverity Severity, + const Twine &Msg) + : DiagnosticInfo(DK_Linker, Severity), Msg(Msg) {} +void LinkDiagnosticInfo::print(DiagnosticPrinter &DP) const { DP << Msg; } + +//===----------------------------------------------------------------------===// +// ModuleLinker implementation. +//===----------------------------------------------------------------------===// + +namespace { +class IRLinker; + +/// Creates prototypes for functions that are lazily linked on the fly. This +/// speeds up linking for modules with many/ lazily linked functions of which +/// few get used. +class GlobalValueMaterializer final : public ValueMaterializer { + IRLinker *ModLinker; + +public: + GlobalValueMaterializer(IRLinker *ModLinker) : ModLinker(ModLinker) {} + Value *materializeDeclFor(Value *V) override; + void materializeInitFor(GlobalValue *New, GlobalValue *Old) override; +}; + +class LocalValueMaterializer final : public ValueMaterializer { + IRLinker *ModLinker; + +public: + LocalValueMaterializer(IRLinker *ModLinker) : ModLinker(ModLinker) {} + Value *materializeDeclFor(Value *V) override; + void materializeInitFor(GlobalValue *New, GlobalValue *Old) override; +}; + +/// This is responsible for keeping track of the state used for moving data +/// from SrcM to DstM. +class IRLinker { + Module &DstM; + Module &SrcM; + + std::function AddLazyFor; + + TypeMapTy TypeMap; + GlobalValueMaterializer GValMaterializer; + LocalValueMaterializer LValMaterializer; + + /// Mapping of values from what they used to be in Src, to what they are now + /// in DstM. ValueToValueMapTy is a ValueMap, which involves some overhead + /// due to the use of Value handles which the Linker doesn't actually need, + /// but this allows us to reuse the ValueMapper code. 
+ ValueToValueMapTy ValueMap; + ValueToValueMapTy AliasValueMap; + + DenseSet ValuesToLink; + std::vector Worklist; + + void maybeAdd(GlobalValue *GV) { + if (ValuesToLink.insert(GV).second) + Worklist.push_back(GV); + } + + DiagnosticHandlerFunction DiagnosticHandler; + + /// Set to true when all global value body linking is complete (including + /// lazy linking). Used to prevent metadata linking from creating new + /// references. + bool DoneLinkingBodies = false; + + bool HasError = false; + + /// Handles cloning of a global values from the source module into + /// the destination module, including setting the attributes and visibility. + GlobalValue *copyGlobalValueProto(const GlobalValue *SGV, bool ForDefinition); + + /// Helper method for setting a message and returning an error code. + bool emitError(const Twine &Message) { + DiagnosticHandler(LinkDiagnosticInfo(DS_Error, Message)); + HasError = true; + return true; + } + + void emitWarning(const Twine &Message) { + DiagnosticHandler(LinkDiagnosticInfo(DS_Warning, Message)); + } + + /// Given a global in the source module, return the global in the + /// destination module that is being linked to, if any. + GlobalValue *getLinkedToGlobal(const GlobalValue *SrcGV) { + // If the source has no name it can't link. If it has local linkage, + // there is no name match-up going on. + if (!SrcGV->hasName() || SrcGV->hasLocalLinkage()) + return nullptr; + + // Otherwise see if we have a match in the destination module's symtab. + GlobalValue *DGV = DstM.getNamedValue(SrcGV->getName()); + if (!DGV) + return nullptr; + + // If we found a global with the same name in the dest module, but it has + // internal linkage, we are really not doing any linkage here. + if (DGV->hasLocalLinkage()) + return nullptr; + + // Otherwise, we do in fact link to the destination global. + return DGV; + } + + void computeTypeMapping(); + + Constant *linkAppendingVarProto(GlobalVariable *DstGV, + const GlobalVariable *SrcGV); + + bool shouldLink(GlobalValue *DGV, GlobalValue &SGV); + Constant *linkGlobalValueProto(GlobalValue *GV, bool ForAlias); + + bool linkModuleFlagsMetadata(); + + void linkGlobalInit(GlobalVariable &Dst, GlobalVariable &Src); + bool linkFunctionBody(Function &Dst, Function &Src); + void linkAliasBody(GlobalAlias &Dst, GlobalAlias &Src); + bool linkGlobalValueBody(GlobalValue &Dst, GlobalValue &Src); + + /// Functions that take care of cloning a specific global value type + /// into the destination module. + GlobalVariable *copyGlobalVariableProto(const GlobalVariable *SGVar); + Function *copyFunctionProto(const Function *SF); + GlobalValue *copyGlobalAliasProto(const GlobalAlias *SGA); + + void linkNamedMDNodes(); + +public: + IRLinker(Module &DstM, IRMover::IdentifiedStructTypeSet &Set, Module &SrcM, + DiagnosticHandlerFunction DiagnosticHandler, + ArrayRef ValuesToLink, + std::function AddLazyFor) + : DstM(DstM), SrcM(SrcM), AddLazyFor(AddLazyFor), TypeMap(Set), + GValMaterializer(this), LValMaterializer(this), + DiagnosticHandler(DiagnosticHandler) { + for (GlobalValue *GV : ValuesToLink) + maybeAdd(GV); + } + + bool run(); + Value *materializeDeclFor(Value *V, bool ForAlias); + void materializeInitFor(GlobalValue *New, GlobalValue *Old, bool ForAlias); +}; +} + +/// The LLVM SymbolTable class autorenames globals that conflict in the symbol +/// table. This is good for all clients except for us. Go through the trouble +/// to force this back. 
+static void forceRenaming(GlobalValue *GV, StringRef Name) { + // If the global doesn't force its name or if it already has the right name, + // there is nothing for us to do. + if (GV->hasLocalLinkage() || GV->getName() == Name) + return; + + Module *M = GV->getParent(); + + // If there is a conflict, rename the conflict. + if (GlobalValue *ConflictGV = M->getNamedValue(Name)) { + GV->takeName(ConflictGV); + ConflictGV->setName(Name); // This will cause ConflictGV to get renamed + assert(ConflictGV->getName() != Name && "forceRenaming didn't work"); + } else { + GV->setName(Name); // Force the name back + } +} + +Value *GlobalValueMaterializer::materializeDeclFor(Value *V) { + return ModLinker->materializeDeclFor(V, false); +} + +void GlobalValueMaterializer::materializeInitFor(GlobalValue *New, + GlobalValue *Old) { + ModLinker->materializeInitFor(New, Old, false); +} + +Value *LocalValueMaterializer::materializeDeclFor(Value *V) { + return ModLinker->materializeDeclFor(V, true); +} + +void LocalValueMaterializer::materializeInitFor(GlobalValue *New, + GlobalValue *Old) { + ModLinker->materializeInitFor(New, Old, true); +} + +Value *IRLinker::materializeDeclFor(Value *V, bool ForAlias) { + auto *SGV = dyn_cast(V); + if (!SGV) + return nullptr; + + return linkGlobalValueProto(SGV, ForAlias); +} + +void IRLinker::materializeInitFor(GlobalValue *New, GlobalValue *Old, + bool ForAlias) { + // If we already created the body, just return. + if (auto *F = dyn_cast(New)) { + if (!F->isDeclaration()) + return; + } else if (auto *V = dyn_cast(New)) { + if (V->hasInitializer()) + return; + } else { + auto *A = cast(New); + if (A->getAliasee()) + return; + } + + if (ForAlias || shouldLink(New, *Old)) + linkGlobalValueBody(*New, *Old); +} + +/// Loop through the global variables in the src module and merge them into the +/// dest module. +GlobalVariable *IRLinker::copyGlobalVariableProto(const GlobalVariable *SGVar) { + // No linking to be performed or linking from the source: simply create an + // identical version of the symbol over in the dest module... the + // initializer will be filled in later by LinkGlobalInits. + GlobalVariable *NewDGV = + new GlobalVariable(DstM, TypeMap.get(SGVar->getType()->getElementType()), + SGVar->isConstant(), GlobalValue::ExternalLinkage, + /*init*/ nullptr, SGVar->getName(), + /*insertbefore*/ nullptr, SGVar->getThreadLocalMode(), + SGVar->getType()->getAddressSpace()); + NewDGV->setAlignment(SGVar->getAlignment()); + return NewDGV; +} + +/// Link the function in the source module into the destination module if +/// needed, setting up mapping information. +Function *IRLinker::copyFunctionProto(const Function *SF) { + // If there is no linkage to be performed or we are linking from the source, + // bring SF over. + return Function::Create(TypeMap.get(SF->getFunctionType()), + GlobalValue::ExternalLinkage, SF->getName(), &DstM); +} + +/// Set up prototypes for any aliases that come over from the source module. +GlobalValue *IRLinker::copyGlobalAliasProto(const GlobalAlias *SGA) { + // If there is no linkage to be performed or we're linking from the source, + // bring over SGA. 
+ auto *Ty = TypeMap.get(SGA->getValueType()); + return GlobalAlias::create(Ty, SGA->getType()->getPointerAddressSpace(), + GlobalValue::ExternalLinkage, SGA->getName(), + &DstM); +} + +GlobalValue *IRLinker::copyGlobalValueProto(const GlobalValue *SGV, + bool ForDefinition) { + GlobalValue *NewGV; + if (auto *SGVar = dyn_cast(SGV)) { + NewGV = copyGlobalVariableProto(SGVar); + } else if (auto *SF = dyn_cast(SGV)) { + NewGV = copyFunctionProto(SF); + } else { + if (ForDefinition) + NewGV = copyGlobalAliasProto(cast(SGV)); + else + NewGV = new GlobalVariable( + DstM, TypeMap.get(SGV->getType()->getElementType()), + /*isConstant*/ false, GlobalValue::ExternalLinkage, + /*init*/ nullptr, SGV->getName(), + /*insertbefore*/ nullptr, SGV->getThreadLocalMode(), + SGV->getType()->getAddressSpace()); + } + + if (ForDefinition) + NewGV->setLinkage(SGV->getLinkage()); + else if (SGV->hasExternalWeakLinkage() || SGV->hasWeakLinkage() || + SGV->hasLinkOnceLinkage()) + NewGV->setLinkage(GlobalValue::ExternalWeakLinkage); + + NewGV->copyAttributesFrom(SGV); + return NewGV; +} + +/// Loop over all of the linked values to compute type mappings. For example, +/// if we link "extern Foo *x" and "Foo *x = NULL", then we have two struct +/// types 'Foo' but one got renamed when the module was loaded into the same +/// LLVMContext. +void IRLinker::computeTypeMapping() { + for (GlobalValue &SGV : SrcM.globals()) { + GlobalValue *DGV = getLinkedToGlobal(&SGV); + if (!DGV) + continue; + + if (!DGV->hasAppendingLinkage() || !SGV.hasAppendingLinkage()) { + TypeMap.addTypeMapping(DGV->getType(), SGV.getType()); + continue; + } + + // Unify the element type of appending arrays. + ArrayType *DAT = cast(DGV->getType()->getElementType()); + ArrayType *SAT = cast(SGV.getType()->getElementType()); + TypeMap.addTypeMapping(DAT->getElementType(), SAT->getElementType()); + } + + for (GlobalValue &SGV : SrcM) + if (GlobalValue *DGV = getLinkedToGlobal(&SGV)) + TypeMap.addTypeMapping(DGV->getType(), SGV.getType()); + + for (GlobalValue &SGV : SrcM.aliases()) + if (GlobalValue *DGV = getLinkedToGlobal(&SGV)) + TypeMap.addTypeMapping(DGV->getType(), SGV.getType()); + + // Incorporate types by name, scanning all the types in the source module. + // At this point, the destination module may have a type "%foo = { i32 }" for + // example. When the source module got loaded into the same LLVMContext, if + // it had the same type, it would have been renamed to "%foo.42 = { i32 }". + std::vector Types = SrcM.getIdentifiedStructTypes(); + for (StructType *ST : Types) { + if (!ST->hasName()) + continue; + + // Check to see if there is a dot in the name followed by a digit. + size_t DotPos = ST->getName().rfind('.'); + if (DotPos == 0 || DotPos == StringRef::npos || + ST->getName().back() == '.' || + !isdigit(static_cast(ST->getName()[DotPos + 1]))) + continue; + + // Check to see if the destination module has a struct with the prefix name. + StructType *DST = DstM.getTypeByName(ST->getName().substr(0, DotPos)); + if (!DST) + continue; + + // Don't use it if this actually came from the source module. They're in + // the same LLVMContext after all. Also don't use it unless the type is + // actually used in the destination module. 
This can happen in situations + // like this: + // + // Module A Module B + // -------- -------- + // %Z = type { %A } %B = type { %C.1 } + // %A = type { %B.1, [7 x i8] } %C.1 = type { i8* } + // %B.1 = type { %C } %A.2 = type { %B.3, [5 x i8] } + // %C = type { i8* } %B.3 = type { %C.1 } + // + // When we link Module B with Module A, the '%B' in Module B is + // used. However, that would then use '%C.1'. But when we process '%C.1', + // we prefer to take the '%C' version. So we are then left with both + // '%C.1' and '%C' being used for the same types. This leads to some + // variables using one type and some using the other. + if (TypeMap.DstStructTypesSet.hasType(DST)) + TypeMap.addTypeMapping(DST, ST); + } + + // Now that we have discovered all of the type equivalences, get a body for + // any 'opaque' types in the dest module that are now resolved. + TypeMap.linkDefinedTypeBodies(); +} + +static void getArrayElements(const Constant *C, + SmallVectorImpl &Dest) { + unsigned NumElements = cast(C->getType())->getNumElements(); + + for (unsigned i = 0; i != NumElements; ++i) + Dest.push_back(C->getAggregateElement(i)); +} + +/// If there were any appending global variables, link them together now. +/// Return true on error. +Constant *IRLinker::linkAppendingVarProto(GlobalVariable *DstGV, + const GlobalVariable *SrcGV) { + Type *EltTy = cast(TypeMap.get(SrcGV->getType()->getElementType())) + ->getElementType(); + + StringRef Name = SrcGV->getName(); + bool IsNewStructor = false; + bool IsOldStructor = false; + if (Name == "llvm.global_ctors" || Name == "llvm.global_dtors") { + if (cast(EltTy)->getNumElements() == 3) + IsNewStructor = true; + else + IsOldStructor = true; + } + + PointerType *VoidPtrTy = Type::getInt8Ty(SrcGV->getContext())->getPointerTo(); + if (IsOldStructor) { + auto &ST = *cast(EltTy); + Type *Tys[3] = {ST.getElementType(0), ST.getElementType(1), VoidPtrTy}; + EltTy = StructType::get(SrcGV->getContext(), Tys, false); + } + + if (DstGV) { + ArrayType *DstTy = cast(DstGV->getType()->getElementType()); + + if (!SrcGV->hasAppendingLinkage() || !DstGV->hasAppendingLinkage()) { + emitError( + "Linking globals named '" + SrcGV->getName() + + "': can only link appending global with another appending global!"); + return nullptr; + } + + // Check to see that they two arrays agree on type. 
+ if (EltTy != DstTy->getElementType()) { + emitError("Appending variables with different element types!"); + return nullptr; + } + if (DstGV->isConstant() != SrcGV->isConstant()) { + emitError("Appending variables linked with different const'ness!"); + return nullptr; + } + + if (DstGV->getAlignment() != SrcGV->getAlignment()) { + emitError( + "Appending variables with different alignment need to be linked!"); + return nullptr; + } + + if (DstGV->getVisibility() != SrcGV->getVisibility()) { + emitError( + "Appending variables with different visibility need to be linked!"); + return nullptr; + } + + if (DstGV->hasUnnamedAddr() != SrcGV->hasUnnamedAddr()) { + emitError( + "Appending variables with different unnamed_addr need to be linked!"); + return nullptr; + } + + if (StringRef(DstGV->getSection()) != SrcGV->getSection()) { + emitError( + "Appending variables with different section name need to be linked!"); + return nullptr; + } + } + + SmallVector DstElements; + if (DstGV) + getArrayElements(DstGV->getInitializer(), DstElements); + + SmallVector SrcElements; + getArrayElements(SrcGV->getInitializer(), SrcElements); + + if (IsNewStructor) + SrcElements.erase( + std::remove_if(SrcElements.begin(), SrcElements.end(), + [this](Constant *E) { + auto *Key = dyn_cast( + E->getAggregateElement(2)->stripPointerCasts()); + if (!Key) + return false; + GlobalValue *DGV = getLinkedToGlobal(Key); + return !shouldLink(DGV, *Key); + }), + SrcElements.end()); + uint64_t NewSize = DstElements.size() + SrcElements.size(); + ArrayType *NewType = ArrayType::get(EltTy, NewSize); + + // Create the new global variable. + GlobalVariable *NG = new GlobalVariable( + DstM, NewType, SrcGV->isConstant(), SrcGV->getLinkage(), + /*init*/ nullptr, /*name*/ "", DstGV, SrcGV->getThreadLocalMode(), + SrcGV->getType()->getAddressSpace()); + + NG->copyAttributesFrom(SrcGV); + forceRenaming(NG, SrcGV->getName()); + + Constant *Ret = ConstantExpr::getBitCast(NG, TypeMap.get(SrcGV->getType())); + + // Stop recursion. + ValueMap[SrcGV] = Ret; + + for (auto *V : SrcElements) { + Constant *NewV; + if (IsOldStructor) { + auto *S = cast(V); + auto *E1 = MapValue(S->getOperand(0), ValueMap, RF_MoveDistinctMDs, + &TypeMap, &GValMaterializer); + auto *E2 = MapValue(S->getOperand(1), ValueMap, RF_MoveDistinctMDs, + &TypeMap, &GValMaterializer); + Value *Null = Constant::getNullValue(VoidPtrTy); + NewV = + ConstantStruct::get(cast(EltTy), E1, E2, Null, nullptr); + } else { + NewV = MapValue(V, ValueMap, RF_MoveDistinctMDs, &TypeMap, + &GValMaterializer); + } + DstElements.push_back(NewV); + } + + NG->setInitializer(ConstantArray::get(NewType, DstElements)); + + // Replace any uses of the two global variables with uses of the new + // global. 
+ if (DstGV) { + DstGV->replaceAllUsesWith(ConstantExpr::getBitCast(NG, DstGV->getType())); + DstGV->eraseFromParent(); + } + + return Ret; +} + +static bool useExistingDest(GlobalValue &SGV, GlobalValue *DGV, + bool ShouldLink) { + if (!DGV) + return false; + + if (SGV.isDeclaration()) + return true; + + if (DGV->isDeclarationForLinker() && !SGV.isDeclarationForLinker()) + return false; + + if (ShouldLink) + return false; + + return true; +} + +bool IRLinker::shouldLink(GlobalValue *DGV, GlobalValue &SGV) { + if (ValuesToLink.count(&SGV)) + return true; + + if (SGV.hasLocalLinkage()) + return true; + + if (DGV && !DGV->isDeclaration()) + return false; + + if (SGV.hasAvailableExternallyLinkage()) + return true; + + if (DoneLinkingBodies) + return false; + + AddLazyFor(SGV, [this](GlobalValue &GV) { maybeAdd(&GV); }); + return ValuesToLink.count(&SGV); +} + +Constant *IRLinker::linkGlobalValueProto(GlobalValue *SGV, bool ForAlias) { + GlobalValue *DGV = getLinkedToGlobal(SGV); + + bool ShouldLink = shouldLink(DGV, *SGV); + + // just missing from map + if (ShouldLink) { + auto I = ValueMap.find(SGV); + if (I != ValueMap.end()) + return cast(I->second); + + I = AliasValueMap.find(SGV); + if (I != AliasValueMap.end()) + return cast(I->second); + } + + DGV = nullptr; + if (ShouldLink || !ForAlias) + DGV = getLinkedToGlobal(SGV); + + // Handle the ultra special appending linkage case first. + assert(!DGV || SGV->hasAppendingLinkage() == DGV->hasAppendingLinkage()); + if (SGV->hasAppendingLinkage()) + return linkAppendingVarProto(cast_or_null(DGV), + cast(SGV)); + + GlobalValue *NewGV; + if (useExistingDest(*SGV, DGV, ShouldLink)) { + NewGV = DGV; + } else { + // If we are done linking global value bodies (i.e. we are performing + // metadata linking), don't link in the global value due to this + // reference, simply map it to null. + if (DoneLinkingBodies) + return nullptr; + + NewGV = copyGlobalValueProto(SGV, ShouldLink); + if (!ForAlias) + forceRenaming(NewGV, SGV->getName()); + } + if (ShouldLink || ForAlias) { + if (const Comdat *SC = SGV->getComdat()) { + if (auto *GO = dyn_cast(NewGV)) { + Comdat *DC = DstM.getOrInsertComdat(SC->getName()); + DC->setSelectionKind(SC->getSelectionKind()); + GO->setComdat(DC); + } + } + } + + if (!ShouldLink && ForAlias) + NewGV->setLinkage(GlobalValue::InternalLinkage); + + Constant *C = NewGV; + if (DGV) + C = ConstantExpr::getBitCast(NewGV, TypeMap.get(SGV->getType())); + + if (DGV && NewGV != DGV) { + DGV->replaceAllUsesWith(ConstantExpr::getBitCast(NewGV, DGV->getType())); + DGV->eraseFromParent(); + } + + return C; +} + +/// Update the initializers in the Dest module now that all globals that may be +/// referenced are in Dest. +void IRLinker::linkGlobalInit(GlobalVariable &Dst, GlobalVariable &Src) { + // Figure out what the initializer looks like in the dest module. + Dst.setInitializer(MapValue(Src.getInitializer(), ValueMap, + RF_MoveDistinctMDs, &TypeMap, &GValMaterializer)); +} + +/// Copy the source function over into the dest function and fix up references +/// to values. At this point we know that Dest is an external function, and +/// that Src is not. +bool IRLinker::linkFunctionBody(Function &Dst, Function &Src) { + assert(Dst.isDeclaration() && !Src.isDeclaration()); + + // Materialize if needed. + if (std::error_code EC = Src.materialize()) + return emitError(EC.message()); + + // Link in the prefix data. 
+ if (Src.hasPrefixData()) + Dst.setPrefixData(MapValue(Src.getPrefixData(), ValueMap, + RF_MoveDistinctMDs, &TypeMap, + &GValMaterializer)); + + // Link in the prologue data. + if (Src.hasPrologueData()) + Dst.setPrologueData(MapValue(Src.getPrologueData(), ValueMap, + RF_MoveDistinctMDs, &TypeMap, + &GValMaterializer)); + + // Link in the personality function. + if (Src.hasPersonalityFn()) + Dst.setPersonalityFn(MapValue(Src.getPersonalityFn(), ValueMap, + RF_MoveDistinctMDs, &TypeMap, + &GValMaterializer)); + + // Go through and convert function arguments over, remembering the mapping. + Function::arg_iterator DI = Dst.arg_begin(); + for (Argument &Arg : Src.args()) { + DI->setName(Arg.getName()); // Copy the name over. + + // Add a mapping to our mapping. + ValueMap[&Arg] = &*DI; + ++DI; + } + + // Copy over the metadata attachments. + SmallVector, 8> MDs; + Src.getAllMetadata(MDs); + for (const auto &I : MDs) + Dst.setMetadata(I.first, MapMetadata(I.second, ValueMap, RF_MoveDistinctMDs, + &TypeMap, &GValMaterializer)); + + // Splice the body of the source function into the dest function. + Dst.getBasicBlockList().splice(Dst.end(), Src.getBasicBlockList()); + + // At this point, all of the instructions and values of the function are now + // copied over. The only problem is that they are still referencing values in + // the Source function as operands. Loop through all of the operands of the + // functions and patch them up to point to the local versions. + for (BasicBlock &BB : Dst) + for (Instruction &I : BB) + RemapInstruction(&I, ValueMap, + RF_IgnoreMissingEntries | RF_MoveDistinctMDs, &TypeMap, + &GValMaterializer); + + // There is no need to map the arguments anymore. + for (Argument &Arg : Src.args()) + ValueMap.erase(&Arg); + + Src.dematerialize(); + return false; +} + +void IRLinker::linkAliasBody(GlobalAlias &Dst, GlobalAlias &Src) { + Constant *Aliasee = Src.getAliasee(); + Constant *Val = MapValue(Aliasee, AliasValueMap, RF_MoveDistinctMDs, &TypeMap, + &LValMaterializer); + Dst.setAliasee(Val); +} + +bool IRLinker::linkGlobalValueBody(GlobalValue &Dst, GlobalValue &Src) { + if (auto *F = dyn_cast(&Src)) + return linkFunctionBody(cast(Dst), *F); + if (auto *GVar = dyn_cast(&Src)) { + linkGlobalInit(cast(Dst), *GVar); + return false; + } + linkAliasBody(cast(Dst), cast(Src)); + return false; +} + +/// Insert all of the named MDNodes in Src into the Dest module. +void IRLinker::linkNamedMDNodes() { + const NamedMDNode *SrcModFlags = SrcM.getModuleFlagsMetadata(); + for (const NamedMDNode &NMD : SrcM.named_metadata()) { + // Don't link module flags here. Do them separately. + if (&NMD == SrcModFlags) + continue; + NamedMDNode *DestNMD = DstM.getOrInsertNamedMetadata(NMD.getName()); + // Add Src elements into Dest node. + for (const MDNode *op : NMD.operands()) + DestNMD->addOperand(MapMetadata( + op, ValueMap, RF_MoveDistinctMDs | RF_NullMapMissingGlobalValues, + &TypeMap, &GValMaterializer)); + } +} + +/// Merge the linker flags in Src into the Dest module. +bool IRLinker::linkModuleFlagsMetadata() { + // If the source module has no module flags, we are done. + const NamedMDNode *SrcModFlags = SrcM.getModuleFlagsMetadata(); + if (!SrcModFlags) + return false; + + // If the destination module doesn't have module flags yet, then just copy + // over the source module's flags. 
+ NamedMDNode *DstModFlags = DstM.getOrInsertModuleFlagsMetadata(); + if (DstModFlags->getNumOperands() == 0) { + for (unsigned I = 0, E = SrcModFlags->getNumOperands(); I != E; ++I) + DstModFlags->addOperand(SrcModFlags->getOperand(I)); + + return false; + } + + // First build a map of the existing module flags and requirements. + DenseMap> Flags; + SmallSetVector Requirements; + for (unsigned I = 0, E = DstModFlags->getNumOperands(); I != E; ++I) { + MDNode *Op = DstModFlags->getOperand(I); + ConstantInt *Behavior = mdconst::extract(Op->getOperand(0)); + MDString *ID = cast(Op->getOperand(1)); + + if (Behavior->getZExtValue() == Module::Require) { + Requirements.insert(cast(Op->getOperand(2))); + } else { + Flags[ID] = std::make_pair(Op, I); + } + } + + // Merge in the flags from the source module, and also collect its set of + // requirements. + for (unsigned I = 0, E = SrcModFlags->getNumOperands(); I != E; ++I) { + MDNode *SrcOp = SrcModFlags->getOperand(I); + ConstantInt *SrcBehavior = + mdconst::extract(SrcOp->getOperand(0)); + MDString *ID = cast(SrcOp->getOperand(1)); + MDNode *DstOp; + unsigned DstIndex; + std::tie(DstOp, DstIndex) = Flags.lookup(ID); + unsigned SrcBehaviorValue = SrcBehavior->getZExtValue(); + + // If this is a requirement, add it and continue. + if (SrcBehaviorValue == Module::Require) { + // If the destination module does not already have this requirement, add + // it. + if (Requirements.insert(cast(SrcOp->getOperand(2)))) { + DstModFlags->addOperand(SrcOp); + } + continue; + } + + // If there is no existing flag with this ID, just add it. + if (!DstOp) { + Flags[ID] = std::make_pair(SrcOp, DstModFlags->getNumOperands()); + DstModFlags->addOperand(SrcOp); + continue; + } + + // Otherwise, perform a merge. + ConstantInt *DstBehavior = + mdconst::extract(DstOp->getOperand(0)); + unsigned DstBehaviorValue = DstBehavior->getZExtValue(); + + // If either flag has override behavior, handle it first. + if (DstBehaviorValue == Module::Override) { + // Diagnose inconsistent flags which both have override behavior. + if (SrcBehaviorValue == Module::Override && + SrcOp->getOperand(2) != DstOp->getOperand(2)) { + emitError("linking module flags '" + ID->getString() + + "': IDs have conflicting override values"); + } + continue; + } else if (SrcBehaviorValue == Module::Override) { + // Update the destination flag to that of the source. + DstModFlags->setOperand(DstIndex, SrcOp); + Flags[ID].first = SrcOp; + continue; + } + + // Diagnose inconsistent merge behavior types. + if (SrcBehaviorValue != DstBehaviorValue) { + emitError("linking module flags '" + ID->getString() + + "': IDs have conflicting behaviors"); + continue; + } + + auto replaceDstValue = [&](MDNode *New) { + Metadata *FlagOps[] = {DstOp->getOperand(0), ID, New}; + MDNode *Flag = MDNode::get(DstM.getContext(), FlagOps); + DstModFlags->setOperand(DstIndex, Flag); + Flags[ID].first = Flag; + }; + + // Perform the merge for standard behavior types. + switch (SrcBehaviorValue) { + case Module::Require: + case Module::Override: + llvm_unreachable("not possible"); + case Module::Error: { + // Emit an error if the values differ. + if (SrcOp->getOperand(2) != DstOp->getOperand(2)) { + emitError("linking module flags '" + ID->getString() + + "': IDs have conflicting values"); + } + continue; + } + case Module::Warning: { + // Emit a warning if the values differ. 
+ if (SrcOp->getOperand(2) != DstOp->getOperand(2)) { + emitWarning("linking module flags '" + ID->getString() + + "': IDs have conflicting values"); + } + continue; + } + case Module::Append: { + MDNode *DstValue = cast(DstOp->getOperand(2)); + MDNode *SrcValue = cast(SrcOp->getOperand(2)); + SmallVector MDs; + MDs.reserve(DstValue->getNumOperands() + SrcValue->getNumOperands()); + MDs.append(DstValue->op_begin(), DstValue->op_end()); + MDs.append(SrcValue->op_begin(), SrcValue->op_end()); + + replaceDstValue(MDNode::get(DstM.getContext(), MDs)); + break; + } + case Module::AppendUnique: { + SmallSetVector Elts; + MDNode *DstValue = cast(DstOp->getOperand(2)); + MDNode *SrcValue = cast(SrcOp->getOperand(2)); + Elts.insert(DstValue->op_begin(), DstValue->op_end()); + Elts.insert(SrcValue->op_begin(), SrcValue->op_end()); + + replaceDstValue(MDNode::get(DstM.getContext(), + makeArrayRef(Elts.begin(), Elts.end()))); + break; + } + } + } + + // Check all of the requirements. + for (unsigned I = 0, E = Requirements.size(); I != E; ++I) { + MDNode *Requirement = Requirements[I]; + MDString *Flag = cast(Requirement->getOperand(0)); + Metadata *ReqValue = Requirement->getOperand(1); + + MDNode *Op = Flags[Flag].first; + if (!Op || Op->getOperand(2) != ReqValue) { + emitError("linking module flags '" + Flag->getString() + + "': does not have the required value"); + continue; + } + } + + return HasError; +} + +// This function returns true if the triples match. +static bool triplesMatch(const Triple &T0, const Triple &T1) { + // If vendor is apple, ignore the version number. + if (T0.getVendor() == Triple::Apple) + return T0.getArch() == T1.getArch() && T0.getSubArch() == T1.getSubArch() && + T0.getVendor() == T1.getVendor() && T0.getOS() == T1.getOS(); + + return T0 == T1; +} + +// This function returns the merged triple. +static std::string mergeTriples(const Triple &SrcTriple, + const Triple &DstTriple) { + // If vendor is apple, pick the triple with the larger version number. + if (SrcTriple.getVendor() == Triple::Apple) + if (DstTriple.isOSVersionLT(SrcTriple)) + return SrcTriple.str(); + + return DstTriple.str(); +} + +bool IRLinker::run() { + // Inherit the target data from the source module if the destination module + // doesn't have one already. + if (DstM.getDataLayout().isDefault()) + DstM.setDataLayout(SrcM.getDataLayout()); + + if (SrcM.getDataLayout() != DstM.getDataLayout()) { + emitWarning("Linking two modules of different data layouts: '" + + SrcM.getModuleIdentifier() + "' is '" + + SrcM.getDataLayoutStr() + "' whereas '" + + DstM.getModuleIdentifier() + "' is '" + + DstM.getDataLayoutStr() + "'\n"); + } + + // Copy the target triple from the source to dest if the dest's is empty. + if (DstM.getTargetTriple().empty() && !SrcM.getTargetTriple().empty()) + DstM.setTargetTriple(SrcM.getTargetTriple()); + + Triple SrcTriple(SrcM.getTargetTriple()), DstTriple(DstM.getTargetTriple()); + + if (!SrcM.getTargetTriple().empty() && !triplesMatch(SrcTriple, DstTriple)) + emitWarning("Linking two modules of different target triples: " + + SrcM.getModuleIdentifier() + "' is '" + SrcM.getTargetTriple() + + "' whereas '" + DstM.getModuleIdentifier() + "' is '" + + DstM.getTargetTriple() + "'\n"); + + DstM.setTargetTriple(mergeTriples(SrcTriple, DstTriple)); + + // Append the module inline asm string. 
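The mergeTriples call just above only special-cases Apple triples: when the two sides differ solely in OS version, the destination keeps whichever triple reports the newer version via Triple::isOSVersionLT. A small hedged sketch of that comparison:

// Sketch: returns true when the source triple should win, e.g. keeping
// "x86_64-apple-macosx10.10.0" over "x86_64-apple-macosx10.9.0".
bool preferSrcTriple(const Triple &SrcTriple, const Triple &DstTriple) {
  return SrcTriple.getVendor() == Triple::Apple &&
         DstTriple.isOSVersionLT(SrcTriple);
}

With the triple settled, run() next concatenates any module-level inline asm, as the code below shows.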
+ if (!SrcM.getModuleInlineAsm().empty()) { + if (DstM.getModuleInlineAsm().empty()) + DstM.setModuleInlineAsm(SrcM.getModuleInlineAsm()); + else + DstM.setModuleInlineAsm(DstM.getModuleInlineAsm() + "\n" + + SrcM.getModuleInlineAsm()); + } + + // Loop over all of the linked values to compute type mappings. + computeTypeMapping(); + + std::reverse(Worklist.begin(), Worklist.end()); + while (!Worklist.empty()) { + GlobalValue *GV = Worklist.back(); + Worklist.pop_back(); + + // Already mapped. + if (ValueMap.find(GV) != ValueMap.end() || + AliasValueMap.find(GV) != AliasValueMap.end()) + continue; + + assert(!GV->isDeclaration()); + MapValue(GV, ValueMap, RF_MoveDistinctMDs, &TypeMap, &GValMaterializer); + if (HasError) + return true; + } + + // Note that we are done linking global value bodies. This prevents + // metadata linking from creating new references. + DoneLinkingBodies = true; + + // Remap all of the named MDNodes in Src into the DstM module. We do this + // after linking GlobalValues so that MDNodes that reference GlobalValues + // are properly remapped. + linkNamedMDNodes(); + + // Merge the module flags into the DstM module. + if (linkModuleFlagsMetadata()) + return true; + + return false; +} + +IRMover::StructTypeKeyInfo::KeyTy::KeyTy(ArrayRef E, bool P) + : ETypes(E), IsPacked(P) {} + +IRMover::StructTypeKeyInfo::KeyTy::KeyTy(const StructType *ST) + : ETypes(ST->elements()), IsPacked(ST->isPacked()) {} + +bool IRMover::StructTypeKeyInfo::KeyTy::operator==(const KeyTy &That) const { + if (IsPacked != That.IsPacked) + return false; + if (ETypes != That.ETypes) + return false; + return true; +} + +bool IRMover::StructTypeKeyInfo::KeyTy::operator!=(const KeyTy &That) const { + return !this->operator==(That); +} + +StructType *IRMover::StructTypeKeyInfo::getEmptyKey() { + return DenseMapInfo::getEmptyKey(); +} + +StructType *IRMover::StructTypeKeyInfo::getTombstoneKey() { + return DenseMapInfo::getTombstoneKey(); +} + +unsigned IRMover::StructTypeKeyInfo::getHashValue(const KeyTy &Key) { + return hash_combine(hash_combine_range(Key.ETypes.begin(), Key.ETypes.end()), + Key.IsPacked); +} + +unsigned IRMover::StructTypeKeyInfo::getHashValue(const StructType *ST) { + return getHashValue(KeyTy(ST)); +} + +bool IRMover::StructTypeKeyInfo::isEqual(const KeyTy &LHS, + const StructType *RHS) { + if (RHS == getEmptyKey() || RHS == getTombstoneKey()) + return false; + return LHS == KeyTy(RHS); +} + +bool IRMover::StructTypeKeyInfo::isEqual(const StructType *LHS, + const StructType *RHS) { + if (RHS == getEmptyKey()) + return LHS == getEmptyKey(); + + if (RHS == getTombstoneKey()) + return LHS == getTombstoneKey(); + + return KeyTy(LHS) == KeyTy(RHS); +} + +void IRMover::IdentifiedStructTypeSet::addNonOpaque(StructType *Ty) { + assert(!Ty->isOpaque()); + NonOpaqueStructTypes.insert(Ty); +} + +void IRMover::IdentifiedStructTypeSet::switchToNonOpaque(StructType *Ty) { + assert(!Ty->isOpaque()); + NonOpaqueStructTypes.insert(Ty); + bool Removed = OpaqueStructTypes.erase(Ty); + (void)Removed; + assert(Removed); +} + +void IRMover::IdentifiedStructTypeSet::addOpaque(StructType *Ty) { + assert(Ty->isOpaque()); + OpaqueStructTypes.insert(Ty); +} + +StructType * +IRMover::IdentifiedStructTypeSet::findNonOpaque(ArrayRef ETypes, + bool IsPacked) { + IRMover::StructTypeKeyInfo::KeyTy Key(ETypes, IsPacked); + auto I = NonOpaqueStructTypes.find_as(Key); + if (I == NonOpaqueStructTypes.end()) + return nullptr; + return *I; +} + +bool IRMover::IdentifiedStructTypeSet::hasType(StructType *Ty) { + if 
(Ty->isOpaque()) + return OpaqueStructTypes.count(Ty); + auto I = NonOpaqueStructTypes.find(Ty); + if (I == NonOpaqueStructTypes.end()) + return false; + return *I == Ty; +} + +IRMover::IRMover(Module &M, DiagnosticHandlerFunction DiagnosticHandler) + : Composite(M), DiagnosticHandler(DiagnosticHandler) { + TypeFinder StructTypes; + StructTypes.run(M, true); + for (StructType *Ty : StructTypes) { + if (Ty->isOpaque()) + IdentifiedStructTypes.addOpaque(Ty); + else + IdentifiedStructTypes.addNonOpaque(Ty); + } +} + +bool IRMover::move( + Module &Src, ArrayRef ValuesToLink, + std::function AddLazyFor) { + IRLinker TheLinker(Composite, IdentifiedStructTypes, Src, DiagnosticHandler, + ValuesToLink, AddLazyFor); + bool RetCode = TheLinker.run(); + Composite.dropTriviallyDeadConstantArrays(); + return RetCode; +} diff --git a/lib/Linker/LinkDiagnosticInfo.h b/lib/Linker/LinkDiagnosticInfo.h new file mode 100644 index 000000000000..d91f19c69aac --- /dev/null +++ b/lib/Linker/LinkDiagnosticInfo.h @@ -0,0 +1,25 @@ +//===- LinkDiagnosticInfo.h -------------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_LINKER_LINK_DIAGNOSTIC_INFO_H +#define LLVM_LIB_LINKER_LINK_DIAGNOSTIC_INFO_H + +#include "llvm/IR/DiagnosticInfo.h" + +namespace llvm { +class LinkDiagnosticInfo : public DiagnosticInfo { + const Twine &Msg; + +public: + LinkDiagnosticInfo(DiagnosticSeverity Severity, const Twine &Msg); + void print(DiagnosticPrinter &DP) const override; +}; +} + +#endif diff --git a/lib/Linker/LinkModules.cpp b/lib/Linker/LinkModules.cpp index 67613967f490..a596697e8f51 100644 --- a/lib/Linker/LinkModules.cpp +++ b/lib/Linker/LinkModules.cpp @@ -12,389 +12,23 @@ //===----------------------------------------------------------------------===// #include "llvm/Linker/Linker.h" +#include "LinkDiagnosticInfo.h" #include "llvm-c/Linker.h" #include "llvm/ADT/SetVector.h" -#include "llvm/ADT/SmallString.h" -#include "llvm/ADT/Triple.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/ADT/StringSet.h" #include "llvm/IR/DiagnosticPrinter.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Module.h" -#include "llvm/IR/TypeFinder.h" -#include "llvm/Transforms/Utils/Cloning.h" using namespace llvm; -//===----------------------------------------------------------------------===// -// TypeMap implementation. -//===----------------------------------------------------------------------===// - namespace { -class TypeMapTy : public ValueMapTypeRemapper { - /// This is a mapping from a source type to a destination type to use. - DenseMap MappedTypes; - - /// When checking to see if two subgraphs are isomorphic, we speculatively - /// add types to MappedTypes, but keep track of them here in case we need to - /// roll back. - SmallVector SpeculativeTypes; - - SmallVector SpeculativeDstOpaqueTypes; - - /// This is a list of non-opaque structs in the source module that are mapped - /// to an opaque struct in the destination module. - SmallVector SrcDefinitionsToResolve; - - /// This is the set of opaque types in the destination modules who are - /// getting a body from the source module. 
- SmallPtrSet DstResolvedOpaqueTypes; - -public: - TypeMapTy(Linker::IdentifiedStructTypeSet &DstStructTypesSet) - : DstStructTypesSet(DstStructTypesSet) {} - - Linker::IdentifiedStructTypeSet &DstStructTypesSet; - /// Indicate that the specified type in the destination module is conceptually - /// equivalent to the specified type in the source module. - void addTypeMapping(Type *DstTy, Type *SrcTy); - - /// Produce a body for an opaque type in the dest module from a type - /// definition in the source module. - void linkDefinedTypeBodies(); - - /// Return the mapped type to use for the specified input type from the - /// source module. - Type *get(Type *SrcTy); - Type *get(Type *SrcTy, SmallPtrSet &Visited); - - void finishType(StructType *DTy, StructType *STy, ArrayRef ETypes); - - FunctionType *get(FunctionType *T) { - return cast(get((Type *)T)); - } - - /// Dump out the type map for debugging purposes. - void dump() const { - for (auto &Pair : MappedTypes) { - dbgs() << "TypeMap: "; - Pair.first->print(dbgs()); - dbgs() << " => "; - Pair.second->print(dbgs()); - dbgs() << '\n'; - } - } - -private: - Type *remapType(Type *SrcTy) override { return get(SrcTy); } - - bool areTypesIsomorphic(Type *DstTy, Type *SrcTy); -}; -} - -void TypeMapTy::addTypeMapping(Type *DstTy, Type *SrcTy) { - assert(SpeculativeTypes.empty()); - assert(SpeculativeDstOpaqueTypes.empty()); - - // Check to see if these types are recursively isomorphic and establish a - // mapping between them if so. - if (!areTypesIsomorphic(DstTy, SrcTy)) { - // Oops, they aren't isomorphic. Just discard this request by rolling out - // any speculative mappings we've established. - for (Type *Ty : SpeculativeTypes) - MappedTypes.erase(Ty); - - SrcDefinitionsToResolve.resize(SrcDefinitionsToResolve.size() - - SpeculativeDstOpaqueTypes.size()); - for (StructType *Ty : SpeculativeDstOpaqueTypes) - DstResolvedOpaqueTypes.erase(Ty); - } else { - for (Type *Ty : SpeculativeTypes) - if (auto *STy = dyn_cast(Ty)) - if (STy->hasName()) - STy->setName(""); - } - SpeculativeTypes.clear(); - SpeculativeDstOpaqueTypes.clear(); -} - -/// Recursively walk this pair of types, returning true if they are isomorphic, -/// false if they are not. -bool TypeMapTy::areTypesIsomorphic(Type *DstTy, Type *SrcTy) { - // Two types with differing kinds are clearly not isomorphic. - if (DstTy->getTypeID() != SrcTy->getTypeID()) - return false; - - // If we have an entry in the MappedTypes table, then we have our answer. - Type *&Entry = MappedTypes[SrcTy]; - if (Entry) - return Entry == DstTy; - - // Two identical types are clearly isomorphic. Remember this - // non-speculatively. - if (DstTy == SrcTy) { - Entry = DstTy; - return true; - } - - // Okay, we have two types with identical kinds that we haven't seen before. - - // If this is an opaque struct type, special case it. - if (StructType *SSTy = dyn_cast(SrcTy)) { - // Mapping an opaque type to any struct, just keep the dest struct. - if (SSTy->isOpaque()) { - Entry = DstTy; - SpeculativeTypes.push_back(SrcTy); - return true; - } - - // Mapping a non-opaque source type to an opaque dest. If this is the first - // type that we're mapping onto this destination type then we succeed. Keep - // the dest, but fill it in later. If this is the second (different) type - // that we're trying to map onto the same opaque type then we fail. - if (cast(DstTy)->isOpaque()) { - // We can only map one source type onto the opaque destination type. 
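The check here enforces that an opaque destination struct may adopt the body of at most one source struct. A hedged sketch of the situation it guards against, with illustrative type names:

// Sketch: %dst is opaque in the destination. The first isomorphic source type
// may be queued to supply its body; a second, different candidate must fail.
void opaqueAdoptionExample(LLVMContext &Ctx) {
  StructType *Dst = StructType::create(Ctx, "dst");                 // opaque
  StructType *A = StructType::create({Type::getInt32Ty(Ctx)}, "a");
  StructType *B = StructType::create({Type::getInt8Ty(Ctx)}, "b");
  // areTypesIsomorphic(Dst, A) can succeed and queue Dst to receive A's body;
  // a later areTypesIsomorphic(Dst, B) must return false for the same Dst.
  (void)A; (void)B;
}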
- if (!DstResolvedOpaqueTypes.insert(cast(DstTy)).second) - return false; - SrcDefinitionsToResolve.push_back(SSTy); - SpeculativeTypes.push_back(SrcTy); - SpeculativeDstOpaqueTypes.push_back(cast(DstTy)); - Entry = DstTy; - return true; - } - } - - // If the number of subtypes disagree between the two types, then we fail. - if (SrcTy->getNumContainedTypes() != DstTy->getNumContainedTypes()) - return false; - - // Fail if any of the extra properties (e.g. array size) of the type disagree. - if (isa(DstTy)) - return false; // bitwidth disagrees. - if (PointerType *PT = dyn_cast(DstTy)) { - if (PT->getAddressSpace() != cast(SrcTy)->getAddressSpace()) - return false; - - } else if (FunctionType *FT = dyn_cast(DstTy)) { - if (FT->isVarArg() != cast(SrcTy)->isVarArg()) - return false; - } else if (StructType *DSTy = dyn_cast(DstTy)) { - StructType *SSTy = cast(SrcTy); - if (DSTy->isLiteral() != SSTy->isLiteral() || - DSTy->isPacked() != SSTy->isPacked()) - return false; - } else if (ArrayType *DATy = dyn_cast(DstTy)) { - if (DATy->getNumElements() != cast(SrcTy)->getNumElements()) - return false; - } else if (VectorType *DVTy = dyn_cast(DstTy)) { - if (DVTy->getNumElements() != cast(SrcTy)->getNumElements()) - return false; - } - - // Otherwise, we speculate that these two types will line up and recursively - // check the subelements. - Entry = DstTy; - SpeculativeTypes.push_back(SrcTy); - - for (unsigned I = 0, E = SrcTy->getNumContainedTypes(); I != E; ++I) - if (!areTypesIsomorphic(DstTy->getContainedType(I), - SrcTy->getContainedType(I))) - return false; - - // If everything seems to have lined up, then everything is great. - return true; -} - -void TypeMapTy::linkDefinedTypeBodies() { - SmallVector Elements; - for (StructType *SrcSTy : SrcDefinitionsToResolve) { - StructType *DstSTy = cast(MappedTypes[SrcSTy]); - assert(DstSTy->isOpaque()); - - // Map the body of the source type over to a new body for the dest type. - Elements.resize(SrcSTy->getNumElements()); - for (unsigned I = 0, E = Elements.size(); I != E; ++I) - Elements[I] = get(SrcSTy->getElementType(I)); - - DstSTy->setBody(Elements, SrcSTy->isPacked()); - DstStructTypesSet.switchToNonOpaque(DstSTy); - } - SrcDefinitionsToResolve.clear(); - DstResolvedOpaqueTypes.clear(); -} - -void TypeMapTy::finishType(StructType *DTy, StructType *STy, - ArrayRef ETypes) { - DTy->setBody(ETypes, STy->isPacked()); - - // Steal STy's name. - if (STy->hasName()) { - SmallString<16> TmpName = STy->getName(); - STy->setName(""); - DTy->setName(TmpName); - } - - DstStructTypesSet.addNonOpaque(DTy); -} - -Type *TypeMapTy::get(Type *Ty) { - SmallPtrSet Visited; - return get(Ty, Visited); -} - -Type *TypeMapTy::get(Type *Ty, SmallPtrSet &Visited) { - // If we already have an entry for this type, return it. - Type **Entry = &MappedTypes[Ty]; - if (*Entry) - return *Entry; - - // These are types that LLVM itself will unique. - bool IsUniqued = !isa(Ty) || cast(Ty)->isLiteral(); - -#ifndef NDEBUG - if (!IsUniqued) { - for (auto &Pair : MappedTypes) { - assert(!(Pair.first != Ty && Pair.second == Ty) && - "mapping to a source type"); - } - } -#endif - - if (!IsUniqued && !Visited.insert(cast(Ty)).second) { - StructType *DTy = StructType::create(Ty->getContext()); - return *Entry = DTy; - } - - // If this is not a recursive type, then just map all of the elements and - // then rebuild the type from inside out. - SmallVector ElementTypes; - - // If there are no element types to map, then the type is itself. 
This is - // true for the anonymous {} struct, things like 'float', integers, etc. - if (Ty->getNumContainedTypes() == 0 && IsUniqued) - return *Entry = Ty; - - // Remap all of the elements, keeping track of whether any of them change. - bool AnyChange = false; - ElementTypes.resize(Ty->getNumContainedTypes()); - for (unsigned I = 0, E = Ty->getNumContainedTypes(); I != E; ++I) { - ElementTypes[I] = get(Ty->getContainedType(I), Visited); - AnyChange |= ElementTypes[I] != Ty->getContainedType(I); - } - - // If we found our type while recursively processing stuff, just use it. - Entry = &MappedTypes[Ty]; - if (*Entry) { - if (auto *DTy = dyn_cast(*Entry)) { - if (DTy->isOpaque()) { - auto *STy = cast(Ty); - finishType(DTy, STy, ElementTypes); - } - } - return *Entry; - } - - // If all of the element types mapped directly over and the type is not - // a nomed struct, then the type is usable as-is. - if (!AnyChange && IsUniqued) - return *Entry = Ty; - - // Otherwise, rebuild a modified type. - switch (Ty->getTypeID()) { - default: - llvm_unreachable("unknown derived type to remap"); - case Type::ArrayTyID: - return *Entry = ArrayType::get(ElementTypes[0], - cast(Ty)->getNumElements()); - case Type::VectorTyID: - return *Entry = VectorType::get(ElementTypes[0], - cast(Ty)->getNumElements()); - case Type::PointerTyID: - return *Entry = PointerType::get(ElementTypes[0], - cast(Ty)->getAddressSpace()); - case Type::FunctionTyID: - return *Entry = FunctionType::get(ElementTypes[0], - makeArrayRef(ElementTypes).slice(1), - cast(Ty)->isVarArg()); - case Type::StructTyID: { - auto *STy = cast(Ty); - bool IsPacked = STy->isPacked(); - if (IsUniqued) - return *Entry = StructType::get(Ty->getContext(), ElementTypes, IsPacked); - - // If the type is opaque, we can just use it directly. - if (STy->isOpaque()) { - DstStructTypesSet.addOpaque(STy); - return *Entry = Ty; - } - - if (StructType *OldT = - DstStructTypesSet.findNonOpaque(ElementTypes, IsPacked)) { - STy->setName(""); - return *Entry = OldT; - } - - if (!AnyChange) { - DstStructTypesSet.addNonOpaque(STy); - return *Entry = Ty; - } - - StructType *DTy = StructType::create(Ty->getContext()); - finishType(DTy, STy, ElementTypes); - return *Entry = DTy; - } - } -} - -//===----------------------------------------------------------------------===// -// ModuleLinker implementation. -//===----------------------------------------------------------------------===// - -namespace { -class ModuleLinker; - -/// Creates prototypes for functions that are lazily linked on the fly. This -/// speeds up linking for modules with many/ lazily linked functions of which -/// few get used. -class ValueMaterializerTy final : public ValueMaterializer { - ModuleLinker *ModLinker; - -public: - ValueMaterializerTy(ModuleLinker *ModLinker) : ModLinker(ModLinker) {} - - Value *materializeDeclFor(Value *V) override; - void materializeInitFor(GlobalValue *New, GlobalValue *Old) override; -}; - -class LinkDiagnosticInfo : public DiagnosticInfo { - const Twine &Msg; - -public: - LinkDiagnosticInfo(DiagnosticSeverity Severity, const Twine &Msg); - void print(DiagnosticPrinter &DP) const override; -}; -LinkDiagnosticInfo::LinkDiagnosticInfo(DiagnosticSeverity Severity, - const Twine &Msg) - : DiagnosticInfo(DK_Linker, Severity), Msg(Msg) {} -void LinkDiagnosticInfo::print(DiagnosticPrinter &DP) const { DP << Msg; } /// This is an implementation class for the LinkModules function, which is the /// entrypoint for this file. 
 class ModuleLinker {
-  Module &DstM;
+  IRMover &Mover;
   Module &SrcM;
 
-  TypeMapTy TypeMap;
-  ValueMaterializerTy ValMaterializer;
-
-  /// Mapping of values from what they used to be in Src, to what they are now
-  /// in DstM. ValueToValueMapTy is a ValueMap, which involves some overhead
-  /// due to the use of Value handles which the Linker doesn't actually need,
-  /// but this allows us to reuse the ValueMapper code.
-  ValueToValueMapTy ValueMap;
-
   SetVector<GlobalValue *> ValuesToLink;
-
-  DiagnosticHandlerFunction DiagnosticHandler;
+  StringSet<> Internalize;
 
   /// For symbol clashes, prefer those from Src.
   unsigned Flags;
@@ -413,64 +47,30 @@ class ModuleLinker {
   /// as part of a different backend compilation process.
   bool HasExportedFunctions = false;
 
-  /// Set to true when all global value body linking is complete (including
-  /// lazy linking). Used to prevent metadata linking from creating new
-  /// references.
-  bool DoneLinkingBodies = false;
-
-  bool HasError = false;
-
-public:
-  ModuleLinker(Module &DstM, Linker::IdentifiedStructTypeSet &Set, Module &SrcM,
-               DiagnosticHandlerFunction DiagnosticHandler, unsigned Flags,
-               const FunctionInfoIndex *Index = nullptr,
-               DenseSet<const GlobalValue *> *FunctionsToImport = nullptr)
-      : DstM(DstM), SrcM(SrcM), TypeMap(Set), ValMaterializer(this),
-        DiagnosticHandler(DiagnosticHandler), Flags(Flags), ImportIndex(Index),
-        ImportFunction(FunctionsToImport) {
-    assert((ImportIndex || !ImportFunction) &&
-           "Expect a FunctionInfoIndex when importing");
-    // If we have a FunctionInfoIndex but no function to import,
-    // then this is the primary module being compiled in a ThinLTO
-    // backend compilation, and we need to see if it has functions that
-    // may be exported to another backend compilation.
-    if (ImportIndex && !ImportFunction)
-      HasExportedFunctions = ImportIndex->hasExportedFunctions(SrcM);
-  }
-
-  bool run();
-  Value *materializeDeclFor(Value *V);
-  void materializeInitFor(GlobalValue *New, GlobalValue *Old);
+  /// Used as the callback for lazy linking.
+  /// The mover has just hit GV and we have to decide if it, and other members
+  /// of the same comdat, should be linked. Every member to be linked is passed
+  /// to Add.
+  void addLazyFor(GlobalValue &GV, IRMover::ValueAdder Add);
 
-private:
   bool shouldOverrideFromSrc() { return Flags & Linker::OverrideFromSrc; }
   bool shouldLinkOnlyNeeded() { return Flags & Linker::LinkOnlyNeeded; }
   bool shouldInternalizeLinkedSymbols() {
     return Flags & Linker::InternalizeLinkedSymbols;
   }
 
-  /// Handles cloning of a global values from the source module into
-  /// the destination module, including setting the attributes and visibility.
-  GlobalValue *copyGlobalValueProto(TypeMapTy &TypeMap, const GlobalValue *SGV,
-                                    const GlobalValue *DGV, bool ForDefinition);
-
   /// Check if we should promote the given local value to global scope.
   bool doPromoteLocalToGlobal(const GlobalValue *SGV);
   bool shouldLinkFromSource(bool &LinkFromSrc, const GlobalValue &Dest,
                             const GlobalValue &Src);
 
-  /// Helper method for setting a message and returning an error code.
+  /// Should we have mover and linker error diag info?
bool emitError(const Twine &Message) { - DiagnosticHandler(LinkDiagnosticInfo(DS_Error, Message)); - HasError = true; + Mover.getDiagnosticHandler()(LinkDiagnosticInfo(DS_Error, Message)); return true; } - void emitWarning(const Twine &Message) { - DiagnosticHandler(LinkDiagnosticInfo(DS_Warning, Message)); - } - bool getComdatLeader(Module &M, StringRef ComdatName, const GlobalVariable *&GVar); bool computeResultingSelectionKind(StringRef ComdatName, @@ -488,6 +88,7 @@ class ModuleLinker { /// Given a global in the source module, return the global in the /// destination module that is being linked to, if any. GlobalValue *getLinkedToGlobal(const GlobalValue *SrcGV) { + Module &DstM = Mover.getModule(); // If the source has no name it can't link. If it has local linkage, // there is no name match-up going on. if (!SrcGV->hasName() || GlobalValue::isLocalLinkage(getLinkage(SrcGV))) @@ -507,29 +108,7 @@ class ModuleLinker { return DGV; } - void computeTypeMapping(); - - void upgradeMismatchedGlobalArray(StringRef Name); - void upgradeMismatchedGlobals(); - bool linkIfNeeded(GlobalValue &GV); - bool linkAppendingVarProto(GlobalVariable *DstGV, - const GlobalVariable *SrcGV); - - bool linkGlobalValueProto(GlobalValue *GV); - bool linkModuleFlagsMetadata(); - - void linkGlobalInit(GlobalVariable &Dst, GlobalVariable &Src); - bool linkFunctionBody(Function &Dst, Function &Src); - void linkAliasBody(GlobalAlias &Dst, GlobalAlias &Src); - bool linkGlobalValueBody(GlobalValue &Dst, GlobalValue &Src); - - /// Functions that take care of cloning a specific global value type - /// into the destination module. - GlobalVariable *copyGlobalVariableProto(TypeMapTy &TypeMap, - const GlobalVariable *SGVar); - Function *copyFunctionProto(TypeMapTy &TypeMap, const Function *SF); - GlobalValue *copyGlobalAliasProto(TypeMapTy &TypeMap, const GlobalAlias *SGA); /// Helper methods to check if we are importing from or potentially /// exporting from the current source module. @@ -545,6 +124,13 @@ class ModuleLinker { /// a local that is being promoted to global scope. std::string getName(const GlobalValue *SGV); + /// Process globals so that they can be used in ThinLTO. This includes + /// promoting local variables so that they can be reference externally by + /// thin lto imported globals and converting strong external globals to + /// available_externally. + void processGlobalsForThinLTO(); + void processGlobalForThinLTO(GlobalValue &GV); + /// Get the new linkage for SGV that should be used in the linked destination /// module. Specifically, for ThinLTO importing or exporting it may need /// to be adjusted. @@ -560,7 +146,23 @@ class ModuleLinker { void setVisibility(GlobalValue *NewGV, const GlobalValue *SGV, const GlobalValue *DGV = nullptr); - void linkNamedMDNodes(); +public: + ModuleLinker(IRMover &Mover, Module &SrcM, unsigned Flags, + const FunctionInfoIndex *Index = nullptr, + DenseSet *FunctionsToImport = nullptr) + : Mover(Mover), SrcM(SrcM), Flags(Flags), ImportIndex(Index), + ImportFunction(FunctionsToImport) { + assert((ImportIndex || !ImportFunction) && + "Expect a FunctionInfoIndex when importing"); + // If we have a FunctionInfoIndex but no function to import, + // then this is the primary module being compiled in a ThinLTO + // backend compilation, and we need to see if it has functions that + // may be exported to another backend compilation. 
+ if (ImportIndex && !ImportFunction) + HasExportedFunctions = ImportIndex->hasExportedFunctions(SrcM); + } + + bool run(); }; } @@ -727,8 +329,10 @@ GlobalValue::LinkageTypes ModuleLinker::getLinkage(const GlobalValue *SGV) { // It would be incorrect to import an appending linkage variable, // since it would cause global constructors/destructors to be // executed multiple times. This should have already been handled - // by linkGlobalValueProto. - llvm_unreachable("Cannot import appending linkage variable"); + // by linkIfNeeded, and we will assert in shouldLinkFromSource + // if we try to import, so we simply return AppendingLinkage here + // as this helper is called more widely in getLinkedToGlobal. + return GlobalValue::AppendingLinkage; case GlobalValue::InternalLinkage: case GlobalValue::PrivateLinkage: @@ -759,44 +363,6 @@ GlobalValue::LinkageTypes ModuleLinker::getLinkage(const GlobalValue *SGV) { llvm_unreachable("unknown linkage type"); } -/// Loop through the global variables in the src module and merge them into the -/// dest module. -GlobalVariable * -ModuleLinker::copyGlobalVariableProto(TypeMapTy &TypeMap, - const GlobalVariable *SGVar) { - // No linking to be performed or linking from the source: simply create an - // identical version of the symbol over in the dest module... the - // initializer will be filled in later by LinkGlobalInits. - GlobalVariable *NewDGV = - new GlobalVariable(DstM, TypeMap.get(SGVar->getType()->getElementType()), - SGVar->isConstant(), GlobalValue::ExternalLinkage, - /*init*/ nullptr, getName(SGVar), - /*insertbefore*/ nullptr, SGVar->getThreadLocalMode(), - SGVar->getType()->getAddressSpace()); - - return NewDGV; -} - -/// Link the function in the source module into the destination module if -/// needed, setting up mapping information. -Function *ModuleLinker::copyFunctionProto(TypeMapTy &TypeMap, - const Function *SF) { - // If there is no linkage to be performed or we are linking from the source, - // bring SF over. - return Function::Create(TypeMap.get(SF->getFunctionType()), - GlobalValue::ExternalLinkage, getName(SF), &DstM); -} - -/// Set up prototypes for any aliases that come over from the source module. -GlobalValue *ModuleLinker::copyGlobalAliasProto(TypeMapTy &TypeMap, - const GlobalAlias *SGA) { - // If there is no linkage to be performed or we're linking from the source, - // bring over SGA. 
- auto *Ty = TypeMap.get(SGA->getValueType()); - return GlobalAlias::create(Ty, SGA->getType()->getPointerAddressSpace(), - GlobalValue::ExternalLinkage, getName(SGA), &DstM); -} - static GlobalValue::VisibilityTypes getMinVisibility(GlobalValue::VisibilityTypes A, GlobalValue::VisibilityTypes B) { @@ -820,86 +386,6 @@ void ModuleLinker::setVisibility(GlobalValue *NewGV, const GlobalValue *SGV, NewGV->setVisibility(Visibility); } -GlobalValue *ModuleLinker::copyGlobalValueProto(TypeMapTy &TypeMap, - const GlobalValue *SGV, - const GlobalValue *DGV, - bool ForDefinition) { - GlobalValue *NewGV; - if (auto *SGVar = dyn_cast(SGV)) { - NewGV = copyGlobalVariableProto(TypeMap, SGVar); - } else if (auto *SF = dyn_cast(SGV)) { - NewGV = copyFunctionProto(TypeMap, SF); - } else { - if (ForDefinition) - NewGV = copyGlobalAliasProto(TypeMap, cast(SGV)); - else - NewGV = new GlobalVariable( - DstM, TypeMap.get(SGV->getType()->getElementType()), - /*isConstant*/ false, GlobalValue::ExternalLinkage, - /*init*/ nullptr, getName(SGV), - /*insertbefore*/ nullptr, SGV->getThreadLocalMode(), - SGV->getType()->getAddressSpace()); - } - - if (ForDefinition) - NewGV->setLinkage(getLinkage(SGV)); - else if (SGV->hasAvailableExternallyLinkage() || SGV->hasWeakLinkage() || - SGV->hasLinkOnceLinkage()) - NewGV->setLinkage(GlobalValue::ExternalWeakLinkage); - - copyGVAttributes(NewGV, SGV); - setVisibility(NewGV, SGV, DGV); - return NewGV; -} - -Value *ValueMaterializerTy::materializeDeclFor(Value *V) { - return ModLinker->materializeDeclFor(V); -} - -Value *ModuleLinker::materializeDeclFor(Value *V) { - auto *SGV = dyn_cast(V); - if (!SGV) - return nullptr; - - linkGlobalValueProto(SGV); - return ValueMap[SGV]; -} - -void ValueMaterializerTy::materializeInitFor(GlobalValue *New, - GlobalValue *Old) { - return ModLinker->materializeInitFor(New, Old); -} - -static bool shouldLazyLink(const GlobalValue &GV) { - return GV.hasLocalLinkage() || GV.hasLinkOnceLinkage() || - GV.hasAvailableExternallyLinkage(); -} - -void ModuleLinker::materializeInitFor(GlobalValue *New, GlobalValue *Old) { - if (auto *F = dyn_cast(New)) { - if (!F->isDeclaration()) - return; - } else if (auto *V = dyn_cast(New)) { - if (V->hasInitializer()) - return; - } else { - auto *A = cast(New); - if (A->getAliasee()) - return; - } - - if (Old->isDeclaration()) - return; - - if (isPerformingImport() && !doImportAsDefinition(Old)) - return; - - if (!ValuesToLink.count(Old) && !shouldLazyLink(*Old)) - return; - - linkGlobalValueBody(*New, *Old); -} - bool ModuleLinker::getComdatLeader(Module &M, StringRef ComdatName, const GlobalVariable *&GVar) { const GlobalValue *GVal = M.getNamedValue(ComdatName); @@ -925,6 +411,7 @@ bool ModuleLinker::computeResultingSelectionKind(StringRef ComdatName, Comdat::SelectionKind Dst, Comdat::SelectionKind &Result, bool &LinkFromSrc) { + Module &DstM = Mover.getModule(); // The ability to mix Comdat::SelectionKind::Any with // Comdat::SelectionKind::Largest is a behavior that comes from COFF. 
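computeResultingSelectionKind has to reconcile the selection kinds of two comdats with the same name, one from each module. For reference, a comdat is attached to a global roughly as follows (a hedged sketch, not the linker's own code):

// Sketch: create or look up a comdat named after the global and put the global
// in it; the selection kind tells the linker how duplicate keys are resolved.
void putInAnyComdat(Module &M, GlobalObject &GO) {
  Comdat *C = M.getOrInsertComdat(GO.getName());
  C->setSelectionKind(Comdat::Any);
  GO.setComdat(C);
}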
bool DstAnyOrLargest = Dst == Comdat::SelectionKind::Any || @@ -992,6 +479,7 @@ bool ModuleLinker::computeResultingSelectionKind(StringRef ComdatName, bool ModuleLinker::getComdatResult(const Comdat *SrcC, Comdat::SelectionKind &Result, bool &LinkFromSrc) { + Module &DstM = Mover.getModule(); Comdat::SelectionKind SSK = SrcC->getSelectionKind(); StringRef ComdatName = SrcC->getName(); Module::ComdatSymTabType &ComdatSymTab = DstM.getComdatSymbolTable(); @@ -1021,8 +509,7 @@ bool ModuleLinker::shouldLinkFromSource(bool &LinkFromSrc, // We always have to add Src if it has appending linkage. if (Src.hasAppendingLinkage()) { - // Caller should have already determined that we can't link from source - // when importing (see comments in linkGlobalValueProto). + // Should have prevented importing for appending linkage in linkIfNeeded. assert(!isPerformingImport()); LinkFromSrc = true; return false; @@ -1062,7 +549,12 @@ bool ModuleLinker::shouldLinkFromSource(bool &LinkFromSrc, return false; } // If the Dest is weak, use the source linkage. - LinkFromSrc = Dest.hasExternalWeakLinkage(); + if (Dest.hasExternalWeakLinkage()) { + LinkFromSrc = true; + return false; + } + // Link an available_externally over a declaration. + LinkFromSrc = !Src.isDeclaration() && Dest.isDeclaration(); return false; } @@ -1117,644 +609,6 @@ bool ModuleLinker::shouldLinkFromSource(bool &LinkFromSrc, "': symbol multiply defined!"); } -/// Loop over all of the linked values to compute type mappings. For example, -/// if we link "extern Foo *x" and "Foo *x = NULL", then we have two struct -/// types 'Foo' but one got renamed when the module was loaded into the same -/// LLVMContext. -void ModuleLinker::computeTypeMapping() { - for (GlobalValue &SGV : SrcM.globals()) { - GlobalValue *DGV = getLinkedToGlobal(&SGV); - if (!DGV) - continue; - - if (!DGV->hasAppendingLinkage() || !SGV.hasAppendingLinkage()) { - TypeMap.addTypeMapping(DGV->getType(), SGV.getType()); - continue; - } - - // Unify the element type of appending arrays. - ArrayType *DAT = cast(DGV->getType()->getElementType()); - ArrayType *SAT = cast(SGV.getType()->getElementType()); - TypeMap.addTypeMapping(DAT->getElementType(), SAT->getElementType()); - } - - for (GlobalValue &SGV : SrcM) { - if (GlobalValue *DGV = getLinkedToGlobal(&SGV)) - TypeMap.addTypeMapping(DGV->getType(), SGV.getType()); - } - - for (GlobalValue &SGV : SrcM.aliases()) { - if (GlobalValue *DGV = getLinkedToGlobal(&SGV)) - TypeMap.addTypeMapping(DGV->getType(), SGV.getType()); - } - - // Incorporate types by name, scanning all the types in the source module. - // At this point, the destination module may have a type "%foo = { i32 }" for - // example. When the source module got loaded into the same LLVMContext, if - // it had the same type, it would have been renamed to "%foo.42 = { i32 }". - std::vector Types = SrcM.getIdentifiedStructTypes(); - for (StructType *ST : Types) { - if (!ST->hasName()) - continue; - - // Check to see if there is a dot in the name followed by a digit. - size_t DotPos = ST->getName().rfind('.'); - if (DotPos == 0 || DotPos == StringRef::npos || - ST->getName().back() == '.' || - !isdigit(static_cast(ST->getName()[DotPos + 1]))) - continue; - - // Check to see if the destination module has a struct with the prefix name. - StructType *DST = DstM.getTypeByName(ST->getName().substr(0, DotPos)); - if (!DST) - continue; - - // Don't use it if this actually came from the source module. They're in - // the same LLVMContext after all. 
Also don't use it unless the type is - // actually used in the destination module. This can happen in situations - // like this: - // - // Module A Module B - // -------- -------- - // %Z = type { %A } %B = type { %C.1 } - // %A = type { %B.1, [7 x i8] } %C.1 = type { i8* } - // %B.1 = type { %C } %A.2 = type { %B.3, [5 x i8] } - // %C = type { i8* } %B.3 = type { %C.1 } - // - // When we link Module B with Module A, the '%B' in Module B is - // used. However, that would then use '%C.1'. But when we process '%C.1', - // we prefer to take the '%C' version. So we are then left with both - // '%C.1' and '%C' being used for the same types. This leads to some - // variables using one type and some using the other. - if (TypeMap.DstStructTypesSet.hasType(DST)) - TypeMap.addTypeMapping(DST, ST); - } - - // Now that we have discovered all of the type equivalences, get a body for - // any 'opaque' types in the dest module that are now resolved. - TypeMap.linkDefinedTypeBodies(); -} - -static void upgradeGlobalArray(GlobalVariable *GV) { - ArrayType *ATy = cast(GV->getType()->getElementType()); - StructType *OldTy = cast(ATy->getElementType()); - assert(OldTy->getNumElements() == 2 && "Expected to upgrade from 2 elements"); - - // Get the upgraded 3 element type. - PointerType *VoidPtrTy = Type::getInt8Ty(GV->getContext())->getPointerTo(); - Type *Tys[3] = {OldTy->getElementType(0), OldTy->getElementType(1), - VoidPtrTy}; - StructType *NewTy = StructType::get(GV->getContext(), Tys, false); - - // Build new constants with a null third field filled in. - Constant *OldInitC = GV->getInitializer(); - ConstantArray *OldInit = dyn_cast(OldInitC); - if (!OldInit && !isa(OldInitC)) - // Invalid initializer; give up. - return; - std::vector Initializers; - if (OldInit && OldInit->getNumOperands()) { - Value *Null = Constant::getNullValue(VoidPtrTy); - for (Use &U : OldInit->operands()) { - ConstantStruct *Init = cast(U.get()); - Initializers.push_back(ConstantStruct::get( - NewTy, Init->getOperand(0), Init->getOperand(1), Null, nullptr)); - } - } - assert(Initializers.size() == ATy->getNumElements() && - "Failed to copy all array elements"); - - // Replace the old GV with a new one. - ATy = ArrayType::get(NewTy, Initializers.size()); - Constant *NewInit = ConstantArray::get(ATy, Initializers); - GlobalVariable *NewGV = new GlobalVariable( - *GV->getParent(), ATy, GV->isConstant(), GV->getLinkage(), NewInit, "", - GV, GV->getThreadLocalMode(), GV->getType()->getAddressSpace(), - GV->isExternallyInitialized()); - NewGV->copyAttributesFrom(GV); - NewGV->takeName(GV); - assert(GV->use_empty() && "program cannot use initializer list"); - GV->eraseFromParent(); -} - -void ModuleLinker::upgradeMismatchedGlobalArray(StringRef Name) { - // Look for the global arrays. - auto *DstGV = dyn_cast_or_null(DstM.getNamedValue(Name)); - if (!DstGV) - return; - auto *SrcGV = dyn_cast_or_null(SrcM.getNamedValue(Name)); - if (!SrcGV) - return; - - // Check if the types already match. - auto *DstTy = cast(DstGV->getType()->getElementType()); - auto *SrcTy = - cast(TypeMap.get(SrcGV->getType()->getElementType())); - if (DstTy == SrcTy) - return; - - // Grab the element types. We can only upgrade an array of a two-field - // struct. Only bother if the other one has three-fields. 
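The code removed here reconciled the two historical element layouts of llvm.global_ctors and llvm.global_dtors: the old two-field { i32, void ()* } form and the newer three-field form that adds an associated-data pointer. A hedged sketch of the newer element type, using a hypothetical helper:

// Sketch: the upgraded ctor/dtor element is { i32 priority, void ()* fn, i8* data },
// with a null i8* used when no associated global is present.
StructType *getUpgradedCtorEltTy(LLVMContext &Ctx) {
  Type *VoidFnPtrTy =
      FunctionType::get(Type::getVoidTy(Ctx), /*isVarArg=*/false)->getPointerTo();
  Type *Fields[] = {Type::getInt32Ty(Ctx), VoidFnPtrTy, Type::getInt8PtrTy(Ctx)};
  return StructType::get(Ctx, Fields, /*isPacked=*/false);
}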
- auto *DstEltTy = cast(DstTy->getElementType()); - auto *SrcEltTy = cast(SrcTy->getElementType()); - if (DstEltTy->getNumElements() == 2 && SrcEltTy->getNumElements() == 3) { - upgradeGlobalArray(DstGV); - return; - } - if (DstEltTy->getNumElements() == 3 && SrcEltTy->getNumElements() == 2) - upgradeGlobalArray(SrcGV); - - // We can't upgrade any other differences. -} - -void ModuleLinker::upgradeMismatchedGlobals() { - upgradeMismatchedGlobalArray("llvm.global_ctors"); - upgradeMismatchedGlobalArray("llvm.global_dtors"); -} - -static void getArrayElements(const Constant *C, - SmallVectorImpl &Dest) { - unsigned NumElements = cast(C->getType())->getNumElements(); - - for (unsigned i = 0; i != NumElements; ++i) - Dest.push_back(C->getAggregateElement(i)); -} - -/// If there were any appending global variables, link them together now. -/// Return true on error. -bool ModuleLinker::linkAppendingVarProto(GlobalVariable *DstGV, - const GlobalVariable *SrcGV) { - ArrayType *SrcTy = - cast(TypeMap.get(SrcGV->getType()->getElementType())); - Type *EltTy = SrcTy->getElementType(); - - if (DstGV) { - ArrayType *DstTy = cast(DstGV->getType()->getElementType()); - - if (!SrcGV->hasAppendingLinkage() || !DstGV->hasAppendingLinkage()) - return emitError( - "Linking globals named '" + SrcGV->getName() + - "': can only link appending global with another appending global!"); - - // Check to see that they two arrays agree on type. - if (EltTy != DstTy->getElementType()) - return emitError("Appending variables with different element types!"); - if (DstGV->isConstant() != SrcGV->isConstant()) - return emitError("Appending variables linked with different const'ness!"); - - if (DstGV->getAlignment() != SrcGV->getAlignment()) - return emitError( - "Appending variables with different alignment need to be linked!"); - - if (DstGV->getVisibility() != SrcGV->getVisibility()) - return emitError( - "Appending variables with different visibility need to be linked!"); - - if (DstGV->hasUnnamedAddr() != SrcGV->hasUnnamedAddr()) - return emitError( - "Appending variables with different unnamed_addr need to be linked!"); - - if (StringRef(DstGV->getSection()) != SrcGV->getSection()) - return emitError( - "Appending variables with different section name need to be linked!"); - } - - SmallVector DstElements; - if (DstGV) - getArrayElements(DstGV->getInitializer(), DstElements); - - SmallVector SrcElements; - getArrayElements(SrcGV->getInitializer(), SrcElements); - - StringRef Name = SrcGV->getName(); - bool IsNewStructor = - (Name == "llvm.global_ctors" || Name == "llvm.global_dtors") && - cast(EltTy)->getNumElements() == 3; - if (IsNewStructor) - SrcElements.erase( - std::remove_if(SrcElements.begin(), SrcElements.end(), - [this](Constant *E) { - auto *Key = dyn_cast( - E->getAggregateElement(2)->stripPointerCasts()); - return Key && !ValuesToLink.count(Key) && - !shouldLazyLink(*Key); - }), - SrcElements.end()); - uint64_t NewSize = DstElements.size() + SrcElements.size(); - ArrayType *NewType = ArrayType::get(EltTy, NewSize); - - // Create the new global variable. - GlobalVariable *NG = new GlobalVariable( - DstM, NewType, SrcGV->isConstant(), SrcGV->getLinkage(), - /*init*/ nullptr, /*name*/ "", DstGV, SrcGV->getThreadLocalMode(), - SrcGV->getType()->getAddressSpace()); - - // Propagate alignment, visibility and section info. - copyGVAttributes(NG, SrcGV); - - // Replace any uses of the two global variables with uses of the new - // global. 
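Appending linkage is the one case where the linker concatenates definitions instead of choosing between them, which is why entries from both modules survive into the rebuilt array. A rough sketch of how such a global is declared in the first place (illustrative, not the linker's code):

// Sketch: an appending-linkage array; the IR linker merges these by building a
// new array that holds the destination elements followed by the source ones.
GlobalVariable *makeAppendingArray(Module &M, ArrayType *ATy, Constant *Init) {
  return new GlobalVariable(M, ATy, /*isConstant=*/false,
                            GlobalValue::AppendingLinkage, Init,
                            "llvm.global_ctors");
}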
- ValueMap[SrcGV] = ConstantExpr::getBitCast(NG, TypeMap.get(SrcGV->getType())); - - for (auto *V : SrcElements) { - DstElements.push_back( - MapValue(V, ValueMap, RF_MoveDistinctMDs, &TypeMap, &ValMaterializer)); - } - - NG->setInitializer(ConstantArray::get(NewType, DstElements)); - - if (DstGV) { - DstGV->replaceAllUsesWith(ConstantExpr::getBitCast(NG, DstGV->getType())); - DstGV->eraseFromParent(); - } - - return false; -} - -bool ModuleLinker::linkGlobalValueProto(GlobalValue *SGV) { - GlobalValue *DGV = getLinkedToGlobal(SGV); - - // Handle the ultra special appending linkage case first. - assert(!DGV || SGV->hasAppendingLinkage() == DGV->hasAppendingLinkage()); - if (SGV->hasAppendingLinkage()) - return linkAppendingVarProto(cast_or_null(DGV), - cast(SGV)); - - bool LinkFromSrc = true; - Comdat *C = nullptr; - bool HasUnnamedAddr = SGV->hasUnnamedAddr(); - - if (isPerformingImport() && !doImportAsDefinition(SGV)) { - LinkFromSrc = false; - } else if (const Comdat *SC = SGV->getComdat()) { - Comdat::SelectionKind SK; - std::tie(SK, LinkFromSrc) = ComdatsChosen[SC]; - C = DstM.getOrInsertComdat(SC->getName()); - C->setSelectionKind(SK); - if (SGV->hasLocalLinkage()) - LinkFromSrc = true; - } else if (DGV) { - if (shouldLinkFromSource(LinkFromSrc, *DGV, *SGV)) - return true; - } - - if (!LinkFromSrc && DGV) { - // Make sure to remember this mapping. - ValueMap[SGV] = ConstantExpr::getBitCast(DGV, TypeMap.get(SGV->getType())); - } - - if (DGV) - HasUnnamedAddr = HasUnnamedAddr && DGV->hasUnnamedAddr(); - - GlobalValue *NewGV; - if (!LinkFromSrc && DGV) { - NewGV = DGV; - // When linking from source we setVisibility from copyGlobalValueProto. - setVisibility(NewGV, SGV, DGV); - } else { - // If we are done linking global value bodies (i.e. we are performing - // metadata linking), don't link in the global value due to this - // reference, simply map it to null. - if (DoneLinkingBodies) - return false; - - NewGV = copyGlobalValueProto(TypeMap, SGV, DGV, LinkFromSrc); - } - - NewGV->setUnnamedAddr(HasUnnamedAddr); - - if (auto *NewGO = dyn_cast(NewGV)) { - if (C && LinkFromSrc) - NewGO->setComdat(C); - - if (DGV && DGV->hasCommonLinkage() && SGV->hasCommonLinkage()) - NewGO->setAlignment(std::max(DGV->getAlignment(), SGV->getAlignment())); - } - - if (auto *NewGVar = dyn_cast(NewGV)) { - auto *DGVar = dyn_cast_or_null(DGV); - auto *SGVar = dyn_cast(SGV); - if (DGVar && SGVar && DGVar->isDeclaration() && SGVar->isDeclaration() && - (!DGVar->isConstant() || !SGVar->isConstant())) - NewGVar->setConstant(false); - } - - // Make sure to remember this mapping. - if (NewGV != DGV) { - if (DGV) { - DGV->replaceAllUsesWith(ConstantExpr::getBitCast(NewGV, DGV->getType())); - DGV->eraseFromParent(); - } - ValueMap[SGV] = NewGV; - } - - return false; -} - -/// Update the initializers in the Dest module now that all globals that may be -/// referenced are in Dest. -void ModuleLinker::linkGlobalInit(GlobalVariable &Dst, GlobalVariable &Src) { - // Figure out what the initializer looks like in the dest module. - Dst.setInitializer(MapValue(Src.getInitializer(), ValueMap, - RF_MoveDistinctMDs, &TypeMap, &ValMaterializer)); -} - -/// Copy the source function over into the dest function and fix up references -/// to values. At this point we know that Dest is an external function, and -/// that Src is not. -bool ModuleLinker::linkFunctionBody(Function &Dst, Function &Src) { - assert(Dst.isDeclaration() && !Src.isDeclaration()); - - // Materialize if needed. 
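Source functions may still be backed by a lazily loading bitcode reader, so a body has to be materialized before it can be spliced and can be released again afterwards. A hedged sketch of that bracket, mirroring the materialize/dematerialize calls used here:

// Sketch: materialize the source body, do the splice/remap work, then release
// the source copy; a non-zero error_code from materialize() aborts the link.
bool withMaterializedBody(Function &SrcFn) {
  if (std::error_code EC = SrcFn.materialize())
    return true; // caller reports EC.message()
  // ... splice basic blocks and remap operands here ...
  SrcFn.dematerialize();
  return false;
}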
- if (std::error_code EC = Src.materialize()) - return emitError(EC.message()); - - // Link in the prefix data. - if (Src.hasPrefixData()) - Dst.setPrefixData(MapValue(Src.getPrefixData(), ValueMap, - RF_MoveDistinctMDs, &TypeMap, &ValMaterializer)); - - // Link in the prologue data. - if (Src.hasPrologueData()) - Dst.setPrologueData(MapValue(Src.getPrologueData(), ValueMap, - RF_MoveDistinctMDs, &TypeMap, - &ValMaterializer)); - - // Link in the personality function. - if (Src.hasPersonalityFn()) - Dst.setPersonalityFn(MapValue(Src.getPersonalityFn(), ValueMap, - RF_MoveDistinctMDs, &TypeMap, - &ValMaterializer)); - - // Go through and convert function arguments over, remembering the mapping. - Function::arg_iterator DI = Dst.arg_begin(); - for (Argument &Arg : Src.args()) { - DI->setName(Arg.getName()); // Copy the name over. - - // Add a mapping to our mapping. - ValueMap[&Arg] = &*DI; - ++DI; - } - - // Copy over the metadata attachments. - SmallVector, 8> MDs; - Src.getAllMetadata(MDs); - for (const auto &I : MDs) - Dst.setMetadata(I.first, MapMetadata(I.second, ValueMap, RF_MoveDistinctMDs, - &TypeMap, &ValMaterializer)); - - // Splice the body of the source function into the dest function. - Dst.getBasicBlockList().splice(Dst.end(), Src.getBasicBlockList()); - - // At this point, all of the instructions and values of the function are now - // copied over. The only problem is that they are still referencing values in - // the Source function as operands. Loop through all of the operands of the - // functions and patch them up to point to the local versions. - for (BasicBlock &BB : Dst) - for (Instruction &I : BB) - RemapInstruction(&I, ValueMap, - RF_IgnoreMissingEntries | RF_MoveDistinctMDs, &TypeMap, - &ValMaterializer); - - // There is no need to map the arguments anymore. - for (Argument &Arg : Src.args()) - ValueMap.erase(&Arg); - - Src.dematerialize(); - return false; -} - -void ModuleLinker::linkAliasBody(GlobalAlias &Dst, GlobalAlias &Src) { - Constant *Aliasee = Src.getAliasee(); - Constant *Val = MapValue(Aliasee, ValueMap, RF_MoveDistinctMDs, &TypeMap, - &ValMaterializer); - Dst.setAliasee(Val); -} - -bool ModuleLinker::linkGlobalValueBody(GlobalValue &Dst, GlobalValue &Src) { - if (const Comdat *SC = Src.getComdat()) { - // To ensure that we don't generate an incomplete comdat group, - // we must materialize and map in any other members that are not - // yet materialized in Dst, which also ensures their definitions - // are linked in. Otherwise, linkonce and other lazy linked GVs will - // not be materialized if they aren't referenced. - for (auto *SGV : ComdatMembers[SC]) { - auto *DGV = cast_or_null(ValueMap[SGV]); - if (DGV && !DGV->isDeclaration()) - continue; - MapValue(SGV, ValueMap, RF_MoveDistinctMDs, &TypeMap, &ValMaterializer); - } - } - if (shouldInternalizeLinkedSymbols()) - if (auto *DGV = dyn_cast(&Dst)) - DGV->setLinkage(GlobalValue::InternalLinkage); - if (auto *F = dyn_cast(&Src)) - return linkFunctionBody(cast(Dst), *F); - if (auto *GVar = dyn_cast(&Src)) { - linkGlobalInit(cast(Dst), *GVar); - return false; - } - linkAliasBody(cast(Dst), cast(Src)); - return false; -} - -/// Insert all of the named MDNodes in Src into the Dest module. -void ModuleLinker::linkNamedMDNodes() { - const NamedMDNode *SrcModFlags = SrcM.getModuleFlagsMetadata(); - for (const NamedMDNode &NMD : SrcM.named_metadata()) { - // Don't link module flags here. Do them separately. 
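Named metadata such as !llvm.ident or !llvm.dbg.cu is a module-level list of MDNodes, so merging it is simply appending each (remapped) source operand to the destination node of the same name. A small hedged usage sketch:

// Sketch: look up or create a named metadata node and append one entry to it.
void appendNamedMD(Module &M, StringRef Name, MDNode *Entry) {
  NamedMDNode *NMD = M.getOrInsertNamedMetadata(Name);
  NMD->addOperand(Entry);
}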
- if (&NMD == SrcModFlags) - continue; - NamedMDNode *DestNMD = DstM.getOrInsertNamedMetadata(NMD.getName()); - // Add Src elements into Dest node. - for (const MDNode *op : NMD.operands()) - DestNMD->addOperand(MapMetadata( - op, ValueMap, RF_MoveDistinctMDs | RF_NullMapMissingGlobalValues, - &TypeMap, &ValMaterializer)); - } -} - -/// Merge the linker flags in Src into the Dest module. -bool ModuleLinker::linkModuleFlagsMetadata() { - // If the source module has no module flags, we are done. - const NamedMDNode *SrcModFlags = SrcM.getModuleFlagsMetadata(); - if (!SrcModFlags) - return false; - - // If the destination module doesn't have module flags yet, then just copy - // over the source module's flags. - NamedMDNode *DstModFlags = DstM.getOrInsertModuleFlagsMetadata(); - if (DstModFlags->getNumOperands() == 0) { - for (unsigned I = 0, E = SrcModFlags->getNumOperands(); I != E; ++I) - DstModFlags->addOperand(SrcModFlags->getOperand(I)); - - return false; - } - - // First build a map of the existing module flags and requirements. - DenseMap> Flags; - SmallSetVector Requirements; - for (unsigned I = 0, E = DstModFlags->getNumOperands(); I != E; ++I) { - MDNode *Op = DstModFlags->getOperand(I); - ConstantInt *Behavior = mdconst::extract(Op->getOperand(0)); - MDString *ID = cast(Op->getOperand(1)); - - if (Behavior->getZExtValue() == Module::Require) { - Requirements.insert(cast(Op->getOperand(2))); - } else { - Flags[ID] = std::make_pair(Op, I); - } - } - - // Merge in the flags from the source module, and also collect its set of - // requirements. - bool HasErr = false; - for (unsigned I = 0, E = SrcModFlags->getNumOperands(); I != E; ++I) { - MDNode *SrcOp = SrcModFlags->getOperand(I); - ConstantInt *SrcBehavior = - mdconst::extract(SrcOp->getOperand(0)); - MDString *ID = cast(SrcOp->getOperand(1)); - MDNode *DstOp; - unsigned DstIndex; - std::tie(DstOp, DstIndex) = Flags.lookup(ID); - unsigned SrcBehaviorValue = SrcBehavior->getZExtValue(); - - // If this is a requirement, add it and continue. - if (SrcBehaviorValue == Module::Require) { - // If the destination module does not already have this requirement, add - // it. - if (Requirements.insert(cast(SrcOp->getOperand(2)))) { - DstModFlags->addOperand(SrcOp); - } - continue; - } - - // If there is no existing flag with this ID, just add it. - if (!DstOp) { - Flags[ID] = std::make_pair(SrcOp, DstModFlags->getNumOperands()); - DstModFlags->addOperand(SrcOp); - continue; - } - - // Otherwise, perform a merge. - ConstantInt *DstBehavior = - mdconst::extract(DstOp->getOperand(0)); - unsigned DstBehaviorValue = DstBehavior->getZExtValue(); - - // If either flag has override behavior, handle it first. - if (DstBehaviorValue == Module::Override) { - // Diagnose inconsistent flags which both have override behavior. - if (SrcBehaviorValue == Module::Override && - SrcOp->getOperand(2) != DstOp->getOperand(2)) { - HasErr |= emitError("linking module flags '" + ID->getString() + - "': IDs have conflicting override values"); - } - continue; - } else if (SrcBehaviorValue == Module::Override) { - // Update the destination flag to that of the source. - DstModFlags->setOperand(DstIndex, SrcOp); - Flags[ID].first = SrcOp; - continue; - } - - // Diagnose inconsistent merge behavior types. 
- if (SrcBehaviorValue != DstBehaviorValue) { - HasErr |= emitError("linking module flags '" + ID->getString() + - "': IDs have conflicting behaviors"); - continue; - } - - auto replaceDstValue = [&](MDNode *New) { - Metadata *FlagOps[] = {DstOp->getOperand(0), ID, New}; - MDNode *Flag = MDNode::get(DstM.getContext(), FlagOps); - DstModFlags->setOperand(DstIndex, Flag); - Flags[ID].first = Flag; - }; - - // Perform the merge for standard behavior types. - switch (SrcBehaviorValue) { - case Module::Require: - case Module::Override: - llvm_unreachable("not possible"); - case Module::Error: { - // Emit an error if the values differ. - if (SrcOp->getOperand(2) != DstOp->getOperand(2)) { - HasErr |= emitError("linking module flags '" + ID->getString() + - "': IDs have conflicting values"); - } - continue; - } - case Module::Warning: { - // Emit a warning if the values differ. - if (SrcOp->getOperand(2) != DstOp->getOperand(2)) { - emitWarning("linking module flags '" + ID->getString() + - "': IDs have conflicting values"); - } - continue; - } - case Module::Append: { - MDNode *DstValue = cast(DstOp->getOperand(2)); - MDNode *SrcValue = cast(SrcOp->getOperand(2)); - SmallVector MDs; - MDs.reserve(DstValue->getNumOperands() + SrcValue->getNumOperands()); - MDs.append(DstValue->op_begin(), DstValue->op_end()); - MDs.append(SrcValue->op_begin(), SrcValue->op_end()); - - replaceDstValue(MDNode::get(DstM.getContext(), MDs)); - break; - } - case Module::AppendUnique: { - SmallSetVector Elts; - MDNode *DstValue = cast(DstOp->getOperand(2)); - MDNode *SrcValue = cast(SrcOp->getOperand(2)); - Elts.insert(DstValue->op_begin(), DstValue->op_end()); - Elts.insert(SrcValue->op_begin(), SrcValue->op_end()); - - replaceDstValue(MDNode::get(DstM.getContext(), - makeArrayRef(Elts.begin(), Elts.end()))); - break; - } - } - } - - // Check all of the requirements. - for (unsigned I = 0, E = Requirements.size(); I != E; ++I) { - MDNode *Requirement = Requirements[I]; - MDString *Flag = cast(Requirement->getOperand(0)); - Metadata *ReqValue = Requirement->getOperand(1); - - MDNode *Op = Flags[Flag].first; - if (!Op || Op->getOperand(2) != ReqValue) { - HasErr |= emitError("linking module flags '" + Flag->getString() + - "': does not have the required value"); - continue; - } - } - - return HasErr; -} - -// This function returns true if the triples match. -static bool triplesMatch(const Triple &T0, const Triple &T1) { - // If vendor is apple, ignore the version number. - if (T0.getVendor() == Triple::Apple) - return T0.getArch() == T1.getArch() && T0.getSubArch() == T1.getSubArch() && - T0.getVendor() == T1.getVendor() && T0.getOS() == T1.getOS(); - - return T0 == T1; -} - -// This function returns the merged triple. -static std::string mergeTriples(const Triple &SrcTriple, - const Triple &DstTriple) { - // If vendor is apple, pick the triple with the larger version number. 
- if (SrcTriple.getVendor() == Triple::Apple) - if (DstTriple.isOSVersionLT(SrcTriple)) - return SrcTriple.str(); - - return DstTriple.str(); -} - bool ModuleLinker::linkIfNeeded(GlobalValue &GV) { GlobalValue *DGV = getLinkedToGlobal(&GV); @@ -1802,6 +656,9 @@ bool ModuleLinker::linkIfNeeded(GlobalValue &GV) { GV.hasAvailableExternallyLinkage())) return false; + if (GV.isDeclaration()) + return false; + if (const Comdat *SC = GV.getComdat()) { bool LinkFromSrc; Comdat::SelectionKind SK; @@ -1819,47 +676,49 @@ bool ModuleLinker::linkIfNeeded(GlobalValue &GV) { return false; } -bool ModuleLinker::run() { - // Inherit the target data from the source module if the destination module - // doesn't have one already. - if (DstM.getDataLayout().isDefault()) - DstM.setDataLayout(SrcM.getDataLayout()); - - if (SrcM.getDataLayout() != DstM.getDataLayout()) { - emitWarning("Linking two modules of different data layouts: '" + - SrcM.getModuleIdentifier() + "' is '" + - SrcM.getDataLayoutStr() + "' whereas '" + - DstM.getModuleIdentifier() + "' is '" + - DstM.getDataLayoutStr() + "'\n"); - } - - // Copy the target triple from the source to dest if the dest's is empty. - if (DstM.getTargetTriple().empty() && !SrcM.getTargetTriple().empty()) - DstM.setTargetTriple(SrcM.getTargetTriple()); - - Triple SrcTriple(SrcM.getTargetTriple()), DstTriple(DstM.getTargetTriple()); +void ModuleLinker::addLazyFor(GlobalValue &GV, IRMover::ValueAdder Add) { + // Add these to the internalize list + if (!GV.hasLinkOnceLinkage()) + return; - if (!SrcM.getTargetTriple().empty() && !triplesMatch(SrcTriple, DstTriple)) - emitWarning("Linking two modules of different target triples: " + - SrcM.getModuleIdentifier() + "' is '" + SrcM.getTargetTriple() + - "' whereas '" + DstM.getModuleIdentifier() + "' is '" + - DstM.getTargetTriple() + "'\n"); + if (shouldInternalizeLinkedSymbols()) + Internalize.insert(GV.getName()); + Add(GV); - DstM.setTargetTriple(mergeTriples(SrcTriple, DstTriple)); + const Comdat *SC = GV.getComdat(); + if (!SC) + return; + for (GlobalValue *GV2 : ComdatMembers[SC]) { + if (!GV2->hasLocalLinkage() && shouldInternalizeLinkedSymbols()) + Internalize.insert(GV2->getName()); + Add(*GV2); + } +} - // Append the module inline asm string. - if (!SrcM.getModuleInlineAsm().empty()) { - if (DstM.getModuleInlineAsm().empty()) - DstM.setModuleInlineAsm(SrcM.getModuleInlineAsm()); - else - DstM.setModuleInlineAsm(DstM.getModuleInlineAsm() + "\n" + - SrcM.getModuleInlineAsm()); +void ModuleLinker::processGlobalForThinLTO(GlobalValue &GV) { + if (GV.hasLocalLinkage() && + (doPromoteLocalToGlobal(&GV) || isPerformingImport())) { + GV.setName(getName(&GV)); + GV.setLinkage(getLinkage(&GV)); + if (!GV.hasLocalLinkage()) + GV.setVisibility(GlobalValue::HiddenVisibility); + if (isModuleExporting()) + ValuesToLink.insert(&GV); + return; } + GV.setLinkage(getLinkage(&GV)); +} - // Loop over all of the linked values to compute type mappings. - computeTypeMapping(); +void ModuleLinker::processGlobalsForThinLTO() { + for (GlobalVariable &GV : SrcM.globals()) + processGlobalForThinLTO(GV); + for (Function &SF : SrcM) + processGlobalForThinLTO(SF); + for (GlobalAlias &GA : SrcM.aliases()) + processGlobalForThinLTO(GA); +} - ComdatsChosen.clear(); +bool ModuleLinker::run() { for (const auto &SMEC : SrcM.getComdatSymbolTable()) { const Comdat &C = SMEC.getValue(); if (ComdatsChosen.count(&C)) @@ -1871,9 +730,6 @@ bool ModuleLinker::run() { ComdatsChosen[&C] = std::make_pair(SK, LinkFromSrc); } - // Upgrade mismatched global arrays. 
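The addLazyFor()/processGlobalForThinLTO() additions above are what make function importing work: locals that are imported from or exported get renamed, relinked via doPromoteLocalToGlobal(), and given hidden visibility once promoted. A rough sketch of the importer-side entry point, based on the renameModuleForThinLTO() definition added later in this diff; prepareForImport is a made-up wrapper name and the declaring header is assumed to be llvm/Linker/Linker.h.

#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Module.h"
#include "llvm/Linker/Linker.h"
#include "llvm/Support/ErrorHandling.h"

using namespace llvm;

// Rename/promote locals in M so other modules can import from it; Index is
// the function summary index that drives the promotion decisions above.
static std::unique_ptr<Module>
prepareForImport(std::unique_ptr<Module> M, const FunctionInfoIndex *Index) {
  auto Renamed = renameModuleForThinLTO(
      M, Index, [](const DiagnosticInfo &DI) { /* report DI */ });
  if (!Renamed)
    report_fatal_error("renaming module for ThinLTO failed");
  return Renamed;
}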
- upgradeMismatchedGlobals(); - for (GlobalVariable &GV : SrcM.globals()) if (const Comdat *SC = GV.getComdat()) ComdatMembers[SC].push_back(&GV); @@ -1900,143 +756,44 @@ bool ModuleLinker::run() { if (linkIfNeeded(GA)) return true; - for (GlobalValue *GV : ValuesToLink) { - MapValue(GV, ValueMap, RF_MoveDistinctMDs, &TypeMap, &ValMaterializer); - if (HasError) - return true; - } + processGlobalsForThinLTO(); - // Note that we are done linking global value bodies. This prevents - // metadata linking from creating new references. - DoneLinkingBodies = true; + for (unsigned I = 0; I < ValuesToLink.size(); ++I) { + GlobalValue *GV = ValuesToLink[I]; + const Comdat *SC = GV->getComdat(); + if (!SC) + continue; + for (GlobalValue *GV2 : ComdatMembers[SC]) + ValuesToLink.insert(GV2); + } - // Remap all of the named MDNodes in Src into the DstM module. We do this - // after linking GlobalValues so that MDNodes that reference GlobalValues - // are properly remapped. - linkNamedMDNodes(); + if (shouldInternalizeLinkedSymbols()) { + for (GlobalValue *GV : ValuesToLink) + Internalize.insert(GV->getName()); + } - // Merge the module flags into the DstM module. - if (linkModuleFlagsMetadata()) + if (Mover.move(SrcM, ValuesToLink.getArrayRef(), + [this](GlobalValue &GV, IRMover::ValueAdder Add) { + addLazyFor(GV, Add); + })) return true; + Module &DstM = Mover.getModule(); + for (auto &P : Internalize) { + GlobalValue *GV = DstM.getNamedValue(P.first()); + GV->setLinkage(GlobalValue::InternalLinkage); + } return false; } -Linker::StructTypeKeyInfo::KeyTy::KeyTy(ArrayRef E, bool P) - : ETypes(E), IsPacked(P) {} - -Linker::StructTypeKeyInfo::KeyTy::KeyTy(const StructType *ST) - : ETypes(ST->elements()), IsPacked(ST->isPacked()) {} - -bool Linker::StructTypeKeyInfo::KeyTy::operator==(const KeyTy &That) const { - if (IsPacked != That.IsPacked) - return false; - if (ETypes != That.ETypes) - return false; - return true; -} - -bool Linker::StructTypeKeyInfo::KeyTy::operator!=(const KeyTy &That) const { - return !this->operator==(That); -} - -StructType *Linker::StructTypeKeyInfo::getEmptyKey() { - return DenseMapInfo::getEmptyKey(); -} - -StructType *Linker::StructTypeKeyInfo::getTombstoneKey() { - return DenseMapInfo::getTombstoneKey(); -} - -unsigned Linker::StructTypeKeyInfo::getHashValue(const KeyTy &Key) { - return hash_combine(hash_combine_range(Key.ETypes.begin(), Key.ETypes.end()), - Key.IsPacked); -} - -unsigned Linker::StructTypeKeyInfo::getHashValue(const StructType *ST) { - return getHashValue(KeyTy(ST)); -} - -bool Linker::StructTypeKeyInfo::isEqual(const KeyTy &LHS, - const StructType *RHS) { - if (RHS == getEmptyKey() || RHS == getTombstoneKey()) - return false; - return LHS == KeyTy(RHS); -} - -bool Linker::StructTypeKeyInfo::isEqual(const StructType *LHS, - const StructType *RHS) { - if (RHS == getEmptyKey()) - return LHS == getEmptyKey(); - - if (RHS == getTombstoneKey()) - return LHS == getTombstoneKey(); - - return KeyTy(LHS) == KeyTy(RHS); -} - -void Linker::IdentifiedStructTypeSet::addNonOpaque(StructType *Ty) { - assert(!Ty->isOpaque()); - NonOpaqueStructTypes.insert(Ty); -} - -void Linker::IdentifiedStructTypeSet::switchToNonOpaque(StructType *Ty) { - assert(!Ty->isOpaque()); - NonOpaqueStructTypes.insert(Ty); - bool Removed = OpaqueStructTypes.erase(Ty); - (void)Removed; - assert(Removed); -} - -void Linker::IdentifiedStructTypeSet::addOpaque(StructType *Ty) { - assert(Ty->isOpaque()); - OpaqueStructTypes.insert(Ty); -} - -StructType * 
-Linker::IdentifiedStructTypeSet::findNonOpaque(ArrayRef ETypes, - bool IsPacked) { - Linker::StructTypeKeyInfo::KeyTy Key(ETypes, IsPacked); - auto I = NonOpaqueStructTypes.find_as(Key); - if (I == NonOpaqueStructTypes.end()) - return nullptr; - return *I; -} - -bool Linker::IdentifiedStructTypeSet::hasType(StructType *Ty) { - if (Ty->isOpaque()) - return OpaqueStructTypes.count(Ty); - auto I = NonOpaqueStructTypes.find(Ty); - if (I == NonOpaqueStructTypes.end()) - return false; - return *I == Ty; -} - Linker::Linker(Module &M, DiagnosticHandlerFunction DiagnosticHandler) - : Composite(M), DiagnosticHandler(DiagnosticHandler) { - TypeFinder StructTypes; - StructTypes.run(M, true); - for (StructType *Ty : StructTypes) { - if (Ty->isOpaque()) - IdentifiedStructTypes.addOpaque(Ty); - else - IdentifiedStructTypes.addNonOpaque(Ty); - } -} - -Linker::Linker(Module &M) - : Linker(M, [this](const DiagnosticInfo &DI) { - Composite.getContext().diagnose(DI); - }) {} + : Mover(M, DiagnosticHandler) {} bool Linker::linkInModule(Module &Src, unsigned Flags, const FunctionInfoIndex *Index, DenseSet *FunctionsToImport) { - ModuleLinker TheLinker(Composite, IdentifiedStructTypes, Src, - DiagnosticHandler, Flags, Index, FunctionsToImport); - bool RetCode = TheLinker.run(); - Composite.dropTriviallyDeadConstantArrays(); - return RetCode; + ModuleLinker TheLinker(Mover, Src, Flags, Index, FunctionsToImport); + return TheLinker.run(); } //===----------------------------------------------------------------------===// @@ -2055,9 +812,16 @@ bool Linker::linkModules(Module &Dest, Module &Src, return L.linkInModule(Src, Flags); } -bool Linker::linkModules(Module &Dest, Module &Src, unsigned Flags) { - Linker L(Dest); - return L.linkInModule(Src, Flags); +std::unique_ptr +llvm::renameModuleForThinLTO(std::unique_ptr &M, + const FunctionInfoIndex *Index, + DiagnosticHandlerFunction DiagnosticHandler) { + std::unique_ptr RenamedModule( + new llvm::Module(M->getModuleIdentifier(), M->getContext())); + Linker L(*RenamedModule.get(), DiagnosticHandler); + if (L.linkInModule(*M.get(), llvm::Linker::Flags::None, Index)) + return nullptr; + return RenamedModule; } //===----------------------------------------------------------------------===// diff --git a/lib/MC/MCInstrDesc.cpp b/lib/MC/MCInstrDesc.cpp index 5be2fa1b30b6..ee55f3eff3ac 100644 --- a/lib/MC/MCInstrDesc.cpp +++ b/lib/MC/MCInstrDesc.cpp @@ -53,7 +53,7 @@ bool MCInstrDesc::mayAffectControlFlow(const MCInst &MI, bool MCInstrDesc::hasImplicitDefOfPhysReg(unsigned Reg, const MCRegisterInfo *MRI) const { - if (const uint16_t *ImpDefs = ImplicitDefs) + if (const MCPhysReg *ImpDefs = ImplicitDefs) for (; *ImpDefs; ++ImpDefs) if (*ImpDefs == Reg || (MRI && MRI->isSubRegister(Reg, *ImpDefs))) return true; diff --git a/lib/MC/MCObjectFileInfo.cpp b/lib/MC/MCObjectFileInfo.cpp index 41e28698b1cc..dbedd73a4325 100644 --- a/lib/MC/MCObjectFileInfo.cpp +++ b/lib/MC/MCObjectFileInfo.cpp @@ -262,6 +262,9 @@ void MCObjectFileInfo::initMachOMCObjectFileInfo(Triple T) { DwarfCUIndexSection = Ctx->getMachOSection("__DWARF", "__debug_cu_index", MachO::S_ATTR_DEBUG, SectionKind::getMetadata()); + DwarfTUIndexSection = + Ctx->getMachOSection("__DWARF", "__debug_tu_index", MachO::S_ATTR_DEBUG, + SectionKind::getMetadata()); StackMapSection = Ctx->getMachOSection("__LLVM_STACKMAPS", "__llvm_stackmaps", 0, SectionKind::getMetadata()); @@ -537,6 +540,8 @@ void MCObjectFileInfo::initELFMCObjectFileInfo(Triple T) { // DWP Sections DwarfCUIndexSection = 
Ctx->getELFSection(".debug_cu_index", ELF::SHT_PROGBITS, 0); + DwarfTUIndexSection = + Ctx->getELFSection(".debug_tu_index", ELF::SHT_PROGBITS, 0); StackMapSection = Ctx->getELFSection(".llvm_stackmaps", ELF::SHT_PROGBITS, ELF::SHF_ALLOC); @@ -725,6 +730,11 @@ void MCObjectFileInfo::initCOFFMCObjectFileInfo(Triple T) { COFF::IMAGE_SCN_MEM_DISCARDABLE | COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | COFF::IMAGE_SCN_MEM_READ, SectionKind::getMetadata()); + DwarfTUIndexSection = Ctx->getCOFFSection( + ".debug_tu_index", + COFF::IMAGE_SCN_MEM_DISCARDABLE | COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | + COFF::IMAGE_SCN_MEM_READ, + SectionKind::getMetadata()); DwarfAccelNamesSection = Ctx->getCOFFSection( ".apple_names", COFF::IMAGE_SCN_MEM_DISCARDABLE | COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | diff --git a/lib/MC/MCParser/AsmParser.cpp b/lib/MC/MCParser/AsmParser.cpp index 8e8be8e52f63..61f7d749b968 100644 --- a/lib/MC/MCParser/AsmParser.cpp +++ b/lib/MC/MCParser/AsmParser.cpp @@ -4753,8 +4753,8 @@ bool AsmParser::parseMSInlineAsm( } // Consider implicit defs to be clobbers. Think of cpuid and push. - ArrayRef ImpDefs(Desc.getImplicitDefs(), - Desc.getNumImplicitDefs()); + ArrayRef ImpDefs(Desc.getImplicitDefs(), + Desc.getNumImplicitDefs()); ClobberRegs.insert(ClobberRegs.end(), ImpDefs.begin(), ImpDefs.end()); } diff --git a/lib/Object/MachOObjectFile.cpp b/lib/Object/MachOObjectFile.cpp index e34c86542ab3..d1f79b225ee4 100644 --- a/lib/Object/MachOObjectFile.cpp +++ b/lib/Object/MachOObjectFile.cpp @@ -1403,8 +1403,7 @@ MachOObjectFile::exports(ArrayRef Trie) { ExportEntry Finish(Trie); Finish.moveToEnd(); - return iterator_range(export_iterator(Start), - export_iterator(Finish)); + return make_range(export_iterator(Start), export_iterator(Finish)); } iterator_range MachOObjectFile::exports() const { @@ -1574,8 +1573,7 @@ MachOObjectFile::rebaseTable(ArrayRef Opcodes, bool is64) { MachORebaseEntry Finish(Opcodes, is64); Finish.moveToEnd(); - return iterator_range(rebase_iterator(Start), - rebase_iterator(Finish)); + return make_range(rebase_iterator(Start), rebase_iterator(Finish)); } iterator_range MachOObjectFile::rebaseTable() const { @@ -1826,8 +1824,7 @@ MachOObjectFile::bindTable(ArrayRef Opcodes, bool is64, MachOBindEntry Finish(Opcodes, is64, BKind); Finish.moveToEnd(); - return iterator_range(bind_iterator(Start), - bind_iterator(Finish)); + return make_range(bind_iterator(Start), bind_iterator(Finish)); } iterator_range MachOObjectFile::bindTable() const { @@ -1857,8 +1854,7 @@ MachOObjectFile::end_load_commands() const { iterator_range MachOObjectFile::load_commands() const { - return iterator_range(begin_load_commands(), - end_load_commands()); + return make_range(begin_load_commands(), end_load_commands()); } StringRef diff --git a/lib/ProfileData/InstrProf.cpp b/lib/ProfileData/InstrProf.cpp index 530be8ac044a..a965a1208b51 100644 --- a/lib/ProfileData/InstrProf.cpp +++ b/lib/ProfileData/InstrProf.cpp @@ -74,7 +74,8 @@ namespace llvm { std::string getPGOFuncName(StringRef RawFuncName, GlobalValue::LinkageTypes Linkage, - StringRef FileName) { + StringRef FileName, + uint64_t Version LLVM_ATTRIBUTE_UNUSED) { // Function names may be prefixed with a binary '1' to indicate // that the backend should not modify the symbols due to any platform @@ -96,8 +97,9 @@ std::string getPGOFuncName(StringRef RawFuncName, return FuncName; } -std::string getPGOFuncName(const Function &F) { - return getPGOFuncName(F.getName(), F.getLinkage(), F.getParent()->getName()); +std::string getPGOFuncName(const Function 
&F, uint64_t Version) { + return getPGOFuncName(F.getName(), F.getLinkage(), F.getParent()->getName(), + Version); } GlobalVariable *createPGOFuncNameVar(Module &M, diff --git a/lib/ProfileData/InstrProfReader.cpp b/lib/ProfileData/InstrProfReader.cpp index cfc968739806..da68242b4617 100644 --- a/lib/ProfileData/InstrProfReader.cpp +++ b/lib/ProfileData/InstrProfReader.cpp @@ -104,8 +104,9 @@ bool TextInstrProfReader::hasFormat(const MemoryBuffer &Buffer) { // 'reasonable' number of characters (up to profile magic size). size_t count = std::min(Buffer.getBufferSize(), sizeof(uint64_t)); StringRef buffer = Buffer.getBufferStart(); - return count == 0 || std::all_of(buffer.begin(), buffer.begin() + count, - [](char c) { return ::isprint(c) || ::isspace(c); }); + return count == 0 || + std::all_of(buffer.begin(), buffer.begin() + count, + [](char c) { return ::isprint(c) || ::isspace(c); }); } std::error_code TextInstrProfReader::readNextRecord(InstrProfRecord &Record) { @@ -206,7 +207,6 @@ std::error_code RawInstrProfReader::readHeader( CountersDelta = swap(Header.CountersDelta); NamesDelta = swap(Header.NamesDelta); - ValueDataDelta = swap(Header.ValueDataDelta); auto DataSize = swap(Header.DataSize); auto CountersSize = swap(Header.CountersSize); auto NamesSize = swap(Header.NamesSize); @@ -296,55 +296,35 @@ std::error_code RawInstrProfReader::readRawCounts( } template -std::error_code RawInstrProfReader::readValueProfilingData( - InstrProfRecord &Record) { +std::error_code +RawInstrProfReader::readValueProfilingData(InstrProfRecord &Record) { Record.clearValueData(); - if (!Data->Values || (ValueDataDelta == 0)) + CurValueDataSize = 0; + // Need to match the logic in value profile dumper code in compiler-rt: + uint32_t NumValueKinds = 0; + for (uint32_t I = 0; I < IPVK_Last + 1; I++) + NumValueKinds += (Data->NumValueSites[I] != 0); + + if (!NumValueKinds) return success(); - // Read value data. - uint64_t NumVSites = 0; - for (uint32_t Kind = IPVK_First; Kind <= ValueKindLast; ++Kind) - NumVSites += swap(Data->NumValueSites[Kind]); - NumVSites += getNumPaddingBytes(NumVSites); + ErrorOr> VDataPtrOrErr = + ValueProfData::getValueProfData(ValueDataStart, + (const unsigned char *)ProfileEnd, + getDataEndianness()); - auto VDataCounts = makeArrayRef(getValueDataCounts(Data->Values), NumVSites); - // Check bounds. 
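For context on the readValueProfilingData() rewrite above: the raw reader now hands the whole value-data blob to ValueProfData (the same serialization compiler-rt writes) instead of walking counts and entries by hand. A rough sketch of a consumer, assuming the InstrProfReader factory and record iteration of this era; dumpRawProfile is an illustrative name and error handling is minimal.

#include "llvm/ProfileData/InstrProfReader.h"
#include "llvm/Support/raw_ostream.h"

#include <string>

using namespace llvm;

static void dumpRawProfile(const std::string &Path) {
  auto ReaderOrErr = InstrProfReader::create(Path);
  if (std::error_code EC = ReaderOrErr.getError()) {
    errs() << Path << ": " << EC.message() << "\n";
    return;
  }
  auto Reader = std::move(ReaderOrErr.get());

  // Each record comes back with its value-profiling data already
  // deserialized; readValueProfilingData() above fills it in.
  for (const InstrProfRecord &Rec : *Reader)
    outs() << Rec.Name << ": " << Rec.Counts.size() << " counters, "
           << Rec.getNumValueSites(IPVK_IndirectCallTarget)
           << " indirect-call sites\n";
}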
- if (VDataCounts.data() < ValueDataStart || - VDataCounts.data() + VDataCounts.size() > - reinterpret_cast(ProfileEnd)) - return error(instrprof_error::malformed); + if (VDataPtrOrErr.getError()) + return VDataPtrOrErr.getError(); - const InstrProfValueData *VDataPtr = - getValueData(swap(Data->Values) + NumVSites); - for (uint32_t Kind = IPVK_First; Kind <= ValueKindLast; ++Kind) { - NumVSites = swap(Data->NumValueSites[Kind]); - Record.reserveSites(Kind, NumVSites); - for (uint32_t VSite = 0; VSite < NumVSites; ++VSite) { - - uint32_t VDataCount = VDataCounts[VSite]; - if ((const char *)(VDataPtr + VDataCount) > ProfileEnd) - return error(instrprof_error::malformed); - - std::vector CurrentValues; - CurrentValues.reserve(VDataCount); - for (uint32_t VIndex = 0; VIndex < VDataCount; ++VIndex) { - uint64_t TargetValue = swap(VDataPtr->Value); - uint64_t Count = swap(VDataPtr->Count); - CurrentValues.push_back({TargetValue, Count}); - ++VDataPtr; - } - Record.addValueData(Kind, VSite, CurrentValues.data(), - VDataCount, &FunctionPtrToNameMap); - } - } + VDataPtrOrErr.get()->deserializeTo(Record, &FunctionPtrToNameMap); + CurValueDataSize = VDataPtrOrErr.get()->getSize(); return success(); } template -std::error_code RawInstrProfReader::readNextRecord( - InstrProfRecord &Record) { +std::error_code +RawInstrProfReader::readNextRecord(InstrProfRecord &Record) { if (atEnd()) if (std::error_code EC = readNextHeader(ProfileEnd)) return EC; @@ -362,7 +342,8 @@ std::error_code RawInstrProfReader::readNextRecord( return EC; // Read value data and set Record. - if (std::error_code EC = readValueProfilingData(Record)) return EC; + if (std::error_code EC = readValueProfilingData(Record)) + return EC; // Iterate. advanceData(); @@ -463,7 +444,8 @@ std::error_code InstrProfReaderIndex::getRecords( Data = *RecordIterator; - if (Data.empty()) return instrprof_error::malformed; + if (Data.empty()) + return instrprof_error::malformed; return instrprof_error::success; } diff --git a/lib/ProfileData/SampleProfReader.cpp b/lib/ProfileData/SampleProfReader.cpp index e71d0bae07bd..cdd98e8e8d03 100644 --- a/lib/ProfileData/SampleProfReader.cpp +++ b/lib/ProfileData/SampleProfReader.cpp @@ -693,15 +693,27 @@ SampleProfileReader::create(StringRef Filename, LLVMContext &C) { auto BufferOrError = setupMemoryBuffer(Filename); if (std::error_code EC = BufferOrError.getError()) return EC; + return create(BufferOrError.get(), C); +} - auto Buffer = std::move(BufferOrError.get()); +/// \brief Create a sample profile reader based on the format of the input data. +/// +/// \param B The memory buffer to create the reader from (assumes ownership). +/// +/// \param Reader The reader to instantiate according to \p Filename's format. +/// +/// \param C The LLVM context to use to emit diagnostics. +/// +/// \returns an error code indicating the status of the created reader. 
+ErrorOr> +SampleProfileReader::create(std::unique_ptr &B, LLVMContext &C) { std::unique_ptr Reader; - if (SampleProfileReaderBinary::hasFormat(*Buffer)) - Reader.reset(new SampleProfileReaderBinary(std::move(Buffer), C)); - else if (SampleProfileReaderGCC::hasFormat(*Buffer)) - Reader.reset(new SampleProfileReaderGCC(std::move(Buffer), C)); - else if (SampleProfileReaderText::hasFormat(*Buffer)) - Reader.reset(new SampleProfileReaderText(std::move(Buffer), C)); + if (SampleProfileReaderBinary::hasFormat(*B)) + Reader.reset(new SampleProfileReaderBinary(std::move(B), C)); + else if (SampleProfileReaderGCC::hasFormat(*B)) + Reader.reset(new SampleProfileReaderGCC(std::move(B), C)); + else if (SampleProfileReaderText::hasFormat(*B)) + Reader.reset(new SampleProfileReaderText(std::move(B), C)); else return sampleprof_error::unrecognized_format; diff --git a/lib/ProfileData/SampleProfWriter.cpp b/lib/ProfileData/SampleProfWriter.cpp index c9f892334686..51feee5ad7d1 100644 --- a/lib/ProfileData/SampleProfWriter.cpp +++ b/lib/ProfileData/SampleProfWriter.cpp @@ -39,6 +39,8 @@ using namespace llvm; /// it needs to be parsed by the SampleProfileReaderText class. std::error_code SampleProfileWriterText::write(StringRef FName, const FunctionSamples &S) { + auto &OS = *OutputStream; + OS << FName << ":" << S.getTotalSamples(); if (Indent == 0) OS << ":" << S.getHeadSamples(); @@ -84,7 +86,7 @@ std::error_code SampleProfileWriterBinary::writeNameIdx(StringRef FName) { const auto &ret = NameTable.find(FName); if (ret == NameTable.end()) return sampleprof_error::truncated_name_table; - encodeULEB128(ret->second, OS); + encodeULEB128(ret->second, *OutputStream); return sampleprof_error::success; } @@ -112,6 +114,8 @@ void SampleProfileWriterBinary::addNames(const FunctionSamples &S) { std::error_code SampleProfileWriterBinary::writeHeader( const StringMap &ProfileMap) { + auto &OS = *OutputStream; + // Write file magic identifier. encodeULEB128(SPMagic(), OS); encodeULEB128(SPVersion(), OS); @@ -134,6 +138,8 @@ std::error_code SampleProfileWriterBinary::writeHeader( std::error_code SampleProfileWriterBinary::writeBody(StringRef FName, const FunctionSamples &S) { + auto &OS = *OutputStream; + if (std::error_code EC = writeNameIdx(FName)) return EC; @@ -176,11 +182,11 @@ std::error_code SampleProfileWriterBinary::writeBody(StringRef FName, /// \returns true if the samples were written successfully, false otherwise. std::error_code SampleProfileWriterBinary::write(StringRef FName, const FunctionSamples &S) { - encodeULEB128(S.getHeadSamples(), OS); + encodeULEB128(S.getHeadSamples(), *OutputStream); return writeBody(FName, S); } -/// \brief Create a sample profile writer based on the specified format. +/// \brief Create a sample profile file writer based on the specified format. /// /// \param Filename The file to create. /// @@ -192,12 +198,36 @@ std::error_code SampleProfileWriterBinary::write(StringRef FName, ErrorOr> SampleProfileWriter::create(StringRef Filename, SampleProfileFormat Format) { std::error_code EC; + std::unique_ptr OS; + if (Format == SPF_Binary) + OS.reset(new raw_fd_ostream(Filename, EC, sys::fs::F_None)); + else + OS.reset(new raw_fd_ostream(Filename, EC, sys::fs::F_Text)); + if (EC) + return EC; + + return create(OS, Format); +} + +/// \brief Create a sample profile stream writer based on the specified format. +/// +/// \param OS The output stream to store the profile data to. +/// +/// \param Writer The writer to instantiate according to the specified format. 
+/// +/// \param Format Encoding format for the profile file. +/// +/// \returns an error code indicating the status of the created writer. +ErrorOr> +SampleProfileWriter::create(std::unique_ptr &OS, + SampleProfileFormat Format) { + std::error_code EC; std::unique_ptr Writer; if (Format == SPF_Binary) - Writer.reset(new SampleProfileWriterBinary(Filename, EC)); + Writer.reset(new SampleProfileWriterBinary(OS)); else if (Format == SPF_Text) - Writer.reset(new SampleProfileWriterText(Filename, EC)); + Writer.reset(new SampleProfileWriterText(OS)); else if (Format == SPF_GCC) EC = sampleprof_error::unsupported_writing_format; else diff --git a/lib/Support/Dwarf.cpp b/lib/Support/Dwarf.cpp index dd740384de38..7d7225671737 100644 --- a/lib/Support/Dwarf.cpp +++ b/lib/Support/Dwarf.cpp @@ -473,6 +473,16 @@ const char *llvm::dwarf::MacinfoString(unsigned Encoding) { return nullptr; } +unsigned llvm::dwarf::getMacinfo(StringRef MacinfoString) { + return StringSwitch(MacinfoString) + .Case("DW_MACINFO_define", DW_MACINFO_define) + .Case("DW_MACINFO_undef", DW_MACINFO_undef) + .Case("DW_MACINFO_start_file", DW_MACINFO_start_file) + .Case("DW_MACINFO_end_file", DW_MACINFO_end_file) + .Case("DW_MACINFO_vendor_ext", DW_MACINFO_vendor_ext) + .Default(DW_MACINFO_invalid); +} + const char *llvm::dwarf::CallFrameString(unsigned Encoding) { switch (Encoding) { case DW_CFA_nop: return "DW_CFA_nop"; diff --git a/lib/Target/AArch64/AArch64CallingConvention.h b/lib/Target/AArch64/AArch64CallingConvention.h index 68f9dcf3fc2a..bc44bc5f2461 100644 --- a/lib/Target/AArch64/AArch64CallingConvention.h +++ b/lib/Target/AArch64/AArch64CallingConvention.h @@ -25,21 +25,21 @@ namespace { using namespace llvm; -static const uint16_t XRegList[] = {AArch64::X0, AArch64::X1, AArch64::X2, - AArch64::X3, AArch64::X4, AArch64::X5, - AArch64::X6, AArch64::X7}; -static const uint16_t HRegList[] = {AArch64::H0, AArch64::H1, AArch64::H2, - AArch64::H3, AArch64::H4, AArch64::H5, - AArch64::H6, AArch64::H7}; -static const uint16_t SRegList[] = {AArch64::S0, AArch64::S1, AArch64::S2, - AArch64::S3, AArch64::S4, AArch64::S5, - AArch64::S6, AArch64::S7}; -static const uint16_t DRegList[] = {AArch64::D0, AArch64::D1, AArch64::D2, - AArch64::D3, AArch64::D4, AArch64::D5, - AArch64::D6, AArch64::D7}; -static const uint16_t QRegList[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2, - AArch64::Q3, AArch64::Q4, AArch64::Q5, - AArch64::Q6, AArch64::Q7}; +static const MCPhysReg XRegList[] = {AArch64::X0, AArch64::X1, AArch64::X2, + AArch64::X3, AArch64::X4, AArch64::X5, + AArch64::X6, AArch64::X7}; +static const MCPhysReg HRegList[] = {AArch64::H0, AArch64::H1, AArch64::H2, + AArch64::H3, AArch64::H4, AArch64::H5, + AArch64::H6, AArch64::H7}; +static const MCPhysReg SRegList[] = {AArch64::S0, AArch64::S1, AArch64::S2, + AArch64::S3, AArch64::S4, AArch64::S5, + AArch64::S6, AArch64::S7}; +static const MCPhysReg DRegList[] = {AArch64::D0, AArch64::D1, AArch64::D2, + AArch64::D3, AArch64::D4, AArch64::D5, + AArch64::D6, AArch64::D7}; +static const MCPhysReg QRegList[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2, + AArch64::Q3, AArch64::Q4, AArch64::Q5, + AArch64::Q6, AArch64::Q7}; static bool finishStackBlock(SmallVectorImpl &PendingMembers, MVT LocVT, ISD::ArgFlagsTy &ArgFlags, @@ -86,7 +86,7 @@ static bool CC_AArch64_Custom_Block(unsigned &ValNo, MVT &ValVT, MVT &LocVT, ISD::ArgFlagsTy &ArgFlags, CCState &State) { // Try to allocate a contiguous block of registers, each of the correct // size to hold one member. 
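Two small notes on the hunks above. The getMacinfo() helper added to Dwarf.cpp is the inverse of MacinfoString(): it maps a spelled-out name back to its DW_MACINFO_* code and returns DW_MACINFO_invalid for anything it does not recognize, so a caller can use it as a validity check; isKnownMacinfoKind below is an illustrative wrapper, not part of the patch. The AArch64CallingConvention.h change, like the MCInstrDesc.cpp one earlier, is type hygiene only: MCPhysReg is a uint16_t alias, so the register tables keep their size and layout.

#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Dwarf.h"

// Returns true if Name spells a known .debug_macinfo entry kind.
static bool isKnownMacinfoKind(llvm::StringRef Name) {
  return llvm::dwarf::getMacinfo(Name) != llvm::dwarf::DW_MACINFO_invalid;
}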
- ArrayRef RegList; + ArrayRef RegList; if (LocVT.SimpleTy == MVT::i64) RegList = XRegList; else if (LocVT.SimpleTy == MVT::f16) diff --git a/lib/Target/AArch64/AArch64CallingConvention.td b/lib/Target/AArch64/AArch64CallingConvention.td index 948b9ddb5df6..66d92100e637 100644 --- a/lib/Target/AArch64/AArch64CallingConvention.td +++ b/lib/Target/AArch64/AArch64CallingConvention.td @@ -279,6 +279,15 @@ def CSR_AArch64_TLS_Darwin FP, (sequence "Q%u", 0, 31))>; +// We can only handle a register pair with adjacent registers, the register pair +// should belong to the same class as well. Since the access function on the +// fast path calls a function that follows CSR_AArch64_TLS_Darwin, +// CSR_AArch64_CXX_TLS_Darwin should be a subset of CSR_AArch64_TLS_Darwin. +def CSR_AArch64_CXX_TLS_Darwin + : CalleeSavedRegs<(add CSR_AArch64_AAPCS, + (sub (sequence "X%u", 1, 28), X15, X16, X17, X18), + (sequence "D%u", 0, 31))>; + // The ELF stub used for TLS-descriptor access saves every feasible // register. Only X0 and LR are clobbered. def CSR_AArch64_TLS_ELF diff --git a/lib/Target/AArch64/AArch64FastISel.cpp b/lib/Target/AArch64/AArch64FastISel.cpp index b7849d5bbc26..cae2d5276296 100644 --- a/lib/Target/AArch64/AArch64FastISel.cpp +++ b/lib/Target/AArch64/AArch64FastISel.cpp @@ -2275,7 +2275,6 @@ bool AArch64FastISel::selectBranch(const Instruction *I) { MachineBasicBlock *TBB = FuncInfo.MBBMap[BI->getSuccessor(0)]; MachineBasicBlock *FBB = FuncInfo.MBBMap[BI->getSuccessor(1)]; - AArch64CC::CondCode CC = AArch64CC::NE; if (const CmpInst *CI = dyn_cast(BI->getCondition())) { if (CI->hasOneUse() && isValueAvailable(CI)) { // Try to optimize or fold the cmp. @@ -2307,7 +2306,7 @@ bool AArch64FastISel::selectBranch(const Instruction *I) { // FCMP_UEQ and FCMP_ONE cannot be checked with a single branch // instruction. - CC = getCompareCC(Predicate); + AArch64CC::CondCode CC = getCompareCC(Predicate); AArch64CC::CondCode ExtraCC = AArch64CC::AL; switch (Predicate) { default: @@ -2335,37 +2334,6 @@ bool AArch64FastISel::selectBranch(const Instruction *I) { .addImm(CC) .addMBB(TBB); - finishCondBranch(BI->getParent(), TBB, FBB); - return true; - } - } else if (TruncInst *TI = dyn_cast(BI->getCondition())) { - MVT SrcVT; - if (TI->hasOneUse() && isValueAvailable(TI) && - isTypeSupported(TI->getOperand(0)->getType(), SrcVT)) { - unsigned CondReg = getRegForValue(TI->getOperand(0)); - if (!CondReg) - return false; - bool CondIsKill = hasTrivialKill(TI->getOperand(0)); - - // Issue an extract_subreg to get the lower 32-bits. - if (SrcVT == MVT::i64) { - CondReg = fastEmitInst_extractsubreg(MVT::i32, CondReg, CondIsKill, - AArch64::sub_32); - CondIsKill = true; - } - - unsigned ANDReg = emitAnd_ri(MVT::i32, CondReg, CondIsKill, 1); - assert(ANDReg && "Unexpected AND instruction emission failure."); - emitICmp_ri(MVT::i32, ANDReg, /*IsKill=*/true, 0); - - if (FuncInfo.MBB->isLayoutSuccessor(TBB)) { - std::swap(TBB, FBB); - CC = AArch64CC::EQ; - } - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc)) - .addImm(CC) - .addMBB(TBB); - finishCondBranch(BI->getParent(), TBB, FBB); return true; } @@ -2383,20 +2351,23 @@ bool AArch64FastISel::selectBranch(const Instruction *I) { } else FuncInfo.MBB->addSuccessorWithoutProb(Target); return true; - } else if (foldXALUIntrinsic(CC, I, BI->getCondition())) { - // Fake request the condition, otherwise the intrinsic might be completely - // optimized away. 
- unsigned CondReg = getRegForValue(BI->getCondition()); - if (!CondReg) - return false; + } else { + AArch64CC::CondCode CC = AArch64CC::NE; + if (foldXALUIntrinsic(CC, I, BI->getCondition())) { + // Fake request the condition, otherwise the intrinsic might be completely + // optimized away. + unsigned CondReg = getRegForValue(BI->getCondition()); + if (!CondReg) + return false; - // Emit the branch. - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc)) - .addImm(CC) - .addMBB(TBB); + // Emit the branch. + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc)) + .addImm(CC) + .addMBB(TBB); - finishCondBranch(BI->getParent(), TBB, FBB); - return true; + finishCondBranch(BI->getParent(), TBB, FBB); + return true; + } } unsigned CondReg = getRegForValue(BI->getCondition()); @@ -2404,26 +2375,19 @@ bool AArch64FastISel::selectBranch(const Instruction *I) { return false; bool CondRegIsKill = hasTrivialKill(BI->getCondition()); - // We've been divorced from our compare! Our block was split, and - // now our compare lives in a predecessor block. We musn't - // re-compare here, as the children of the compare aren't guaranteed - // live across the block boundary (we *could* check for this). - // Regardless, the compare has been done in the predecessor block, - // and it left a value for us in a virtual register. Ergo, we test - // the one-bit value left in the virtual register. - // - // FIXME: Optimize this with TBZW/TBZNW. - unsigned ANDReg = emitAnd_ri(MVT::i32, CondReg, CondRegIsKill, 1); - assert(ANDReg && "Unexpected AND instruction emission failure."); - emitICmp_ri(MVT::i32, ANDReg, /*IsKill=*/true, 0); - + // i1 conditions come as i32 values, test the lowest bit with tb(n)z. + unsigned Opcode = AArch64::TBNZW; if (FuncInfo.MBB->isLayoutSuccessor(TBB)) { std::swap(TBB, FBB); - CC = AArch64CC::EQ; + Opcode = AArch64::TBZW; } - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc)) - .addImm(CC) + const MCInstrDesc &II = TII.get(Opcode); + unsigned ConstrainedCondReg + = constrainOperandRegClass(II, CondReg, II.getNumDefs()); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II) + .addReg(ConstrainedCondReg, getKillRegState(CondRegIsKill)) + .addImm(0) .addMBB(TBB); finishCondBranch(BI->getParent(), TBB, FBB); diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index f0fb03451b2a..99b2edb38ef0 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -237,6 +237,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::SDIVREM, MVT::i32, Expand); setOperationAction(ISD::SDIVREM, MVT::i64, Expand); + for (MVT VT : MVT::vector_valuetypes()) { + setOperationAction(ISD::SDIVREM, VT, Expand); + setOperationAction(ISD::UDIVREM, VT, Expand); + } setOperationAction(ISD::SREM, MVT::i32, Expand); setOperationAction(ISD::SREM, MVT::i64, Expand); setOperationAction(ISD::UDIVREM, MVT::i32, Expand); @@ -1846,6 +1850,16 @@ static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) { // in the cost tables. EVT InVT = Op.getOperand(0).getValueType(); EVT VT = Op.getValueType(); + unsigned NumElts = InVT.getVectorNumElements(); + + // f16 vectors are promoted to f32 before a conversion. 
+ if (InVT.getVectorElementType() == MVT::f16) { + MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts); + SDLoc dl(Op); + return DAG.getNode( + Op.getOpcode(), dl, Op.getValueType(), + DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0))); + } if (VT.getSizeInBits() < InVT.getSizeInBits()) { SDLoc dl(Op); @@ -6723,7 +6737,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case Intrinsic::aarch64_neon_ld4r: { Info.opc = ISD::INTRINSIC_W_CHAIN; // Conservatively set memVT to the entire set of vectors loaded. - uint64_t NumElts = DL.getTypeAllocSize(I.getType()) / 8; + uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64; Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1); Info.offset = 0; @@ -6749,7 +6763,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Type *ArgTy = I.getArgOperand(ArgI)->getType(); if (!ArgTy->isVectorTy()) break; - NumElts += DL.getTypeAllocSize(ArgTy) / 8; + NumElts += DL.getTypeSizeInBits(ArgTy) / 64; } Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1); @@ -6992,7 +7006,7 @@ bool AArch64TargetLowering::lowerInterleavedLoad( const DataLayout &DL = LI->getModule()->getDataLayout(); VectorType *VecTy = Shuffles[0]->getType(); - unsigned VecSize = DL.getTypeAllocSizeInBits(VecTy); + unsigned VecSize = DL.getTypeSizeInBits(VecTy); // Skip if we do not have NEON and skip illegal vector types. if (!Subtarget->hasNEON() || (VecSize != 64 && VecSize != 128)) @@ -7078,7 +7092,7 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI, VectorType *SubVecTy = VectorType::get(EltTy, NumSubElts); const DataLayout &DL = SI->getModule()->getDataLayout(); - unsigned SubVecSize = DL.getTypeAllocSizeInBits(SubVecTy); + unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy); // Skip if we do not have NEON and skip illegal vector types. 
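The repeated switch above from getTypeAllocSize()/getTypeAllocSizeInBits() to getTypeSizeInBits() matters for vector types whose store size is smaller than their ABI-padded allocation size. A standalone sketch of the difference; the numbers assume a default (empty) DataLayout and typeSizeExample is an illustrative name.

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"

using namespace llvm;

static void typeSizeExample() {
  LLVMContext Ctx;
  Module M("sizes", Ctx); // default DataLayout
  const DataLayout &DL = M.getDataLayout();

  VectorType *V3i32 = VectorType::get(Type::getInt32Ty(Ctx), 3);
  uint64_t Exact = DL.getTypeSizeInBits(V3i32);       // 96: width of the value
  uint64_t Padded = DL.getTypeAllocSizeInBits(V3i32); // 128: rounded up to the
                                                      // type's ABI alignment
  (void)Exact;
  (void)Padded;
  // The NEON legality checks above want the exact width, so a 96-bit vector
  // does not pass for a 128-bit register.
}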
if (!Subtarget->hasNEON() || (SubVecSize != 64 && SubVecSize != 128)) diff --git a/lib/Target/AArch64/AArch64InstrFormats.td b/lib/Target/AArch64/AArch64InstrFormats.td index 5eef82153e39..6ac2175e5035 100644 --- a/lib/Target/AArch64/AArch64InstrFormats.td +++ b/lib/Target/AArch64/AArch64InstrFormats.td @@ -4315,7 +4315,7 @@ let Predicates = [HasNEON] in { //---------------------------------------------------------------------------- let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDThreeSameVector size, bits<5> opcode, +class BaseSIMDThreeSameVector size, bits<5> opcode, RegisterOperand regtype, string asm, string kind, list pattern> : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm, @@ -4329,8 +4329,7 @@ class BaseSIMDThreeSameVector size, bits<5> opcode, let Inst{30} = Q; let Inst{29} = U; let Inst{28-24} = 0b01110; - let Inst{23-22} = size; - let Inst{21} = 1; + let Inst{23-21} = size; let Inst{20-16} = Rm; let Inst{15-11} = opcode; let Inst{10} = 1; @@ -4339,7 +4338,7 @@ class BaseSIMDThreeSameVector size, bits<5> opcode, } let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDThreeSameVectorTied size, bits<5> opcode, +class BaseSIMDThreeSameVectorTied size, bits<5> opcode, RegisterOperand regtype, string asm, string kind, list pattern> : I<(outs regtype:$dst), (ins regtype:$Rd, regtype:$Rn, regtype:$Rm), asm, @@ -4353,8 +4352,7 @@ class BaseSIMDThreeSameVectorTied size, bits<5> opcode, let Inst{30} = Q; let Inst{29} = U; let Inst{28-24} = 0b01110; - let Inst{23-22} = size; - let Inst{21} = 1; + let Inst{23-21} = size; let Inst{20-16} = Rm; let Inst{15-11} = opcode; let Inst{10} = 1; @@ -4365,25 +4363,25 @@ class BaseSIMDThreeSameVectorTied size, bits<5> opcode, // All operand sizes distinguished in the encoding. multiclass SIMDThreeSameVector opc, string asm, SDPatternOperator OpNode> { - def v8i8 : BaseSIMDThreeSameVector<0, U, 0b00, opc, V64, + def v8i8 : BaseSIMDThreeSameVector<0, U, 0b001, opc, V64, asm, ".8b", [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>; - def v16i8 : BaseSIMDThreeSameVector<1, U, 0b00, opc, V128, + def v16i8 : BaseSIMDThreeSameVector<1, U, 0b001, opc, V128, asm, ".16b", [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>; - def v4i16 : BaseSIMDThreeSameVector<0, U, 0b01, opc, V64, + def v4i16 : BaseSIMDThreeSameVector<0, U, 0b011, opc, V64, asm, ".4h", [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>; - def v8i16 : BaseSIMDThreeSameVector<1, U, 0b01, opc, V128, + def v8i16 : BaseSIMDThreeSameVector<1, U, 0b011, opc, V128, asm, ".8h", [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>; - def v2i32 : BaseSIMDThreeSameVector<0, U, 0b10, opc, V64, + def v2i32 : BaseSIMDThreeSameVector<0, U, 0b101, opc, V64, asm, ".2s", [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>; - def v4i32 : BaseSIMDThreeSameVector<1, U, 0b10, opc, V128, + def v4i32 : BaseSIMDThreeSameVector<1, U, 0b101, opc, V128, asm, ".4s", [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>; - def v2i64 : BaseSIMDThreeSameVector<1, U, 0b11, opc, V128, + def v2i64 : BaseSIMDThreeSameVector<1, U, 0b111, opc, V128, asm, ".2d", [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn), (v2i64 V128:$Rm)))]>; } @@ -4391,49 +4389,49 @@ multiclass SIMDThreeSameVector opc, string asm, // As above, but D sized elements unsupported. 
multiclass SIMDThreeSameVectorBHS opc, string asm, SDPatternOperator OpNode> { - def v8i8 : BaseSIMDThreeSameVector<0, U, 0b00, opc, V64, + def v8i8 : BaseSIMDThreeSameVector<0, U, 0b001, opc, V64, asm, ".8b", [(set V64:$Rd, (v8i8 (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm))))]>; - def v16i8 : BaseSIMDThreeSameVector<1, U, 0b00, opc, V128, + def v16i8 : BaseSIMDThreeSameVector<1, U, 0b001, opc, V128, asm, ".16b", [(set V128:$Rd, (v16i8 (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm))))]>; - def v4i16 : BaseSIMDThreeSameVector<0, U, 0b01, opc, V64, + def v4i16 : BaseSIMDThreeSameVector<0, U, 0b011, opc, V64, asm, ".4h", [(set V64:$Rd, (v4i16 (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm))))]>; - def v8i16 : BaseSIMDThreeSameVector<1, U, 0b01, opc, V128, + def v8i16 : BaseSIMDThreeSameVector<1, U, 0b011, opc, V128, asm, ".8h", [(set V128:$Rd, (v8i16 (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm))))]>; - def v2i32 : BaseSIMDThreeSameVector<0, U, 0b10, opc, V64, + def v2i32 : BaseSIMDThreeSameVector<0, U, 0b101, opc, V64, asm, ".2s", [(set V64:$Rd, (v2i32 (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm))))]>; - def v4i32 : BaseSIMDThreeSameVector<1, U, 0b10, opc, V128, + def v4i32 : BaseSIMDThreeSameVector<1, U, 0b101, opc, V128, asm, ".4s", [(set V128:$Rd, (v4i32 (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm))))]>; } multiclass SIMDThreeSameVectorBHSTied opc, string asm, SDPatternOperator OpNode> { - def v8i8 : BaseSIMDThreeSameVectorTied<0, U, 0b00, opc, V64, + def v8i8 : BaseSIMDThreeSameVectorTied<0, U, 0b001, opc, V64, asm, ".8b", [(set (v8i8 V64:$dst), (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>; - def v16i8 : BaseSIMDThreeSameVectorTied<1, U, 0b00, opc, V128, + def v16i8 : BaseSIMDThreeSameVectorTied<1, U, 0b001, opc, V128, asm, ".16b", [(set (v16i8 V128:$dst), (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>; - def v4i16 : BaseSIMDThreeSameVectorTied<0, U, 0b01, opc, V64, + def v4i16 : BaseSIMDThreeSameVectorTied<0, U, 0b011, opc, V64, asm, ".4h", [(set (v4i16 V64:$dst), (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>; - def v8i16 : BaseSIMDThreeSameVectorTied<1, U, 0b01, opc, V128, + def v8i16 : BaseSIMDThreeSameVectorTied<1, U, 0b011, opc, V128, asm, ".8h", [(set (v8i16 V128:$dst), (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>; - def v2i32 : BaseSIMDThreeSameVectorTied<0, U, 0b10, opc, V64, + def v2i32 : BaseSIMDThreeSameVectorTied<0, U, 0b101, opc, V64, asm, ".2s", [(set (v2i32 V64:$dst), (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>; - def v4i32 : BaseSIMDThreeSameVectorTied<1, U, 0b10, opc, V128, + def v4i32 : BaseSIMDThreeSameVectorTied<1, U, 0b101, opc, V128, asm, ".4s", [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>; @@ -4442,54 +4440,80 @@ multiclass SIMDThreeSameVectorBHSTied opc, string asm, // As above, but only B sized elements supported. multiclass SIMDThreeSameVectorB opc, string asm, SDPatternOperator OpNode> { - def v8i8 : BaseSIMDThreeSameVector<0, U, 0b00, opc, V64, + def v8i8 : BaseSIMDThreeSameVector<0, U, 0b001, opc, V64, asm, ".8b", [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>; - def v16i8 : BaseSIMDThreeSameVector<1, U, 0b00, opc, V128, + def v16i8 : BaseSIMDThreeSameVector<1, U, 0b001, opc, V128, asm, ".16b", [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>; } -// As above, but only S and D sized floating point elements supported. -multiclass SIMDThreeSameVectorFP opc, +// As above, but only floating point elements supported. 
+multiclass SIMDThreeSameVectorFP opc, string asm, SDPatternOperator OpNode> { - def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0}, opc, V64, + let Predicates = [HasNEON, HasFullFP16] in { + def v4f16 : BaseSIMDThreeSameVector<0, U, {S,0b10}, {0b00,opc}, V64, + asm, ".4h", + [(set (v4f16 V64:$Rd), (OpNode (v4f16 V64:$Rn), (v4f16 V64:$Rm)))]>; + def v8f16 : BaseSIMDThreeSameVector<1, U, {S,0b10}, {0b00,opc}, V128, + asm, ".8h", + [(set (v8f16 V128:$Rd), (OpNode (v8f16 V128:$Rn), (v8f16 V128:$Rm)))]>; + } // Predicates = [HasNEON, HasFullFP16] + def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0b01}, {0b11,opc}, V64, asm, ".2s", [(set (v2f32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>; - def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0}, opc, V128, + def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0b01}, {0b11,opc}, V128, asm, ".4s", [(set (v4f32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>; - def v2f64 : BaseSIMDThreeSameVector<1, U, {S,1}, opc, V128, + def v2f64 : BaseSIMDThreeSameVector<1, U, {S,0b11}, {0b11,opc}, V128, asm, ".2d", [(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>; } -multiclass SIMDThreeSameVectorFPCmp opc, +multiclass SIMDThreeSameVectorFPCmp opc, string asm, SDPatternOperator OpNode> { - def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0}, opc, V64, + let Predicates = [HasNEON, HasFullFP16] in { + def v4f16 : BaseSIMDThreeSameVector<0, U, {S,0b10}, {0b00,opc}, V64, + asm, ".4h", + [(set (v4i16 V64:$Rd), (OpNode (v4f16 V64:$Rn), (v4f16 V64:$Rm)))]>; + def v8f16 : BaseSIMDThreeSameVector<1, U, {S,0b10}, {0b00,opc}, V128, + asm, ".8h", + [(set (v8i16 V128:$Rd), (OpNode (v8f16 V128:$Rn), (v8f16 V128:$Rm)))]>; + } // Predicates = [HasNEON, HasFullFP16] + def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0b01}, {0b11,opc}, V64, asm, ".2s", [(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>; - def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0}, opc, V128, + def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0b01}, {0b11,opc}, V128, asm, ".4s", [(set (v4i32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>; - def v2f64 : BaseSIMDThreeSameVector<1, U, {S,1}, opc, V128, + def v2f64 : BaseSIMDThreeSameVector<1, U, {S,0b11}, {0b11,opc}, V128, asm, ".2d", [(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>; } -multiclass SIMDThreeSameVectorFPTied opc, +multiclass SIMDThreeSameVectorFPTied opc, string asm, SDPatternOperator OpNode> { - def v2f32 : BaseSIMDThreeSameVectorTied<0, U, {S,0}, opc, V64, + let Predicates = [HasNEON, HasFullFP16] in { + def v4f16 : BaseSIMDThreeSameVectorTied<0, U, {S,0b10}, {0b00,opc}, V64, + asm, ".4h", + [(set (v4f16 V64:$dst), + (OpNode (v4f16 V64:$Rd), (v4f16 V64:$Rn), (v4f16 V64:$Rm)))]>; + def v8f16 : BaseSIMDThreeSameVectorTied<1, U, {S,0b10}, {0b00,opc}, V128, + asm, ".8h", + [(set (v8f16 V128:$dst), + (OpNode (v8f16 V128:$Rd), (v8f16 V128:$Rn), (v8f16 V128:$Rm)))]>; + } // Predicates = [HasNEON, HasFullFP16] + def v2f32 : BaseSIMDThreeSameVectorTied<0, U, {S,0b01}, {0b11,opc}, V64, asm, ".2s", [(set (v2f32 V64:$dst), (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>; - def v4f32 : BaseSIMDThreeSameVectorTied<1, U, {S,0}, opc, V128, + def v4f32 : BaseSIMDThreeSameVectorTied<1, U, {S,0b01}, {0b11,opc}, V128, asm, ".4s", [(set (v4f32 V128:$dst), (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>; - def v2f64 : BaseSIMDThreeSameVectorTied<1, U, {S,1}, opc, V128, + def v2f64 : BaseSIMDThreeSameVectorTied<1, U, {S,0b11}, {0b11,opc}, V128, asm, ".2d", [(set (v2f64 
V128:$dst), (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>; @@ -4498,16 +4522,16 @@ multiclass SIMDThreeSameVectorFPTied opc, // As above, but D and B sized elements unsupported. multiclass SIMDThreeSameVectorHS opc, string asm, SDPatternOperator OpNode> { - def v4i16 : BaseSIMDThreeSameVector<0, U, 0b01, opc, V64, + def v4i16 : BaseSIMDThreeSameVector<0, U, 0b011, opc, V64, asm, ".4h", [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>; - def v8i16 : BaseSIMDThreeSameVector<1, U, 0b01, opc, V128, + def v8i16 : BaseSIMDThreeSameVector<1, U, 0b011, opc, V128, asm, ".8h", [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>; - def v2i32 : BaseSIMDThreeSameVector<0, U, 0b10, opc, V64, + def v2i32 : BaseSIMDThreeSameVector<0, U, 0b101, opc, V64, asm, ".2s", [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>; - def v4i32 : BaseSIMDThreeSameVector<1, U, 0b10, opc, V128, + def v4i32 : BaseSIMDThreeSameVector<1, U, 0b101, opc, V128, asm, ".4s", [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>; } @@ -4515,10 +4539,10 @@ multiclass SIMDThreeSameVectorHS opc, string asm, // Logical three vector ops share opcode bits, and only use B sized elements. multiclass SIMDLogicalThreeVector size, string asm, SDPatternOperator OpNode = null_frag> { - def v8i8 : BaseSIMDThreeSameVector<0, U, size, 0b00011, V64, + def v8i8 : BaseSIMDThreeSameVector<0, U, {size,1}, 0b00011, V64, asm, ".8b", [(set (v8i8 V64:$Rd), (OpNode V64:$Rn, V64:$Rm))]>; - def v16i8 : BaseSIMDThreeSameVector<1, U, size, 0b00011, V128, + def v16i8 : BaseSIMDThreeSameVector<1, U, {size,1}, 0b00011, V128, asm, ".16b", [(set (v16i8 V128:$Rd), (OpNode V128:$Rn, V128:$Rm))]>; @@ -4539,11 +4563,11 @@ multiclass SIMDLogicalThreeVector size, string asm, multiclass SIMDLogicalThreeVectorTied size, string asm, SDPatternOperator OpNode> { - def v8i8 : BaseSIMDThreeSameVectorTied<0, U, size, 0b00011, V64, + def v8i8 : BaseSIMDThreeSameVectorTied<0, U, {size,1}, 0b00011, V64, asm, ".8b", [(set (v8i8 V64:$dst), (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>; - def v16i8 : BaseSIMDThreeSameVectorTied<1, U, size, 0b00011, V128, + def v16i8 : BaseSIMDThreeSameVectorTied<1, U, {size,1}, 0b00011, V128, asm, ".16b", [(set (v16i8 V128:$dst), (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn), @@ -4583,8 +4607,8 @@ multiclass SIMDLogicalThreeVectorTied size, let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in class BaseSIMDTwoSameVector size, bits<5> opcode, - RegisterOperand regtype, string asm, string dstkind, - string srckind, list pattern> + bits<2> size2, RegisterOperand regtype, string asm, + string dstkind, string srckind, list pattern> : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, "{\t$Rd" # dstkind # ", $Rn" # srckind # "|" # dstkind # "\t$Rd, $Rn}", "", pattern>, @@ -4596,7 +4620,9 @@ class BaseSIMDTwoSameVector size, bits<5> opcode, let Inst{29} = U; let Inst{28-24} = 0b01110; let Inst{23-22} = size; - let Inst{21-17} = 0b10000; + let Inst{21} = 0b1; + let Inst{20-19} = size2; + let Inst{18-17} = 0b00; let Inst{16-12} = opcode; let Inst{11-10} = 0b10; let Inst{9-5} = Rn; @@ -4605,8 +4631,9 @@ class BaseSIMDTwoSameVector size, bits<5> opcode, let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in class BaseSIMDTwoSameVectorTied size, bits<5> opcode, - RegisterOperand regtype, string asm, string dstkind, - string srckind, list pattern> + bits<2> size2, RegisterOperand regtype, + string asm, string dstkind, string srckind, + list pattern> : I<(outs regtype:$dst), 
(ins regtype:$Rd, regtype:$Rn), asm, "{\t$Rd" # dstkind # ", $Rn" # srckind # "|" # dstkind # "\t$Rd, $Rn}", "$Rd = $dst", pattern>, @@ -4618,7 +4645,9 @@ class BaseSIMDTwoSameVectorTied size, bits<5> opcode, let Inst{29} = U; let Inst{28-24} = 0b01110; let Inst{23-22} = size; - let Inst{21-17} = 0b10000; + let Inst{21} = 0b1; + let Inst{20-19} = size2; + let Inst{18-17} = 0b00; let Inst{16-12} = opcode; let Inst{11-10} = 0b10; let Inst{9-5} = Rn; @@ -4628,22 +4657,22 @@ class BaseSIMDTwoSameVectorTied size, bits<5> opcode, // Supports B, H, and S element sizes. multiclass SIMDTwoVectorBHS opc, string asm, SDPatternOperator OpNode> { - def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64, + def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, 0b00, V64, asm, ".8b", ".8b", [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>; - def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128, + def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, 0b00, V128, asm, ".16b", ".16b", [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>; - def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64, + def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, 0b00, V64, asm, ".4h", ".4h", [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>; - def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128, + def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, 0b00, V128, asm, ".8h", ".8h", [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>; - def v2i32 : BaseSIMDTwoSameVector<0, U, 0b10, opc, V64, + def v2i32 : BaseSIMDTwoSameVector<0, U, 0b10, opc, 0b00, V64, asm, ".2s", ".2s", [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; - def v4i32 : BaseSIMDTwoSameVector<1, U, 0b10, opc, V128, + def v4i32 : BaseSIMDTwoSameVector<1, U, 0b10, opc, 0b00, V128, asm, ".4s", ".4s", [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; } @@ -4686,49 +4715,49 @@ multiclass SIMDVectorLShiftLongBySizeBHS { // Supports all element sizes. 
multiclass SIMDLongTwoVector opc, string asm, SDPatternOperator OpNode> { - def v8i8_v4i16 : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64, + def v8i8_v4i16 : BaseSIMDTwoSameVector<0, U, 0b00, opc, 0b00, V64, asm, ".4h", ".8b", [(set (v4i16 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>; - def v16i8_v8i16 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128, + def v16i8_v8i16 : BaseSIMDTwoSameVector<1, U, 0b00, opc, 0b00, V128, asm, ".8h", ".16b", [(set (v8i16 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>; - def v4i16_v2i32 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64, + def v4i16_v2i32 : BaseSIMDTwoSameVector<0, U, 0b01, opc, 0b00, V64, asm, ".2s", ".4h", [(set (v2i32 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>; - def v8i16_v4i32 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128, + def v8i16_v4i32 : BaseSIMDTwoSameVector<1, U, 0b01, opc, 0b00, V128, asm, ".4s", ".8h", [(set (v4i32 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>; - def v2i32_v1i64 : BaseSIMDTwoSameVector<0, U, 0b10, opc, V64, + def v2i32_v1i64 : BaseSIMDTwoSameVector<0, U, 0b10, opc, 0b00, V64, asm, ".1d", ".2s", [(set (v1i64 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; - def v4i32_v2i64 : BaseSIMDTwoSameVector<1, U, 0b10, opc, V128, + def v4i32_v2i64 : BaseSIMDTwoSameVector<1, U, 0b10, opc, 0b00, V128, asm, ".2d", ".4s", [(set (v2i64 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; } multiclass SIMDLongTwoVectorTied opc, string asm, SDPatternOperator OpNode> { - def v8i8_v4i16 : BaseSIMDTwoSameVectorTied<0, U, 0b00, opc, V64, + def v8i8_v4i16 : BaseSIMDTwoSameVectorTied<0, U, 0b00, opc, 0b00, V64, asm, ".4h", ".8b", [(set (v4i16 V64:$dst), (OpNode (v4i16 V64:$Rd), (v8i8 V64:$Rn)))]>; - def v16i8_v8i16 : BaseSIMDTwoSameVectorTied<1, U, 0b00, opc, V128, + def v16i8_v8i16 : BaseSIMDTwoSameVectorTied<1, U, 0b00, opc, 0b00, V128, asm, ".8h", ".16b", [(set (v8i16 V128:$dst), (OpNode (v8i16 V128:$Rd), (v16i8 V128:$Rn)))]>; - def v4i16_v2i32 : BaseSIMDTwoSameVectorTied<0, U, 0b01, opc, V64, + def v4i16_v2i32 : BaseSIMDTwoSameVectorTied<0, U, 0b01, opc, 0b00, V64, asm, ".2s", ".4h", [(set (v2i32 V64:$dst), (OpNode (v2i32 V64:$Rd), (v4i16 V64:$Rn)))]>; - def v8i16_v4i32 : BaseSIMDTwoSameVectorTied<1, U, 0b01, opc, V128, + def v8i16_v4i32 : BaseSIMDTwoSameVectorTied<1, U, 0b01, opc, 0b00, V128, asm, ".4s", ".8h", [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd), (v8i16 V128:$Rn)))]>; - def v2i32_v1i64 : BaseSIMDTwoSameVectorTied<0, U, 0b10, opc, V64, + def v2i32_v1i64 : BaseSIMDTwoSameVectorTied<0, U, 0b10, opc, 0b00, V64, asm, ".1d", ".2s", [(set (v1i64 V64:$dst), (OpNode (v1i64 V64:$Rd), (v2i32 V64:$Rn)))]>; - def v4i32_v2i64 : BaseSIMDTwoSameVectorTied<1, U, 0b10, opc, V128, + def v4i32_v2i64 : BaseSIMDTwoSameVectorTied<1, U, 0b10, opc, 0b00, V128, asm, ".2d", ".4s", [(set (v2i64 V128:$dst), (OpNode (v2i64 V128:$Rd), (v4i32 V128:$Rn)))]>; @@ -4737,50 +4766,50 @@ multiclass SIMDLongTwoVectorTied opc, string asm, // Supports all element sizes, except 1xD. 
multiclass SIMDTwoVectorBHSDTied opc, string asm, SDPatternOperator OpNode> { - def v8i8 : BaseSIMDTwoSameVectorTied<0, U, 0b00, opc, V64, + def v8i8 : BaseSIMDTwoSameVectorTied<0, U, 0b00, opc, 0b00, V64, asm, ".8b", ".8b", [(set (v8i8 V64:$dst), (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn)))]>; - def v16i8 : BaseSIMDTwoSameVectorTied<1, U, 0b00, opc, V128, + def v16i8 : BaseSIMDTwoSameVectorTied<1, U, 0b00, opc, 0b00, V128, asm, ".16b", ".16b", [(set (v16i8 V128:$dst), (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn)))]>; - def v4i16 : BaseSIMDTwoSameVectorTied<0, U, 0b01, opc, V64, + def v4i16 : BaseSIMDTwoSameVectorTied<0, U, 0b01, opc, 0b00, V64, asm, ".4h", ".4h", [(set (v4i16 V64:$dst), (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn)))]>; - def v8i16 : BaseSIMDTwoSameVectorTied<1, U, 0b01, opc, V128, + def v8i16 : BaseSIMDTwoSameVectorTied<1, U, 0b01, opc, 0b00, V128, asm, ".8h", ".8h", [(set (v8i16 V128:$dst), (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn)))]>; - def v2i32 : BaseSIMDTwoSameVectorTied<0, U, 0b10, opc, V64, + def v2i32 : BaseSIMDTwoSameVectorTied<0, U, 0b10, opc, 0b00, V64, asm, ".2s", ".2s", [(set (v2i32 V64:$dst), (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn)))]>; - def v4i32 : BaseSIMDTwoSameVectorTied<1, U, 0b10, opc, V128, + def v4i32 : BaseSIMDTwoSameVectorTied<1, U, 0b10, opc, 0b00, V128, asm, ".4s", ".4s", [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn)))]>; - def v2i64 : BaseSIMDTwoSameVectorTied<1, U, 0b11, opc, V128, + def v2i64 : BaseSIMDTwoSameVectorTied<1, U, 0b11, opc, 0b00, V128, asm, ".2d", ".2d", [(set (v2i64 V128:$dst), (OpNode (v2i64 V128:$Rd), (v2i64 V128:$Rn)))]>; } multiclass SIMDTwoVectorBHSD opc, string asm, SDPatternOperator OpNode = null_frag> { - def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64, + def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, 0b00, V64, asm, ".8b", ".8b", [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>; - def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128, + def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, 0b00, V128, asm, ".16b", ".16b", [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>; - def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64, + def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, 0b00, V64, asm, ".4h", ".4h", [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>; - def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128, + def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, 0b00, V128, asm, ".8h", ".8h", [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>; - def v2i32 : BaseSIMDTwoSameVector<0, U, 0b10, opc, V64, + def v2i32 : BaseSIMDTwoSameVector<0, U, 0b10, opc, 0b00, V64, asm, ".2s", ".2s", [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; - def v4i32 : BaseSIMDTwoSameVector<1, U, 0b10, opc, V128, + def v4i32 : BaseSIMDTwoSameVector<1, U, 0b10, opc, 0b00, V128, asm, ".4s", ".4s", [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; - def v2i64 : BaseSIMDTwoSameVector<1, U, 0b11, opc, V128, + def v2i64 : BaseSIMDTwoSameVector<1, U, 0b11, opc, 0b00, V128, asm, ".2d", ".2d", [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn)))]>; } @@ -4789,10 +4818,10 @@ multiclass SIMDTwoVectorBHSD opc, string asm, // Supports only B element sizes. 
multiclass SIMDTwoVectorB size, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v8i8 : BaseSIMDTwoSameVector<0, U, size, opc, V64, + def v8i8 : BaseSIMDTwoSameVector<0, U, size, opc, 0b00, V64, asm, ".8b", ".8b", [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>; - def v16i8 : BaseSIMDTwoSameVector<1, U, size, opc, V128, + def v16i8 : BaseSIMDTwoSameVector<1, U, size, opc, 0b00, V128, asm, ".16b", ".16b", [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>; @@ -4801,16 +4830,16 @@ multiclass SIMDTwoVectorB size, bits<5> opc, string asm, // Supports only B and H element sizes. multiclass SIMDTwoVectorBH opc, string asm, SDPatternOperator OpNode> { - def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64, + def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, 0b00, V64, asm, ".8b", ".8b", [(set (v8i8 V64:$Rd), (OpNode V64:$Rn))]>; - def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128, + def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, 0b00, V128, asm, ".16b", ".16b", [(set (v16i8 V128:$Rd), (OpNode V128:$Rn))]>; - def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64, + def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, 0b00, V64, asm, ".4h", ".4h", [(set (v4i16 V64:$Rd), (OpNode V64:$Rn))]>; - def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128, + def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, 0b00, V128, asm, ".8h", ".8h", [(set (v8i16 V128:$Rd), (OpNode V128:$Rn))]>; } @@ -4819,13 +4848,21 @@ multiclass SIMDTwoVectorBH opc, string asm, // as an extra opcode bit. multiclass SIMDTwoVectorFP opc, string asm, SDPatternOperator OpNode> { - def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64, + let Predicates = [HasNEON, HasFullFP16] in { + def v4f16 : BaseSIMDTwoSameVector<0, U, {S,1}, opc, 0b11, V64, + asm, ".4h", ".4h", + [(set (v4f16 V64:$Rd), (OpNode (v4f16 V64:$Rn)))]>; + def v8f16 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b11, V128, + asm, ".8h", ".8h", + [(set (v8f16 V128:$Rd), (OpNode (v8f16 V128:$Rn)))]>; + } // Predicates = [HasNEON, HasFullFP16] + def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, 0b00, V64, asm, ".2s", ".2s", [(set (v2f32 V64:$Rd), (OpNode (v2f32 V64:$Rn)))]>; - def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128, + def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, 0b00, V128, asm, ".4s", ".4s", [(set (v4f32 V128:$Rd), (OpNode (v4f32 V128:$Rn)))]>; - def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, V128, + def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b00, V128, asm, ".2d", ".2d", [(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn)))]>; } @@ -4833,10 +4870,10 @@ multiclass SIMDTwoVectorFP opc, string asm, // Supports only S element size. 
multiclass SIMDTwoVectorS opc, string asm, SDPatternOperator OpNode> { - def v2i32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64, + def v2i32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, 0b00, V64, asm, ".2s", ".2s", [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; - def v4i32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128, + def v4i32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, 0b00, V128, asm, ".4s", ".4s", [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; } @@ -4844,26 +4881,42 @@ multiclass SIMDTwoVectorS opc, string asm, multiclass SIMDTwoVectorFPToInt opc, string asm, SDPatternOperator OpNode> { - def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64, + let Predicates = [HasNEON, HasFullFP16] in { + def v4f16 : BaseSIMDTwoSameVector<0, U, {S,1}, opc, 0b11, V64, + asm, ".4h", ".4h", + [(set (v4i16 V64:$Rd), (OpNode (v4f16 V64:$Rn)))]>; + def v8f16 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b11, V128, + asm, ".8h", ".8h", + [(set (v8i16 V128:$Rd), (OpNode (v8f16 V128:$Rn)))]>; + } // Predicates = [HasNEON, HasFullFP16] + def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, 0b00, V64, asm, ".2s", ".2s", [(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn)))]>; - def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128, + def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, 0b00, V128, asm, ".4s", ".4s", [(set (v4i32 V128:$Rd), (OpNode (v4f32 V128:$Rn)))]>; - def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, V128, + def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b00, V128, asm, ".2d", ".2d", [(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn)))]>; } multiclass SIMDTwoVectorIntToFP opc, string asm, SDPatternOperator OpNode> { - def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64, + let Predicates = [HasNEON, HasFullFP16] in { + def v4f16 : BaseSIMDTwoSameVector<0, U, {S,1}, opc, 0b11, V64, + asm, ".4h", ".4h", + [(set (v4f16 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>; + def v8f16 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b11, V128, + asm, ".8h", ".8h", + [(set (v8f16 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>; + } // Predicates = [HasNEON, HasFullFP16] + def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, 0b00, V64, asm, ".2s", ".2s", [(set (v2f32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; - def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128, + def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, 0b00, V128, asm, ".4s", ".4s", [(set (v4f32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; - def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, V128, + def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b00, V128, asm, ".2d", ".2d", [(set (v2f64 V128:$Rd), (OpNode (v2i64 V128:$Rn)))]>; } @@ -4942,10 +4995,10 @@ multiclass SIMDMixedTwoVector opc, string asm, (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; } -class BaseSIMDCmpTwoVector size, bits<5> opcode, - RegisterOperand regtype, - string asm, string kind, string zero, - ValueType dty, ValueType sty, SDNode OpNode> +class BaseSIMDCmpTwoVector size, bits<2> size2, + bits<5> opcode, RegisterOperand regtype, string asm, + string kind, string zero, ValueType dty, + ValueType sty, SDNode OpNode> : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, "{\t$Rd" # kind # ", $Rn" # kind # ", #" # zero # "|" # kind # "\t$Rd, $Rn, #" # zero # "}", "", @@ -4958,7 +5011,9 @@ class BaseSIMDCmpTwoVector size, bits<5> opcode, let Inst{29} = U; let Inst{28-24} = 0b01110; let Inst{23-22} = size; - let Inst{21-17} = 0b10000; + let Inst{21} = 0b1; + let Inst{20-19} = size2; + let Inst{18-17} = 0b00; let Inst{16-12} = opcode; let Inst{11-10} = 0b10; let Inst{9-5} = 
Rn; @@ -4968,49 +5023,69 @@ class BaseSIMDCmpTwoVector size, bits<5> opcode, // Comparisons support all element sizes, except 1xD. multiclass SIMDCmpTwoVector opc, string asm, SDNode OpNode> { - def v8i8rz : BaseSIMDCmpTwoVector<0, U, 0b00, opc, V64, + def v8i8rz : BaseSIMDCmpTwoVector<0, U, 0b00, 0b00, opc, V64, asm, ".8b", "0", v8i8, v8i8, OpNode>; - def v16i8rz : BaseSIMDCmpTwoVector<1, U, 0b00, opc, V128, + def v16i8rz : BaseSIMDCmpTwoVector<1, U, 0b00, 0b00, opc, V128, asm, ".16b", "0", v16i8, v16i8, OpNode>; - def v4i16rz : BaseSIMDCmpTwoVector<0, U, 0b01, opc, V64, + def v4i16rz : BaseSIMDCmpTwoVector<0, U, 0b01, 0b00, opc, V64, asm, ".4h", "0", v4i16, v4i16, OpNode>; - def v8i16rz : BaseSIMDCmpTwoVector<1, U, 0b01, opc, V128, + def v8i16rz : BaseSIMDCmpTwoVector<1, U, 0b01, 0b00, opc, V128, asm, ".8h", "0", v8i16, v8i16, OpNode>; - def v2i32rz : BaseSIMDCmpTwoVector<0, U, 0b10, opc, V64, + def v2i32rz : BaseSIMDCmpTwoVector<0, U, 0b10, 0b00, opc, V64, asm, ".2s", "0", v2i32, v2i32, OpNode>; - def v4i32rz : BaseSIMDCmpTwoVector<1, U, 0b10, opc, V128, + def v4i32rz : BaseSIMDCmpTwoVector<1, U, 0b10, 0b00, opc, V128, asm, ".4s", "0", v4i32, v4i32, OpNode>; - def v2i64rz : BaseSIMDCmpTwoVector<1, U, 0b11, opc, V128, + def v2i64rz : BaseSIMDCmpTwoVector<1, U, 0b11, 0b00, opc, V128, asm, ".2d", "0", v2i64, v2i64, OpNode>; } -// FP Comparisons support only S and D element sizes. +// FP Comparisons support only S and D element sizes (and H for v8.2a). multiclass SIMDFPCmpTwoVector opc, string asm, SDNode OpNode> { - def v2i32rz : BaseSIMDCmpTwoVector<0, U, {S,0}, opc, V64, + let Predicates = [HasNEON, HasFullFP16] in { + def v4i16rz : BaseSIMDCmpTwoVector<0, U, {S,1}, 0b11, opc, V64, + asm, ".4h", "0.0", + v4i16, v4f16, OpNode>; + def v8i16rz : BaseSIMDCmpTwoVector<1, U, {S,1}, 0b11, opc, V128, + asm, ".8h", "0.0", + v8i16, v8f16, OpNode>; + } // Predicates = [HasNEON, HasFullFP16] + def v2i32rz : BaseSIMDCmpTwoVector<0, U, {S,0}, 0b00, opc, V64, asm, ".2s", "0.0", v2i32, v2f32, OpNode>; - def v4i32rz : BaseSIMDCmpTwoVector<1, U, {S,0}, opc, V128, + def v4i32rz : BaseSIMDCmpTwoVector<1, U, {S,0}, 0b00, opc, V128, asm, ".4s", "0.0", v4i32, v4f32, OpNode>; - def v2i64rz : BaseSIMDCmpTwoVector<1, U, {S,1}, opc, V128, + def v2i64rz : BaseSIMDCmpTwoVector<1, U, {S,1}, 0b00, opc, V128, asm, ".2d", "0.0", v2i64, v2f64, OpNode>; + let Predicates = [HasNEON, HasFullFP16] in { + def : InstAlias(NAME # v4i16rz) V64:$Vd, V64:$Vn), 0>; + def : InstAlias(NAME # v8i16rz) V128:$Vd, V128:$Vn), 0>; + } def : InstAlias(NAME # v2i32rz) V64:$Vd, V64:$Vn), 0>; def : InstAlias(NAME # v4i32rz) V128:$Vd, V128:$Vn), 0>; def : InstAlias(NAME # v2i64rz) V128:$Vd, V128:$Vn), 0>; + let Predicates = [HasNEON, HasFullFP16] in { + def : InstAlias(NAME # v4i16rz) V64:$Vd, V64:$Vn), 0>; + def : InstAlias(NAME # v8i16rz) V128:$Vd, V128:$Vn), 0>; + } def : InstAlias(NAME # v2i32rz) V64:$Vd, V64:$Vn), 0>; def : InstAliasopc, string asm, //---------------------------------------------------------------------------- let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in -class BaseSIMDThreeScalar size, bits<5> opcode, +class BaseSIMDThreeScalar size, bits<5> opcode, RegisterClass regtype, string asm, list pattern> : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm, @@ -5573,8 +5648,7 @@ class BaseSIMDThreeScalar size, bits<5> opcode, let Inst{31-30} = 0b01; let Inst{29} = U; let Inst{28-24} = 0b11110; - let Inst{23-22} = size; - let Inst{21} = 1; + let Inst{23-21} = size; let Inst{20-16} = Rm; let Inst{15-11} = opcode; 
let Inst{10} = 1; @@ -5605,17 +5679,17 @@ class BaseSIMDThreeScalarTied size, bit R, bits<5> opcode, multiclass SIMDThreeScalarD opc, string asm, SDPatternOperator OpNode> { - def v1i64 : BaseSIMDThreeScalar; } multiclass SIMDThreeScalarBHSD opc, string asm, SDPatternOperator OpNode> { - def v1i64 : BaseSIMDThreeScalar; - def v1i32 : BaseSIMDThreeScalar; - def v1i16 : BaseSIMDThreeScalar; - def v1i8 : BaseSIMDThreeScalar; + def v1i32 : BaseSIMDThreeScalar; + def v1i16 : BaseSIMDThreeScalar; + def v1i8 : BaseSIMDThreeScalar; def : Pat<(i64 (OpNode (i64 FPR64:$Rn), (i64 FPR64:$Rm))), (!cast(NAME#"v1i64") FPR64:$Rn, FPR64:$Rm)>; @@ -5625,9 +5699,9 @@ multiclass SIMDThreeScalarBHSD opc, string asm, multiclass SIMDThreeScalarHS opc, string asm, SDPatternOperator OpNode> { - def v1i32 : BaseSIMDThreeScalar; - def v1i16 : BaseSIMDThreeScalar; + def v1i16 : BaseSIMDThreeScalar; } multiclass SIMDThreeScalarHSTied opc, string asm, @@ -5640,26 +5714,34 @@ multiclass SIMDThreeScalarHSTied opc, string asm, asm, []>; } -multiclass SIMDThreeScalarSD opc, string asm, +multiclass SIMDFPThreeScalar opc, string asm, SDPatternOperator OpNode = null_frag> { let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { - def #NAME#64 : BaseSIMDThreeScalar; - def #NAME#32 : BaseSIMDThreeScalar; + let Predicates = [HasNEON, HasFullFP16] in { + def #NAME#16 : BaseSIMDThreeScalar; + } // Predicates = [HasNEON, HasFullFP16] } def : Pat<(v1f64 (OpNode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), (!cast(NAME # "64") FPR64:$Rn, FPR64:$Rm)>; } -multiclass SIMDThreeScalarFPCmp opc, string asm, +multiclass SIMDThreeScalarFPCmp opc, string asm, SDPatternOperator OpNode = null_frag> { let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { - def #NAME#64 : BaseSIMDThreeScalar; - def #NAME#32 : BaseSIMDThreeScalar; + let Predicates = [HasNEON, HasFullFP16] in { + def #NAME#16 : BaseSIMDThreeScalar; + } // Predicates = [HasNEON, HasFullFP16] } def : Pat<(v1i64 (OpNode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), @@ -5718,7 +5800,7 @@ multiclass SIMDThreeScalarMixedTiedHS opc, string asm, //---------------------------------------------------------------------------- let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDTwoScalar size, bits<5> opcode, +class BaseSIMDTwoScalar size, bits<2> size2, bits<5> opcode, RegisterClass regtype, RegisterClass regtype2, string asm, list pat> : I<(outs regtype:$Rd), (ins regtype2:$Rn), asm, @@ -5730,7 +5812,9 @@ class BaseSIMDTwoScalar size, bits<5> opcode, let Inst{29} = U; let Inst{28-24} = 0b11110; let Inst{23-22} = size; - let Inst{21-17} = 0b10000; + let Inst{21} = 0b1; + let Inst{20-19} = size2; + let Inst{18-17} = 0b00; let Inst{16-12} = opcode; let Inst{11-10} = 0b10; let Inst{9-5} = Rn; @@ -5759,7 +5843,7 @@ class BaseSIMDTwoScalarTied size, bits<5> opcode, let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDCmpTwoScalar size, bits<5> opcode, +class BaseSIMDCmpTwoScalar size, bits<2> size2, bits<5> opcode, RegisterClass regtype, string asm, string zero> : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, "\t$Rd, $Rn, #" # zero, "", []>, @@ -5770,7 +5854,9 @@ class BaseSIMDCmpTwoScalar size, bits<5> opcode, let Inst{29} = U; let Inst{28-24} = 0b11110; let Inst{23-22} = size; - let Inst{21-17} = 0b10000; + let Inst{21} = 0b1; + let Inst{20-19} = size2; + let Inst{18-17} = 0b00; let Inst{16-12} = opcode; let Inst{11-10} = 0b10; let Inst{9-5} = Rn; @@ -5792,7 +5878,7 @@ class SIMDInexactCvtTwoScalar opcode, string asm> multiclass SIMDCmpTwoScalarD opc, string asm, 
SDPatternOperator OpNode> { - def v1i64rz : BaseSIMDCmpTwoScalar; + def v1i64rz : BaseSIMDCmpTwoScalar; def : Pat<(v1i64 (OpNode FPR64:$Rn)), (!cast(NAME # v1i64rz) FPR64:$Rn)>; @@ -5800,13 +5886,20 @@ multiclass SIMDCmpTwoScalarD opc, string asm, multiclass SIMDFPCmpTwoScalar opc, string asm, SDPatternOperator OpNode> { - def v1i64rz : BaseSIMDCmpTwoScalar; - def v1i32rz : BaseSIMDCmpTwoScalar; + def v1i64rz : BaseSIMDCmpTwoScalar; + def v1i32rz : BaseSIMDCmpTwoScalar; + let Predicates = [HasNEON, HasFullFP16] in { + def v1i16rz : BaseSIMDCmpTwoScalar; + } def : InstAlias(NAME # v1i64rz) FPR64:$Rd, FPR64:$Rn), 0>; def : InstAlias(NAME # v1i32rz) FPR32:$Rd, FPR32:$Rn), 0>; + let Predicates = [HasNEON, HasFullFP16] in { + def : InstAlias(NAME # v1i16rz) FPR16:$Rd, FPR16:$Rn), 0>; + } def : Pat<(v1i64 (OpNode (v1f64 FPR64:$Rn))), (!cast(NAME # v1i64rz) FPR64:$Rn)>; @@ -5814,7 +5907,7 @@ multiclass SIMDFPCmpTwoScalar opc, string asm, multiclass SIMDTwoScalarD opc, string asm, SDPatternOperator OpNode = null_frag> { - def v1i64 : BaseSIMDTwoScalar; def : Pat<(i64 (OpNode (i64 FPR64:$Rn))), @@ -5822,27 +5915,34 @@ multiclass SIMDTwoScalarD opc, string asm, } multiclass SIMDFPTwoScalar opc, string asm> { - def v1i64 : BaseSIMDTwoScalar; - def v1i32 : BaseSIMDTwoScalar; + def v1i64 : BaseSIMDTwoScalar; + def v1i32 : BaseSIMDTwoScalar; + let Predicates = [HasNEON, HasFullFP16] in { + def v1f16 : BaseSIMDTwoScalar; + } } -multiclass SIMDTwoScalarCVTSD opc, string asm, +multiclass SIMDFPTwoScalarCVT opc, string asm, SDPatternOperator OpNode> { - def v1i64 : BaseSIMDTwoScalar; - def v1i32 : BaseSIMDTwoScalar; + let Predicates = [HasNEON, HasFullFP16] in { + def v1i16 : BaseSIMDTwoScalar; + } } multiclass SIMDTwoScalarBHSD opc, string asm, SDPatternOperator OpNode = null_frag> { let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { - def v1i64 : BaseSIMDTwoScalar; - def v1i32 : BaseSIMDTwoScalar; - def v1i16 : BaseSIMDTwoScalar; - def v1i8 : BaseSIMDTwoScalar; + def v1i16 : BaseSIMDTwoScalar; + def v1i8 : BaseSIMDTwoScalar; } def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rn))), @@ -5869,10 +5969,10 @@ multiclass SIMDTwoScalarBHSDTied opc, string asm, let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in multiclass SIMDTwoScalarMixedBHS opc, string asm, SDPatternOperator OpNode = null_frag> { - def v1i32 : BaseSIMDTwoScalar; - def v1i16 : BaseSIMDTwoScalar; - def v1i8 : BaseSIMDTwoScalar; + def v1i16 : BaseSIMDTwoScalar; + def v1i8 : BaseSIMDTwoScalar; } //---------------------------------------------------------------------------- @@ -5904,10 +6004,14 @@ multiclass SIMDPairwiseScalarD opc, string asm> { asm, ".2d">; } -multiclass SIMDFPPairwiseScalar opc, string asm> { - def v2i32p : BaseSIMDPairwiseScalar opc, string asm> { + let Predicates = [HasNEON, HasFullFP16] in { + def v2i16p : BaseSIMDPairwiseScalar<0, {S,0}, opc, FPR16Op, V64, + asm, ".2h">; + } + def v2i32p : BaseSIMDPairwiseScalar<1, {S,0}, opc, FPR32Op, V64, asm, ".2s">; - def v2i64p : BaseSIMDPairwiseScalar; } @@ -5963,8 +6067,16 @@ multiclass SIMDAcrossLanesHSD opcode, string asm> { asm, ".4s", []>; } -multiclass SIMDAcrossLanesS opcode, bit sz1, string asm, +multiclass SIMDFPAcrossLanes opcode, bit sz1, string asm, Intrinsic intOp> { + let Predicates = [HasNEON, HasFullFP16] in { + def v4i16v : BaseSIMDAcrossLanes<0, 0, {sz1, 0}, opcode, FPR16, V64, + asm, ".4h", + [(set FPR16:$Rd, (intOp (v4f16 V64:$Rn)))]>; + def v8i16v : BaseSIMDAcrossLanes<1, 0, {sz1, 0}, opcode, FPR16, V128, + asm, ".8h", + [(set FPR16:$Rd, (intOp (v8f16 V128:$Rn)))]>; + 
} // Predicates = [HasNEON, HasFullFP16] def v4i32v : BaseSIMDAcrossLanes<1, 1, {sz1, 0}, opcode, FPR32, V128, asm, ".4s", [(set FPR32:$Rd, (intOp (v4f32 V128:$Rn)))]>; @@ -6451,7 +6563,7 @@ multiclass SIMDScalarCPY { // AdvSIMD modified immediate instructions //---------------------------------------------------------------------------- -class BaseSIMDModifiedImm pattern> : I, @@ -6463,16 +6575,17 @@ class BaseSIMDModifiedImm pattern> - : BaseSIMDModifiedImm pattern> - : BaseSIMDModifiedImm b15_b12, RegisterOperand vectype, string asm, string kind, list pattern> - : BaseSIMDModifiedImmVector { bits<2> shift; @@ -6520,7 +6633,7 @@ class BaseSIMDModifiedImmVectorShiftTied b15_b12, class BaseSIMDModifiedImmVectorShiftHalf b15_b12, RegisterOperand vectype, string asm, string kind, list pattern> - : BaseSIMDModifiedImmVector { bits<2> shift; @@ -6585,7 +6698,7 @@ multiclass SIMDModifiedImmVectorShiftTied hw_cmode, class SIMDModifiedImmMoveMSL cmode, RegisterOperand vectype, string asm, string kind, list pattern> - : BaseSIMDModifiedImmVector { bits<1> shift; @@ -6593,18 +6706,18 @@ class SIMDModifiedImmMoveMSL cmode, let Inst{12} = shift; } -class SIMDModifiedImmVectorNoShift cmode, +class SIMDModifiedImmVectorNoShift cmode, RegisterOperand vectype, Operand imm_type, string asm, string kind, list pattern> - : BaseSIMDModifiedImmVector { let Inst{15-12} = cmode; } class SIMDModifiedImmScalarNoShift cmode, string asm, list pattern> - : BaseSIMDModifiedImm { let Inst{15-12} = cmode; let DecoderMethod = "DecodeModImmInstruction"; @@ -6676,6 +6789,34 @@ class BaseSIMDIndexedTied size, bits<4> opc, multiclass SIMDFPIndexed opc, string asm, SDPatternOperator OpNode> { + let Predicates = [HasNEON, HasFullFP16] in { + def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b00, opc, + V64, V64, + V128_lo, VectorIndexH, + asm, ".4h", ".4h", ".4h", ".h", + [(set (v4f16 V64:$Rd), + (OpNode (v4f16 V64:$Rn), + (v4f16 (AArch64duplane16 (v8f16 V128_lo:$Rm), VectorIndexH:$idx))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b00, opc, + V128, V128, + V128_lo, VectorIndexH, + asm, ".8h", ".8h", ".8h", ".h", + [(set (v8f16 V128:$Rd), + (OpNode (v8f16 V128:$Rn), + (v8f16 (AArch64duplane16 (v8f16 V128_lo:$Rm), VectorIndexH:$idx))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + } // Predicates = [HasNEON, HasFullFP16] + def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc, V64, V64, V128, VectorIndexS, @@ -6712,6 +6853,21 @@ multiclass SIMDFPIndexed opc, string asm, let Inst{21} = 0; } + let Predicates = [HasNEON, HasFullFP16] in { + def v1i16_indexed : BaseSIMDIndexed<1, U, 1, 0b00, opc, + FPR16Op, FPR16Op, V128_lo, VectorIndexH, + asm, ".h", "", "", ".h", + [(set (f16 FPR16Op:$Rd), + (OpNode (f16 FPR16Op:$Rn), + (f16 (vector_extract (v8f16 V128_lo:$Rm), + VectorIndexH:$idx))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + } // Predicates = [HasNEON, HasFullFP16] + def v1i32_indexed : BaseSIMDIndexed<1, U, 1, 0b10, opc, FPR32Op, FPR32Op, V128, VectorIndexS, asm, ".s", "", "", ".s", @@ -6790,6 +6946,27 @@ multiclass SIMDFPIndexedTiedPatterns { } multiclass SIMDFPIndexedTied opc, string asm> { + let Predicates = [HasNEON, HasFullFP16] in { + def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b00, opc, V64, V64, + V128_lo, VectorIndexH, + asm, ".4h", ".4h", ".4h", ".h", []> { + bits<3> idx; + let Inst{11} = idx{2}; + 
let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b00, opc, + V128, V128, + V128_lo, VectorIndexH, + asm, ".8h", ".8h", ".8h", ".h", []> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + } // Predicates = [HasNEON, HasFullFP16] + def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc, V64, V64, V128, VectorIndexS, asm, ".2s", ".2s", ".2s", ".s", []> { @@ -6816,6 +6993,16 @@ multiclass SIMDFPIndexedTied opc, string asm> { let Inst{21} = 0; } + let Predicates = [HasNEON, HasFullFP16] in { + def v1i16_indexed : BaseSIMDIndexedTied<1, U, 1, 0b00, opc, + FPR16Op, FPR16Op, V128_lo, VectorIndexH, + asm, ".h", "", "", ".h", []> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + } // Predicates = [HasNEON, HasFullFP16] def v1i32_indexed : BaseSIMDIndexedTied<1, U, 1, 0b10, opc, FPR32Op, FPR32Op, V128, VectorIndexS, @@ -7353,7 +7540,13 @@ class BaseSIMDScalarShiftTied opc, bits<7> fixed_imm, } -multiclass SIMDScalarRShiftSD opc, string asm> { +multiclass SIMDFPScalarRShift opc, string asm> { + let Predicates = [HasNEON, HasFullFP16] in { + def h : BaseSIMDScalarShift { + let Inst{19-16} = imm{3-0}; + } + } // Predicates = [HasNEON, HasFullFP16] def s : BaseSIMDScalarShift { let Inst{20-16} = imm{4-0}; @@ -7533,6 +7726,23 @@ class BaseSIMDVectorShiftTied opc, bits<7> fixed_imm, multiclass SIMDVectorRShiftSD opc, string asm, Intrinsic OpNode> { + let Predicates = [HasNEON, HasFullFP16] in { + def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?}, + V64, V64, vecshiftR16, + asm, ".4h", ".4h", + [(set (v4i16 V64:$Rd), (OpNode (v4f16 V64:$Rn), (i32 imm:$imm)))]> { + bits<4> imm; + let Inst{19-16} = imm; + } + + def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?}, + V128, V128, vecshiftR16, + asm, ".8h", ".8h", + [(set (v8i16 V128:$Rd), (OpNode (v8f16 V128:$Rn), (i32 imm:$imm)))]> { + bits<4> imm; + let Inst{19-16} = imm; + } + } // Predicates = [HasNEON, HasFullFP16] def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?}, V64, V64, vecshiftR32, asm, ".2s", ".2s", @@ -7558,8 +7768,26 @@ multiclass SIMDVectorRShiftSD opc, string asm, } } -multiclass SIMDVectorRShiftSDToFP opc, string asm, +multiclass SIMDVectorRShiftToFP opc, string asm, Intrinsic OpNode> { + let Predicates = [HasNEON, HasFullFP16] in { + def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?}, + V64, V64, vecshiftR16, + asm, ".4h", ".4h", + [(set (v4f16 V64:$Rd), (OpNode (v4i16 V64:$Rn), (i32 imm:$imm)))]> { + bits<4> imm; + let Inst{19-16} = imm; + } + + def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?}, + V128, V128, vecshiftR16, + asm, ".8h", ".8h", + [(set (v8f16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (i32 imm:$imm)))]> { + bits<4> imm; + let Inst{19-16} = imm; + } + } // Predicates = [HasNEON, HasFullFP16] + def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?}, V64, V64, vecshiftR32, asm, ".2s", ".2s", @@ -8840,9 +9068,8 @@ let Predicates = [HasNEON, HasV8_1a] in { class BaseSIMDThreeSameVectorTiedR0 size, bits<5> opcode, RegisterOperand regtype, string asm, string kind, list pattern> - : BaseSIMDThreeSameVectorTied { - let Inst{21}=0; } multiclass SIMDThreeSameVectorSQRDMLxHTiedHS opc, string asm, SDPatternOperator Accum> { @@ -9277,6 +9504,7 @@ def : TokenAlias<".8H", ".8h">; def : TokenAlias<".4S", ".4s">; def : TokenAlias<".2D", ".2d">; def : TokenAlias<".1Q", ".1q">; +def : TokenAlias<".2H", ".2h">; def : 
TokenAlias<".B", ".b">; def : TokenAlias<".H", ".h">; def : TokenAlias<".S", ".s">; diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td index 881f55ebeef9..ed6e171caa98 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.td +++ b/lib/Target/AArch64/AArch64InstrInfo.td @@ -2857,29 +2857,29 @@ defm CMGT : SIMDThreeSameVector<0, 0b00110, "cmgt", AArch64cmgt>; defm CMHI : SIMDThreeSameVector<1, 0b00110, "cmhi", AArch64cmhi>; defm CMHS : SIMDThreeSameVector<1, 0b00111, "cmhs", AArch64cmhs>; defm CMTST : SIMDThreeSameVector<0, 0b10001, "cmtst", AArch64cmtst>; -defm FABD : SIMDThreeSameVectorFP<1,1,0b11010,"fabd", int_aarch64_neon_fabd>; -defm FACGE : SIMDThreeSameVectorFPCmp<1,0,0b11101,"facge",int_aarch64_neon_facge>; -defm FACGT : SIMDThreeSameVectorFPCmp<1,1,0b11101,"facgt",int_aarch64_neon_facgt>; -defm FADDP : SIMDThreeSameVectorFP<1,0,0b11010,"faddp",int_aarch64_neon_addp>; -defm FADD : SIMDThreeSameVectorFP<0,0,0b11010,"fadd", fadd>; -defm FCMEQ : SIMDThreeSameVectorFPCmp<0, 0, 0b11100, "fcmeq", AArch64fcmeq>; -defm FCMGE : SIMDThreeSameVectorFPCmp<1, 0, 0b11100, "fcmge", AArch64fcmge>; -defm FCMGT : SIMDThreeSameVectorFPCmp<1, 1, 0b11100, "fcmgt", AArch64fcmgt>; -defm FDIV : SIMDThreeSameVectorFP<1,0,0b11111,"fdiv", fdiv>; -defm FMAXNMP : SIMDThreeSameVectorFP<1,0,0b11000,"fmaxnmp", int_aarch64_neon_fmaxnmp>; -defm FMAXNM : SIMDThreeSameVectorFP<0,0,0b11000,"fmaxnm", fmaxnum>; -defm FMAXP : SIMDThreeSameVectorFP<1,0,0b11110,"fmaxp", int_aarch64_neon_fmaxp>; -defm FMAX : SIMDThreeSameVectorFP<0,0,0b11110,"fmax", fmaxnan>; -defm FMINNMP : SIMDThreeSameVectorFP<1,1,0b11000,"fminnmp", int_aarch64_neon_fminnmp>; -defm FMINNM : SIMDThreeSameVectorFP<0,1,0b11000,"fminnm", fminnum>; -defm FMINP : SIMDThreeSameVectorFP<1,1,0b11110,"fminp", int_aarch64_neon_fminp>; -defm FMIN : SIMDThreeSameVectorFP<0,1,0b11110,"fmin", fminnan>; +defm FABD : SIMDThreeSameVectorFP<1,1,0b010,"fabd", int_aarch64_neon_fabd>; +defm FACGE : SIMDThreeSameVectorFPCmp<1,0,0b101,"facge",int_aarch64_neon_facge>; +defm FACGT : SIMDThreeSameVectorFPCmp<1,1,0b101,"facgt",int_aarch64_neon_facgt>; +defm FADDP : SIMDThreeSameVectorFP<1,0,0b010,"faddp",int_aarch64_neon_addp>; +defm FADD : SIMDThreeSameVectorFP<0,0,0b010,"fadd", fadd>; +defm FCMEQ : SIMDThreeSameVectorFPCmp<0, 0, 0b100, "fcmeq", AArch64fcmeq>; +defm FCMGE : SIMDThreeSameVectorFPCmp<1, 0, 0b100, "fcmge", AArch64fcmge>; +defm FCMGT : SIMDThreeSameVectorFPCmp<1, 1, 0b100, "fcmgt", AArch64fcmgt>; +defm FDIV : SIMDThreeSameVectorFP<1,0,0b111,"fdiv", fdiv>; +defm FMAXNMP : SIMDThreeSameVectorFP<1,0,0b000,"fmaxnmp", int_aarch64_neon_fmaxnmp>; +defm FMAXNM : SIMDThreeSameVectorFP<0,0,0b000,"fmaxnm", fmaxnum>; +defm FMAXP : SIMDThreeSameVectorFP<1,0,0b110,"fmaxp", int_aarch64_neon_fmaxp>; +defm FMAX : SIMDThreeSameVectorFP<0,0,0b110,"fmax", fmaxnan>; +defm FMINNMP : SIMDThreeSameVectorFP<1,1,0b000,"fminnmp", int_aarch64_neon_fminnmp>; +defm FMINNM : SIMDThreeSameVectorFP<0,1,0b000,"fminnm", fminnum>; +defm FMINP : SIMDThreeSameVectorFP<1,1,0b110,"fminp", int_aarch64_neon_fminp>; +defm FMIN : SIMDThreeSameVectorFP<0,1,0b110,"fmin", fminnan>; // NOTE: The operands of the PatFrag are reordered on FMLA/FMLS because the // instruction expects the addend first, while the fma intrinsic puts it last. 
-defm FMLA : SIMDThreeSameVectorFPTied<0, 0, 0b11001, "fmla", +defm FMLA : SIMDThreeSameVectorFPTied<0, 0, 0b001, "fmla", TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)> >; -defm FMLS : SIMDThreeSameVectorFPTied<0, 1, 0b11001, "fmls", +defm FMLS : SIMDThreeSameVectorFPTied<0, 1, 0b001, "fmls", TriOpFrag<(fma node:$MHS, (fneg node:$RHS), node:$LHS)> >; // The following def pats catch the case where the LHS of an FMA is negated. @@ -2893,11 +2893,11 @@ def : Pat<(v4f32 (fma (fneg V128:$Rn), V128:$Rm, V128:$Rd)), def : Pat<(v2f64 (fma (fneg V128:$Rn), V128:$Rm, V128:$Rd)), (FMLSv2f64 V128:$Rd, V128:$Rn, V128:$Rm)>; -defm FMULX : SIMDThreeSameVectorFP<0,0,0b11011,"fmulx", int_aarch64_neon_fmulx>; -defm FMUL : SIMDThreeSameVectorFP<1,0,0b11011,"fmul", fmul>; -defm FRECPS : SIMDThreeSameVectorFP<0,0,0b11111,"frecps", int_aarch64_neon_frecps>; -defm FRSQRTS : SIMDThreeSameVectorFP<0,1,0b11111,"frsqrts", int_aarch64_neon_frsqrts>; -defm FSUB : SIMDThreeSameVectorFP<0,1,0b11010,"fsub", fsub>; +defm FMULX : SIMDThreeSameVectorFP<0,0,0b011,"fmulx", int_aarch64_neon_fmulx>; +defm FMUL : SIMDThreeSameVectorFP<1,0,0b011,"fmul", fmul>; +defm FRECPS : SIMDThreeSameVectorFP<0,0,0b111,"frecps", int_aarch64_neon_frecps>; +defm FRSQRTS : SIMDThreeSameVectorFP<0,1,0b111,"frsqrts", int_aarch64_neon_frsqrts>; +defm FSUB : SIMDThreeSameVectorFP<0,1,0b010,"fsub", fsub>; defm MLA : SIMDThreeSameVectorBHSTied<0, 0b10010, "mla", TriOpFrag<(add node:$LHS, (mul node:$MHS, node:$RHS))> >; defm MLS : SIMDThreeSameVectorBHSTied<1, 0b10010, "mls", @@ -3081,6 +3081,14 @@ def : InstAlias<"{cmlt\t$dst.2d, $src1.2d, $src2.2d" # "|cmlt.2d\t$dst, $src1, $src2}", (CMGTv2i64 V128:$dst, V128:$src2, V128:$src1), 0>; +let Predicates = [HasNEON, HasFullFP16] in { +def : InstAlias<"{fcmle\t$dst.4h, $src1.4h, $src2.4h" # + "|fcmle.4h\t$dst, $src1, $src2}", + (FCMGEv4f16 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{fcmle\t$dst.8h, $src1.8h, $src2.8h" # + "|fcmle.8h\t$dst, $src1, $src2}", + (FCMGEv8f16 V128:$dst, V128:$src2, V128:$src1), 0>; +} def : InstAlias<"{fcmle\t$dst.2s, $src1.2s, $src2.2s" # "|fcmle.2s\t$dst, $src1, $src2}", (FCMGEv2f32 V64:$dst, V64:$src2, V64:$src1), 0>; @@ -3091,6 +3099,14 @@ def : InstAlias<"{fcmle\t$dst.2d, $src1.2d, $src2.2d" # "|fcmle.2d\t$dst, $src1, $src2}", (FCMGEv2f64 V128:$dst, V128:$src2, V128:$src1), 0>; +let Predicates = [HasNEON, HasFullFP16] in { +def : InstAlias<"{fcmlt\t$dst.4h, $src1.4h, $src2.4h" # + "|fcmlt.4h\t$dst, $src1, $src2}", + (FCMGTv4f16 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{fcmlt\t$dst.8h, $src1.8h, $src2.8h" # + "|fcmlt.8h\t$dst, $src1, $src2}", + (FCMGTv8f16 V128:$dst, V128:$src2, V128:$src1), 0>; +} def : InstAlias<"{fcmlt\t$dst.2s, $src1.2s, $src2.2s" # "|fcmlt.2s\t$dst, $src1, $src2}", (FCMGTv2f32 V64:$dst, V64:$src2, V64:$src1), 0>; @@ -3101,6 +3117,14 @@ def : InstAlias<"{fcmlt\t$dst.2d, $src1.2d, $src2.2d" # "|fcmlt.2d\t$dst, $src1, $src2}", (FCMGTv2f64 V128:$dst, V128:$src2, V128:$src1), 0>; +let Predicates = [HasNEON, HasFullFP16] in { +def : InstAlias<"{facle\t$dst.4h, $src1.4h, $src2.4h" # + "|facle.4h\t$dst, $src1, $src2}", + (FACGEv4f16 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{facle\t$dst.8h, $src1.8h, $src2.8h" # + "|facle.8h\t$dst, $src1, $src2}", + (FACGEv8f16 V128:$dst, V128:$src2, V128:$src1), 0>; +} def : InstAlias<"{facle\t$dst.2s, $src1.2s, $src2.2s" # "|facle.2s\t$dst, $src1, $src2}", (FACGEv2f32 V64:$dst, V64:$src2, V64:$src1), 0>; @@ -3111,6 +3135,14 @@ def : InstAlias<"{facle\t$dst.2d, $src1.2d, $src2.2d" # 
"|facle.2d\t$dst, $src1, $src2}", (FACGEv2f64 V128:$dst, V128:$src2, V128:$src1), 0>; +let Predicates = [HasNEON, HasFullFP16] in { +def : InstAlias<"{faclt\t$dst.4h, $src1.4h, $src2.4h" # + "|faclt.4h\t$dst, $src1, $src2}", + (FACGTv4f16 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{faclt\t$dst.8h, $src1.8h, $src2.8h" # + "|faclt.8h\t$dst, $src1, $src2}", + (FACGTv8f16 V128:$dst, V128:$src2, V128:$src1), 0>; +} def : InstAlias<"{faclt\t$dst.2s, $src1.2s, $src2.2s" # "|faclt.2s\t$dst, $src1, $src2}", (FACGTv2f32 V64:$dst, V64:$src2, V64:$src1), 0>; @@ -3132,19 +3164,19 @@ defm CMGT : SIMDThreeScalarD<0, 0b00110, "cmgt", AArch64cmgt>; defm CMHI : SIMDThreeScalarD<1, 0b00110, "cmhi", AArch64cmhi>; defm CMHS : SIMDThreeScalarD<1, 0b00111, "cmhs", AArch64cmhs>; defm CMTST : SIMDThreeScalarD<0, 0b10001, "cmtst", AArch64cmtst>; -defm FABD : SIMDThreeScalarSD<1, 1, 0b11010, "fabd", int_aarch64_sisd_fabd>; +defm FABD : SIMDFPThreeScalar<1, 1, 0b010, "fabd", int_aarch64_sisd_fabd>; def : Pat<(v1f64 (int_aarch64_neon_fabd (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), (FABD64 FPR64:$Rn, FPR64:$Rm)>; -defm FACGE : SIMDThreeScalarFPCmp<1, 0, 0b11101, "facge", +defm FACGE : SIMDThreeScalarFPCmp<1, 0, 0b101, "facge", int_aarch64_neon_facge>; -defm FACGT : SIMDThreeScalarFPCmp<1, 1, 0b11101, "facgt", +defm FACGT : SIMDThreeScalarFPCmp<1, 1, 0b101, "facgt", int_aarch64_neon_facgt>; -defm FCMEQ : SIMDThreeScalarFPCmp<0, 0, 0b11100, "fcmeq", AArch64fcmeq>; -defm FCMGE : SIMDThreeScalarFPCmp<1, 0, 0b11100, "fcmge", AArch64fcmge>; -defm FCMGT : SIMDThreeScalarFPCmp<1, 1, 0b11100, "fcmgt", AArch64fcmgt>; -defm FMULX : SIMDThreeScalarSD<0, 0, 0b11011, "fmulx", int_aarch64_neon_fmulx>; -defm FRECPS : SIMDThreeScalarSD<0, 0, 0b11111, "frecps", int_aarch64_neon_frecps>; -defm FRSQRTS : SIMDThreeScalarSD<0, 1, 0b11111, "frsqrts", int_aarch64_neon_frsqrts>; +defm FCMEQ : SIMDThreeScalarFPCmp<0, 0, 0b100, "fcmeq", AArch64fcmeq>; +defm FCMGE : SIMDThreeScalarFPCmp<1, 0, 0b100, "fcmge", AArch64fcmge>; +defm FCMGT : SIMDThreeScalarFPCmp<1, 1, 0b100, "fcmgt", AArch64fcmgt>; +defm FMULX : SIMDFPThreeScalar<0, 0, 0b011, "fmulx", int_aarch64_neon_fmulx>; +defm FRECPS : SIMDFPThreeScalar<0, 0, 0b111, "frecps", int_aarch64_neon_frecps>; +defm FRSQRTS : SIMDFPThreeScalar<0, 1, 0b111, "frsqrts", int_aarch64_neon_frsqrts>; defm SQADD : SIMDThreeScalarBHSD<0, 0b00001, "sqadd", int_aarch64_neon_sqadd>; defm SQDMULH : SIMDThreeScalarHS< 0, 0b10110, "sqdmulh", int_aarch64_neon_sqdmulh>; defm SQRDMULH : SIMDThreeScalarHS< 1, 0b10110, "sqrdmulh", int_aarch64_neon_sqrdmulh>; @@ -3248,14 +3280,14 @@ defm FRECPX : SIMDFPTwoScalar< 0, 1, 0b11111, "frecpx">; defm FRSQRTE : SIMDFPTwoScalar< 1, 1, 0b11101, "frsqrte">; defm NEG : SIMDTwoScalarD< 1, 0b01011, "neg", UnOpFrag<(sub immAllZerosV, node:$LHS)> >; -defm SCVTF : SIMDTwoScalarCVTSD< 0, 0, 0b11101, "scvtf", AArch64sitof>; +defm SCVTF : SIMDFPTwoScalarCVT< 0, 0, 0b11101, "scvtf", AArch64sitof>; defm SQABS : SIMDTwoScalarBHSD< 0, 0b00111, "sqabs", int_aarch64_neon_sqabs>; defm SQNEG : SIMDTwoScalarBHSD< 1, 0b00111, "sqneg", int_aarch64_neon_sqneg>; defm SQXTN : SIMDTwoScalarMixedBHS< 0, 0b10100, "sqxtn", int_aarch64_neon_scalar_sqxtn>; defm SQXTUN : SIMDTwoScalarMixedBHS< 1, 0b10010, "sqxtun", int_aarch64_neon_scalar_sqxtun>; defm SUQADD : SIMDTwoScalarBHSDTied< 0, 0b00011, "suqadd", int_aarch64_neon_suqadd>; -defm UCVTF : SIMDTwoScalarCVTSD< 1, 0, 0b11101, "ucvtf", AArch64uitof>; +defm UCVTF : SIMDFPTwoScalarCVT< 1, 0, 0b11101, "ucvtf", AArch64uitof>; defm UQXTN : 
SIMDTwoScalarMixedBHS<1, 0b10100, "uqxtn", int_aarch64_neon_scalar_uqxtn>; defm USQADD : SIMDTwoScalarBHSDTied< 1, 0b00011, "usqadd", int_aarch64_neon_usqadd>; @@ -3620,11 +3652,11 @@ defm CPY : SIMDScalarCPY<"cpy">; //---------------------------------------------------------------------------- defm ADDP : SIMDPairwiseScalarD<0, 0b11011, "addp">; -defm FADDP : SIMDFPPairwiseScalar<1, 0, 0b01101, "faddp">; -defm FMAXNMP : SIMDFPPairwiseScalar<1, 0, 0b01100, "fmaxnmp">; -defm FMAXP : SIMDFPPairwiseScalar<1, 0, 0b01111, "fmaxp">; -defm FMINNMP : SIMDFPPairwiseScalar<1, 1, 0b01100, "fminnmp">; -defm FMINP : SIMDFPPairwiseScalar<1, 1, 0b01111, "fminp">; +defm FADDP : SIMDFPPairwiseScalar<0, 0b01101, "faddp">; +defm FMAXNMP : SIMDFPPairwiseScalar<0, 0b01100, "fmaxnmp">; +defm FMAXP : SIMDFPPairwiseScalar<0, 0b01111, "fmaxp">; +defm FMINNMP : SIMDFPPairwiseScalar<1, 0b01100, "fminnmp">; +defm FMINP : SIMDFPPairwiseScalar<1, 0b01111, "fminp">; def : Pat<(v2i64 (AArch64saddv V128:$Rn)), (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), (ADDPv2i64p V128:$Rn), dsub)>; def : Pat<(v2i64 (AArch64uaddv V128:$Rn)), @@ -3811,6 +3843,11 @@ def : Pat<(v2i64 (scalar_to_vector (i64 FPR64:$Rn))), (v2i64 (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), (i64 FPR64:$Rn), dsub))>; +def : Pat<(v4f16 (scalar_to_vector (f16 FPR16:$Rn))), + (INSERT_SUBREG (v4f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>; +def : Pat<(v8f16 (scalar_to_vector (f16 FPR16:$Rn))), + (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>; + def : Pat<(v4f32 (scalar_to_vector (f32 FPR32:$Rn))), (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>; def : Pat<(v2f32 (scalar_to_vector (f32 FPR32:$Rn))), @@ -3976,10 +4013,10 @@ defm UMAXV : SIMDAcrossLanesBHS<1, 0b01010, "umaxv">; defm UMINV : SIMDAcrossLanesBHS<1, 0b11010, "uminv">; defm SADDLV : SIMDAcrossLanesHSD<0, 0b00011, "saddlv">; defm UADDLV : SIMDAcrossLanesHSD<1, 0b00011, "uaddlv">; -defm FMAXNMV : SIMDAcrossLanesS<0b01100, 0, "fmaxnmv", int_aarch64_neon_fmaxnmv>; -defm FMAXV : SIMDAcrossLanesS<0b01111, 0, "fmaxv", int_aarch64_neon_fmaxv>; -defm FMINNMV : SIMDAcrossLanesS<0b01100, 1, "fminnmv", int_aarch64_neon_fminnmv>; -defm FMINV : SIMDAcrossLanesS<0b01111, 1, "fminv", int_aarch64_neon_fminv>; +defm FMAXNMV : SIMDFPAcrossLanes<0b01100, 0, "fmaxnmv", int_aarch64_neon_fmaxnmv>; +defm FMAXV : SIMDFPAcrossLanes<0b01111, 0, "fmaxv", int_aarch64_neon_fmaxv>; +defm FMINNMV : SIMDFPAcrossLanes<0b01100, 1, "fminnmv", int_aarch64_neon_fminnmv>; +defm FMINV : SIMDFPAcrossLanes<0b01111, 1, "fminv", int_aarch64_neon_fminv>; // Patterns for across-vector intrinsics, that have a node equivalent, that // returns a vector (with only the low lane defined) instead of a scalar. 
@@ -4226,15 +4263,23 @@ def : InstAlias<"orr.2s $Vd, $imm", (ORRv2i32 V64:$Vd, imm0_255:$imm, 0), 0>; def : InstAlias<"orr.4s $Vd, $imm", (ORRv4i32 V128:$Vd, imm0_255:$imm, 0), 0>; // AdvSIMD FMOV -def FMOVv2f64_ns : SIMDModifiedImmVectorNoShift<1, 1, 0b1111, V128, fpimm8, +def FMOVv2f64_ns : SIMDModifiedImmVectorNoShift<1, 1, 0, 0b1111, V128, fpimm8, "fmov", ".2d", [(set (v2f64 V128:$Rd), (AArch64fmov imm0_255:$imm8))]>; -def FMOVv2f32_ns : SIMDModifiedImmVectorNoShift<0, 0, 0b1111, V64, fpimm8, +def FMOVv2f32_ns : SIMDModifiedImmVectorNoShift<0, 0, 0, 0b1111, V64, fpimm8, "fmov", ".2s", [(set (v2f32 V64:$Rd), (AArch64fmov imm0_255:$imm8))]>; -def FMOVv4f32_ns : SIMDModifiedImmVectorNoShift<1, 0, 0b1111, V128, fpimm8, +def FMOVv4f32_ns : SIMDModifiedImmVectorNoShift<1, 0, 0, 0b1111, V128, fpimm8, "fmov", ".4s", [(set (v4f32 V128:$Rd), (AArch64fmov imm0_255:$imm8))]>; +let Predicates = [HasNEON, HasFullFP16] in { +def FMOVv4f16_ns : SIMDModifiedImmVectorNoShift<0, 0, 1, 0b1111, V64, fpimm8, + "fmov", ".4h", + [(set (v4f16 V64:$Rd), (AArch64fmov imm0_255:$imm8))]>; +def FMOVv8f16_ns : SIMDModifiedImmVectorNoShift<1, 0, 1, 0b1111, V128, fpimm8, + "fmov", ".8h", + [(set (v8f16 V128:$Rd), (AArch64fmov imm0_255:$imm8))]>; +} // Predicates = [HasNEON, HasFullFP16] // AdvSIMD MOVI @@ -4262,7 +4307,7 @@ def : Pat<(v8i8 immAllOnesV), (MOVID (i32 255))>; // The movi_edit node has the immediate value already encoded, so we use // a plain imm0_255 in the pattern let isReMaterializable = 1, isAsCheapAsAMove = 1 in -def MOVIv2d_ns : SIMDModifiedImmVectorNoShift<1, 1, 0b1110, V128, +def MOVIv2d_ns : SIMDModifiedImmVectorNoShift<1, 1, 0, 0b1110, V128, simdimmtype10, "movi", ".2d", [(set (v2i64 V128:$Rd), (AArch64movi_edit imm0_255:$imm8))]>; @@ -4323,10 +4368,10 @@ def MOVIv4s_msl : SIMDModifiedImmMoveMSL<1, 0, {1,1,0,?}, V128, "movi", ".4s", (AArch64movi_msl imm0_255:$imm8, (i32 imm:$shift)))]>; // Per byte: 8b & 16b -def MOVIv8b_ns : SIMDModifiedImmVectorNoShift<0, 0, 0b1110, V64, imm0_255, +def MOVIv8b_ns : SIMDModifiedImmVectorNoShift<0, 0, 0, 0b1110, V64, imm0_255, "movi", ".8b", [(set (v8i8 V64:$Rd), (AArch64movi imm0_255:$imm8))]>; -def MOVIv16b_ns : SIMDModifiedImmVectorNoShift<1, 0, 0b1110, V128, imm0_255, +def MOVIv16b_ns : SIMDModifiedImmVectorNoShift<1, 0, 0, 0b1110, V128, imm0_255, "movi", ".16b", [(set (v16i8 V128:$Rd), (AArch64movi imm0_255:$imm8))]>; @@ -4526,10 +4571,10 @@ def : Pat<(int_aarch64_neon_sqdmulls_scalar (i32 FPR32:$Rn), //---------------------------------------------------------------------------- // AdvSIMD scalar shift instructions //---------------------------------------------------------------------------- -defm FCVTZS : SIMDScalarRShiftSD<0, 0b11111, "fcvtzs">; -defm FCVTZU : SIMDScalarRShiftSD<1, 0b11111, "fcvtzu">; -defm SCVTF : SIMDScalarRShiftSD<0, 0b11100, "scvtf">; -defm UCVTF : SIMDScalarRShiftSD<1, 0b11100, "ucvtf">; +defm FCVTZS : SIMDFPScalarRShift<0, 0b11111, "fcvtzs">; +defm FCVTZU : SIMDFPScalarRShift<1, 0b11111, "fcvtzu">; +defm SCVTF : SIMDFPScalarRShift<0, 0b11100, "scvtf">; +defm UCVTF : SIMDFPScalarRShift<1, 0b11100, "ucvtf">; // Codegen patterns for the above. We don't put these directly on the // instructions because TableGen's type inference can't handle the truth. // Having the same base pattern for fp <--> int totally freaks it out. 
@@ -4602,7 +4647,7 @@ defm USRA : SIMDScalarRShiftDTied< 1, 0b00010, "usra", //---------------------------------------------------------------------------- defm FCVTZS:SIMDVectorRShiftSD<0, 0b11111, "fcvtzs", int_aarch64_neon_vcvtfp2fxs>; defm FCVTZU:SIMDVectorRShiftSD<1, 0b11111, "fcvtzu", int_aarch64_neon_vcvtfp2fxu>; -defm SCVTF: SIMDVectorRShiftSDToFP<0, 0b11100, "scvtf", +defm SCVTF: SIMDVectorRShiftToFP<0, 0b11100, "scvtf", int_aarch64_neon_vcvtfxs2fp>; defm RSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10001, "rshrn", int_aarch64_neon_rshrn>; @@ -4637,7 +4682,7 @@ defm SSHLL : SIMDVectorLShiftLongBHSD<0, 0b10100, "sshll", defm SSHR : SIMDVectorRShiftBHSD<0, 0b00000, "sshr", AArch64vashr>; defm SSRA : SIMDVectorRShiftBHSDTied<0, 0b00010, "ssra", TriOpFrag<(add node:$LHS, (AArch64vashr node:$MHS, node:$RHS))>>; -defm UCVTF : SIMDVectorRShiftSDToFP<1, 0b11100, "ucvtf", +defm UCVTF : SIMDVectorRShiftToFP<1, 0b11100, "ucvtf", int_aarch64_neon_vcvtfxu2fp>; defm UQRSHRN : SIMDVectorRShiftNarrowBHS<1, 0b10011, "uqrshrn", int_aarch64_neon_uqrshrn>; diff --git a/lib/Target/AArch64/AArch64RegisterInfo.cpp b/lib/Target/AArch64/AArch64RegisterInfo.cpp index 1aef31baad20..763b2337de12 100644 --- a/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -46,6 +46,8 @@ AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { return CSR_AArch64_NoRegs_SaveList; if (MF->getFunction()->getCallingConv() == CallingConv::AnyReg) return CSR_AArch64_AllRegs_SaveList; + if (MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS) + return CSR_AArch64_CXX_TLS_Darwin_SaveList; else return CSR_AArch64_AAPCS_SaveList; } @@ -58,6 +60,8 @@ AArch64RegisterInfo::getCallPreservedMask(const MachineFunction &MF, return CSR_AArch64_NoRegs_RegMask; if (CC == CallingConv::AnyReg) return CSR_AArch64_AllRegs_RegMask; + if (CC == CallingConv::CXX_FAST_TLS) + return CSR_AArch64_CXX_TLS_Darwin_RegMask; else return CSR_AArch64_AAPCS_RegMask; } diff --git a/lib/Target/AArch64/AArch64RegisterInfo.td b/lib/Target/AArch64/AArch64RegisterInfo.td index b2efca023372..a8c8b176efa9 100644 --- a/lib/Target/AArch64/AArch64RegisterInfo.td +++ b/lib/Target/AArch64/AArch64RegisterInfo.td @@ -407,7 +407,7 @@ def FPR128 : RegisterClass<"AArch64", // The lower 16 vector registers. Some instructions can only take registers // in this range. def FPR128_lo : RegisterClass<"AArch64", - [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, v8f16], 128, (trunc FPR128, 16)>; // Pairs, triples, and quads of 64-bit vector registers. diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 81402a854f6a..9af0e6444789 100644 --- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -448,7 +448,7 @@ int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, if (Factor <= TLI->getMaxSupportedInterleaveFactor()) { unsigned NumElts = VecTy->getVectorNumElements(); Type *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor); - unsigned SubVecSize = DL.getTypeAllocSizeInBits(SubVecTy); + unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy); // ldN/stN only support legal vector types of size 64 or 128 in bits. 
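// --- Illustrative sketch, not part of the patch -----------------------------
// Why the interleaved-access cost check above switches to getTypeSizeInBits:
// getTypeAllocSizeInBits rounds the size up to the ABI alignment, so an
// odd-sized sub-vector (for example a 96-bit <3 x float>) typically reports a
// 128-bit alloc size and would wrongly pass the ldN/stN legality test. The
// helper below is hypothetical and only restates the intended check.
static bool isLegalLdStNSubVector(const DataLayout &DL, VectorType *SubVecTy) {
  unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy); // exact size, no rounding
  return SubVecSize == 64 || SubVecSize == 128;         // widths ldN/stN support
}
// -----------------------------------------------------------------------------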
if (NumElts % Factor == 0 && (SubVecSize == 64 || SubVecSize == 128)) @@ -538,7 +538,7 @@ bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, case Intrinsic::aarch64_neon_ld4: Info.ReadMem = true; Info.WriteMem = false; - Info.Vol = false; + Info.IsSimple = true; Info.NumMemRefs = 1; Info.PtrVal = Inst->getArgOperand(0); break; @@ -547,7 +547,7 @@ bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, case Intrinsic::aarch64_neon_st4: Info.ReadMem = false; Info.WriteMem = true; - Info.Vol = false; + Info.IsSimple = true; Info.NumMemRefs = 1; Info.PtrVal = Inst->getArgOperand(Inst->getNumArgOperands() - 1); break; diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index f0ad855ed5e6..394c8e78581f 100644 --- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -1921,6 +1921,8 @@ static bool isValidVectorKind(StringRef Name) { .Case(".h", true) .Case(".s", true) .Case(".d", true) + // Needed for fp16 scalar pairwise reductions + .Case(".2h", true) .Default(false); } diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h index a620e85101e6..fc0530dc4f24 100644 --- a/lib/Target/AMDGPU/AMDGPU.h +++ b/lib/Target/AMDGPU/AMDGPU.h @@ -92,8 +92,6 @@ enum TargetIndex { }; } -#define END_OF_TEXT_LABEL_NAME "EndOfTextLabel" - } // End namespace llvm namespace ShaderType { diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index b677caa6c2c6..b4403432f3fc 100644 --- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -100,16 +100,6 @@ void AMDGPUAsmPrinter::EmitFunctionBodyStart() { } } -void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) { - - // This label is used to mark the end of the .text section. 
- const TargetLoweringObjectFile &TLOF = getObjFileLowering(); - OutStreamer->SwitchSection(TLOF.getTextSection()); - MCSymbol *EndOfTextLabel = - OutContext.getOrCreateSymbol(StringRef(END_OF_TEXT_LABEL_NAME)); - OutStreamer->EmitLabel(EndOfTextLabel); -} - void AMDGPUAsmPrinter::EmitFunctionEntryLabel() { const SIMachineFunctionInfo *MFI = MF->getInfo(); const AMDGPUSubtarget &STM = MF->getSubtarget(); diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h index 1aaef00a4dd0..817cbfc0c0eb 100644 --- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h +++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h @@ -99,8 +99,6 @@ class AMDGPUAsmPrinter : public AsmPrinter { void EmitFunctionBodyStart() override; - void EmitEndOfAsmFile(Module &M) override; - void EmitFunctionEntryLabel() override; void EmitGlobalVariable(const GlobalVariable *GV) override; diff --git a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp index 2e0283f040dd..8fe8a93dd758 100644 --- a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp +++ b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp @@ -73,13 +73,6 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { MCOp = MCOperand::createExpr(MCSymbolRefExpr::create(Sym, Ctx)); break; } - case MachineOperand::MO_TargetIndex: { - assert(MO.getIndex() == AMDGPU::TI_CONSTDATA_START); - MCSymbol *Sym = Ctx.getOrCreateSymbol(StringRef(END_OF_TEXT_LABEL_NAME)); - const MCSymbolRefExpr *Expr = MCSymbolRefExpr::create(Sym, Ctx); - MCOp = MCOperand::createExpr(Expr); - break; - } case MachineOperand::MO_ExternalSymbol: { MCSymbol *Sym = Ctx.getOrCreateSymbol(StringRef(MO.getSymbolName())); const MCSymbolRefExpr *Expr = MCSymbolRefExpr::create(Sym, Ctx); diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 7b0445db4df2..a22933ecf8a6 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -14,7 +14,7 @@ //===----------------------------------------------------------------------===// #include "AMDGPUTargetMachine.h" -#include "AMDGPUHSATargetObjectFile.h" +#include "AMDGPUTargetObjectFile.h" #include "AMDGPU.h" #include "AMDGPUTargetTransformInfo.h" #include "R600ISelLowering.h" @@ -57,7 +57,7 @@ static std::unique_ptr createTLOF(const Triple &TT) { if (TT.getOS() == Triple::AMDHSA) return make_unique(); - return make_unique(); + return make_unique(); } static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) { diff --git a/lib/Target/AMDGPU/AMDGPUHSATargetObjectFile.cpp b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp similarity index 73% rename from lib/Target/AMDGPU/AMDGPUHSATargetObjectFile.cpp rename to lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp index 32f53edeb770..48bb2641e072 100644 --- a/lib/Target/AMDGPU/AMDGPUHSATargetObjectFile.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp @@ -7,7 +7,7 @@ // //===----------------------------------------------------------------------===// -#include "AMDGPUHSATargetObjectFile.h" +#include "AMDGPUTargetObjectFile.h" #include "AMDGPU.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/MC/MCContext.h" @@ -16,6 +16,25 @@ using namespace llvm; +//===----------------------------------------------------------------------===// +// Generic Object File +//===----------------------------------------------------------------------===// + +MCSection *AMDGPUTargetObjectFile::SelectSectionForGlobal(const GlobalValue *GV, + SectionKind Kind, + Mangler &Mang, + const TargetMachine &TM) 
const { + if (Kind.isReadOnly() && AMDGPU::isReadOnlySegment(GV)) + return TextSection; + + return TargetLoweringObjectFileELF::SelectSectionForGlobal(GV, Kind, Mang, TM); +} + +//===----------------------------------------------------------------------===// +// HSA Object File +//===----------------------------------------------------------------------===// + + void AMDGPUHSATargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM){ TargetLoweringObjectFileELF::Initialize(Ctx, TM); diff --git a/lib/Target/AMDGPU/AMDGPUHSATargetObjectFile.h b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h similarity index 66% rename from lib/Target/AMDGPU/AMDGPUHSATargetObjectFile.h rename to lib/Target/AMDGPU/AMDGPUTargetObjectFile.h index 9ea51ec9b29e..921341ebb897 100644 --- a/lib/Target/AMDGPU/AMDGPUHSATargetObjectFile.h +++ b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h @@ -1,4 +1,4 @@ -//===-- AMDGPUHSATargetObjectFile.h - AMDGPU HSA Object Info ----*- C++ -*-===// +//===-- AMDGPUTargetObjectFile.h - AMDGPU Object Info ----*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -9,19 +9,26 @@ /// /// \file /// \brief This file declares the AMDGPU-specific subclass of -/// TargetLoweringObjectFile use for targeting the HSA-runtime. +/// TargetLoweringObjectFile. /// //===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUHSATARGETOBJECTFILE_H -#define LLVM_LIB_TARGET_AMDGPU_AMDGPUHSATARGETOBJECTFILE_H +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETOBJECTFILE_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETOBJECTFILE_H #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" #include "llvm/Target/TargetMachine.h" namespace llvm { -class AMDGPUHSATargetObjectFile final : public TargetLoweringObjectFileELF { +class AMDGPUTargetObjectFile : public TargetLoweringObjectFileELF { + public: + MCSection *SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind, + Mangler &Mang, + const TargetMachine &TM) const override; +}; + +class AMDGPUHSATargetObjectFile final : public AMDGPUTargetObjectFile { private: MCSection *DataGlobalAgentSection; MCSection *DataGlobalProgramSection; diff --git a/lib/Target/AMDGPU/CMakeLists.txt b/lib/Target/AMDGPU/CMakeLists.txt index 64c9e1882e4f..3a51a5f5e103 100644 --- a/lib/Target/AMDGPU/CMakeLists.txt +++ b/lib/Target/AMDGPU/CMakeLists.txt @@ -19,7 +19,7 @@ add_llvm_target(AMDGPUCodeGen AMDGPUAsmPrinter.cpp AMDGPUDiagnosticInfoUnsupported.cpp AMDGPUFrameLowering.cpp - AMDGPUHSATargetObjectFile.cpp + AMDGPUTargetObjectFile.cpp AMDGPUIntrinsicInfo.cpp AMDGPUISelDAGToDAG.cpp AMDGPUMCInstLower.cpp diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp index 4434d9b119c6..60e8c8f3d303 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp @@ -99,14 +99,22 @@ void AMDGPUAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, case AMDGPU::fixup_si_rodata: { uint32_t *Dst = (uint32_t*)(Data + Fixup.getOffset()); - *Dst = Value; - break; - } - - case AMDGPU::fixup_si_end_of_text: { - uint32_t *Dst = (uint32_t*)(Data + Fixup.getOffset()); - // The value points to the last instruction in the text section, so we - // need to add 4 bytes to get to the start of the constants. 
+ // We emit constant data at the end of the text section and generate its + // address using the following code sequence: + // s_getpc_b64 s[0:1] + // s_add_u32 s0, s0, $symbol + // s_addc_u32 s1, s1, 0 + // + // s_getpc_b64 returns the address of the s_add_u32 instruction and then + // the fixup replaces $symbol with a literal constant, which is a + // pc-relative offset from the encoding of the $symbol operand to the + // constant data. + // + // What we want here is an offset from the start of the s_add_u32 + // instruction to the constant data, but since the encoding of $symbol + // starts 4 bytes after the start of the add instruction, we end up + // with an offset that is 4 bytes too small. This requires us to + // add 4 to the fixup value before applying it. *Dst = Value + 4; break; } @@ -136,8 +144,7 @@ const MCFixupKindInfo &AMDGPUAsmBackend::getFixupKindInfo( const static MCFixupKindInfo Infos[AMDGPU::NumTargetFixupKinds] = { // name offset bits flags { "fixup_si_sopp_br", 0, 16, MCFixupKindInfo::FKF_IsPCRel }, - { "fixup_si_rodata", 0, 32, 0 }, - { "fixup_si_end_of_text", 0, 32, MCFixupKindInfo::FKF_IsPCRel } + { "fixup_si_rodata", 0, 32, MCFixupKindInfo::FKF_IsPCRel } }; if (Kind < FirstTargetFixupKind) diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h index 01021d67ffd9..59a9178082f6 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h @@ -21,9 +21,6 @@ enum Fixups { /// fixup for global addresses with constant initializers fixup_si_rodata, - /// fixup for offset from instruction to end of text section - fixup_si_end_of_text, - // Marker LastTargetFixupKind, NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind diff --git a/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp index 65a0eeba2b16..9eb3dadbc5e2 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp @@ -36,7 +36,6 @@ class SIMCCodeEmitter : public AMDGPUMCCodeEmitter { void operator=(const SIMCCodeEmitter &) = delete; const MCInstrInfo &MCII; const MCRegisterInfo &MRI; - MCContext &Ctx; /// \brief Can this operand also contain immediate values? bool isSrcOperand(const MCInstrDesc &Desc, unsigned OpNo) const; @@ -47,7 +46,7 @@ class SIMCCodeEmitter : public AMDGPUMCCodeEmitter { public: SIMCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri, MCContext &ctx) - : MCII(mcii), MRI(mri), Ctx(ctx) { } + : MCII(mcii), MRI(mri) { } ~SIMCCodeEmitter() override {} @@ -250,17 +249,7 @@ uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI, if (MO.isExpr()) { const MCSymbolRefExpr *Expr = cast(MO.getExpr()); - MCFixupKind Kind; - const MCSymbol *Sym = - Ctx.getOrCreateSymbol(StringRef(END_OF_TEXT_LABEL_NAME)); - - if (&Expr->getSymbol() == Sym) { - // Add the offset to the beginning of the constant values. - Kind = (MCFixupKind)AMDGPU::fixup_si_end_of_text; - } else { - // This is used for constant data stored in .rodata. 
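// --- Illustrative sketch, not part of the patch -----------------------------
// Restating, in code form, the pc-relative rodata fixup comment added to
// AMDGPUAsmBackend::applyFixup earlier in this patch. The assembler computes
// the distance from the encoding of the $symbol literal to the constant data,
// but the hardware adds the literal to the address returned by s_getpc_b64,
// which is the start of the s_add_u32 instruction, 4 bytes earlier. The value
// is therefore biased by 4 before being patched in. Names are hypothetical.
static uint32_t biasRodataFixup(uint64_t ValueFromSymbolEncoding) {
  // The s_add_u32 opcode bytes precede the 32-bit literal the fixup patches.
  const uint64_t LiteralOffsetInAdd = 4;
  return static_cast<uint32_t>(ValueFromSymbolEncoding + LiteralOffsetInAdd);
}
// -----------------------------------------------------------------------------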
- Kind = (MCFixupKind)AMDGPU::fixup_si_rodata; - } + MCFixupKind Kind = (MCFixupKind)AMDGPU::fixup_si_rodata; Fixups.push_back(MCFixup::create(4, Expr, Kind, MI.getLoc())); } diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index 2cb801a707e1..8dc3934b8cbf 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1094,20 +1094,8 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, const GlobalValue *GV = GSD->getGlobal(); MVT PtrVT = getPointerTy(DAG.getDataLayout(), GSD->getAddressSpace()); - SDValue Ptr = DAG.getNode(AMDGPUISD::CONST_DATA_PTR, DL, PtrVT); SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32); - - SDValue PtrLo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Ptr, - DAG.getConstant(0, DL, MVT::i32)); - SDValue PtrHi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Ptr, - DAG.getConstant(1, DL, MVT::i32)); - - SDValue Lo = DAG.getNode(ISD::ADDC, DL, DAG.getVTList(MVT::i32, MVT::Glue), - PtrLo, GA); - SDValue Hi = DAG.getNode(ISD::ADDE, DL, DAG.getVTList(MVT::i32, MVT::Glue), - PtrHi, DAG.getConstant(0, DL, MVT::i32), - SDValue(Lo.getNode(), 1)); - return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Lo, Hi); + return DAG.getNode(AMDGPUISD::CONST_DATA_PTR, DL, PtrVT, GA); } SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL, @@ -2436,13 +2424,41 @@ std::pair SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { - if (Constraint == "r") { - switch(VT.SimpleTy) { - default: llvm_unreachable("Unhandled type for 'r' inline asm constraint"); - case MVT::i64: - return std::make_pair(0U, &AMDGPU::SGPR_64RegClass); - case MVT::i32: + + if (Constraint.size() == 1) { + switch (Constraint[0]) { + case 's': + case 'r': + switch (VT.getSizeInBits()) { + default: + return std::make_pair(0U, nullptr); + case 32: return std::make_pair(0U, &AMDGPU::SGPR_32RegClass); + case 64: + return std::make_pair(0U, &AMDGPU::SGPR_64RegClass); + case 128: + return std::make_pair(0U, &AMDGPU::SReg_128RegClass); + case 256: + return std::make_pair(0U, &AMDGPU::SReg_256RegClass); + } + + case 'v': + switch (VT.getSizeInBits()) { + default: + return std::make_pair(0U, nullptr); + case 32: + return std::make_pair(0U, &AMDGPU::VGPR_32RegClass); + case 64: + return std::make_pair(0U, &AMDGPU::VReg_64RegClass); + case 96: + return std::make_pair(0U, &AMDGPU::VReg_96RegClass); + case 128: + return std::make_pair(0U, &AMDGPU::VReg_128RegClass); + case 256: + return std::make_pair(0U, &AMDGPU::VReg_256RegClass); + case 512: + return std::make_pair(0U, &AMDGPU::VReg_512RegClass); + } } } @@ -2463,3 +2479,16 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, } return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); } + +SITargetLowering::ConstraintType +SITargetLowering::getConstraintType(StringRef Constraint) const { + if (Constraint.size() == 1) { + switch (Constraint[0]) { + default: break; + case 's': + case 'v': + return C_RegisterClass; + } + } + return TargetLowering::getConstraintType(Constraint); +} diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h index b9f75cd11de0..4079be65d280 100644 --- a/lib/Target/AMDGPU/SIISelLowering.h +++ b/lib/Target/AMDGPU/SIISelLowering.h @@ -121,6 +121,7 @@ class SITargetLowering : public AMDGPUTargetLowering { std::pair getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT 
VT) const override; + ConstraintType getConstraintType(StringRef Constraint) const override; SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL, SDValue V) const; }; diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp index a3a2d8c01eb5..65c4d032a510 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -762,26 +762,6 @@ bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { switch (MI->getOpcode()) { default: return AMDGPUInstrInfo::expandPostRAPseudo(MI); - case AMDGPU::SI_CONSTDATA_PTR: { - unsigned Reg = MI->getOperand(0).getReg(); - unsigned RegLo = RI.getSubReg(Reg, AMDGPU::sub0); - unsigned RegHi = RI.getSubReg(Reg, AMDGPU::sub1); - - BuildMI(MBB, MI, DL, get(AMDGPU::S_GETPC_B64), Reg); - - // Add 32-bit offset from this instruction to the start of the constant data. - BuildMI(MBB, MI, DL, get(AMDGPU::S_ADD_U32), RegLo) - .addReg(RegLo) - .addTargetIndex(AMDGPU::TI_CONSTDATA_START) - .addReg(AMDGPU::SCC, RegState::Define | RegState::Implicit); - BuildMI(MBB, MI, DL, get(AMDGPU::S_ADDC_U32), RegHi) - .addReg(RegHi) - .addImm(0) - .addReg(AMDGPU::SCC, RegState::Define | RegState::Implicit) - .addReg(AMDGPU::SCC, RegState::Implicit); - MI->eraseFromParent(); - break; - } case AMDGPU::SGPR_USE: // This is just a placeholder for register allocation. MI->eraseFromParent(); @@ -835,6 +815,34 @@ bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { MI->eraseFromParent(); break; } + + case AMDGPU::SI_CONSTDATA_PTR: { + const SIRegisterInfo *TRI = + static_cast(ST.getRegisterInfo()); + MachineFunction &MF = *MBB.getParent(); + unsigned Reg = MI->getOperand(0).getReg(); + unsigned RegLo = TRI->getSubReg(Reg, AMDGPU::sub0); + unsigned RegHi = TRI->getSubReg(Reg, AMDGPU::sub1); + + // Create a bundle so these instructions won't be re-ordered by the + // post-RA scheduler. + MIBundleBuilder Bundler(MBB, MI); + Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg)); + + // Add 32-bit offset from this instruction to the start of the + // constant data. 
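// A hypothetical source-level sketch for the 's'/'v' inline-asm constraints
// added in SIISelLowering.cpp above: with those letters recognized as
// register-class constraints, a kernel can ask for a scalar input and a
// vector output by value size. v_mov_b32 is a real VOP1 move; the function
// and variable names are made up and this is not something the patch adds.
static float splatToVGPR(float UniformX) {
  float V;
  __asm__("v_mov_b32 %0, %1" : "=v"(V) : "s"(UniformX));
  return V;
}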
+ Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo) + .addReg(RegLo) + .addOperand(MI->getOperand(1))); + Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi) + .addReg(RegHi) + .addImm(0)); + + llvm::finalizeBundle(MBB, Bundler.begin()); + + MI->eraseFromParent(); + break; + } } return true; } diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td index d6c8a3d29513..c57d0c07aab1 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.td +++ b/lib/Target/AMDGPU/SIInstrInfo.td @@ -133,7 +133,8 @@ def SIsampled : SDSample<"AMDGPUISD::SAMPLED">; def SIsamplel : SDSample<"AMDGPUISD::SAMPLEL">; def SIconstdata_ptr : SDNode< - "AMDGPUISD::CONST_DATA_PTR", SDTypeProfile <1, 0, [SDTCisVT<0, i64>]> + "AMDGPUISD::CONST_DATA_PTR", SDTypeProfile <1, 1, [SDTCisVT<0, i64>, + SDTCisVT<0, i64>]> >; //===----------------------------------------------------------------------===// @@ -366,6 +367,8 @@ def sopp_brtarget : Operand { let ParserMatchClass = SoppBrTarget; } +def const_ga : Operand; + include "SIInstrFormats.td" include "VIInstrFormats.td" diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td index 2cee993d751c..dcc74f77c146 100644 --- a/lib/Target/AMDGPU/SIInstructions.td +++ b/lib/Target/AMDGPU/SIInstructions.td @@ -2039,8 +2039,8 @@ let Defs = [SCC] in { def SI_CONSTDATA_PTR : InstSI < (outs SReg_64:$dst), - (ins), - "", [(set SReg_64:$dst, (i64 SIconstdata_ptr))] + (ins const_ga:$ptr), + "", [(set SReg_64:$dst, (i64 (SIconstdata_ptr (tglobaladdr:$ptr))))] > { let SALU = 1; } diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td index dd33c3614b1a..a44dc830a673 100644 --- a/lib/Target/ARM/ARM.td +++ b/lib/Target/ARM/ARM.td @@ -585,6 +585,7 @@ def : ProcessorModel<"cortex-r7", CortexA8Model, [ARMv7r, ProcR7, FeatureVFP3, FeatureVFPOnlySP, FeatureD16, + FeatureFP16, FeatureMP, FeatureSlowFPBrcc, FeatureHWDivARM, diff --git a/lib/Target/ARM/ARMAsmPrinter.cpp b/lib/Target/ARM/ARMAsmPrinter.cpp index 67ebfa2b581d..61141c0031df 100644 --- a/lib/Target/ARM/ARMAsmPrinter.cpp +++ b/lib/Target/ARM/ARMAsmPrinter.cpp @@ -60,7 +60,7 @@ using namespace llvm; ARMAsmPrinter::ARMAsmPrinter(TargetMachine &TM, std::unique_ptr Streamer) : AsmPrinter(TM, std::move(Streamer)), AFI(nullptr), MCP(nullptr), - InConstantPool(false) {} + InConstantPool(false), OptimizationGoals(-1) {} void ARMAsmPrinter::EmitFunctionBodyEnd() { // Make sure to terminate any constant pools that were at the end @@ -106,9 +106,38 @@ bool ARMAsmPrinter::runOnMachineFunction(MachineFunction &MF) { Subtarget = &MF.getSubtarget(); SetupMachineFunction(MF); + const Function* F = MF.getFunction(); + const TargetMachine& TM = MF.getTarget(); + + // Calculate this function's optimization goal. 
+ unsigned OptimizationGoal; + if (F->hasFnAttribute(Attribute::OptimizeNone)) + // For best debugging illusion, speed and small size sacrificed + OptimizationGoal = 6; + else if (F->optForMinSize()) + // Aggressively for small size, speed and debug illusion sacrificed + OptimizationGoal = 4; + else if (F->optForSize()) + // For small size, but speed and debugging illusion preserved + OptimizationGoal = 3; + else if (TM.getOptLevel() == CodeGenOpt::Aggressive) + // Aggressively for speed, small size and debug illusion sacrificed + OptimizationGoal = 2; + else if (TM.getOptLevel() > CodeGenOpt::None) + // For speed, but small size and good debug illusion preserved + OptimizationGoal = 1; + else // TM.getOptLevel() == CodeGenOpt::None + // For good debugging, but speed and small size preserved + OptimizationGoal = 5; + + // Combine a new optimization goal with existing ones. + if (OptimizationGoals == -1) // uninitialized goals + OptimizationGoals = OptimizationGoal; + else if (OptimizationGoals != (int)OptimizationGoal) // conflicting goals + OptimizationGoals = 0; if (Subtarget->isTargetCOFF()) { - bool Internal = MF.getFunction()->hasInternalLinkage(); + bool Internal = F->hasInternalLinkage(); COFF::SymbolStorageClass Scl = Internal ? COFF::IMAGE_SYM_CLASS_STATIC : COFF::IMAGE_SYM_CLASS_EXTERNAL; int Type = COFF::IMAGE_SYM_DTYPE_FUNCTION << COFF::SCT_COMPLEX_TYPE_SHIFT; @@ -506,6 +535,16 @@ void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) { // generates code that does this, it is always safe to set. OutStreamer->EmitAssemblerFlag(MCAF_SubsectionsViaSymbols); } + + // The last attribute to be emitted is ABI_optimization_goals + MCTargetStreamer &TS = *OutStreamer->getTargetStreamer(); + ARMTargetStreamer &ATS = static_cast(TS); + + if (OptimizationGoals > 0) + ATS.emitAttribute(ARMBuildAttrs::ABI_optimization_goals, OptimizationGoals); + OptimizationGoals = -1; + + ATS.finishAttributeSection(); } //===----------------------------------------------------------------------===// @@ -798,8 +837,6 @@ void ARMAsmPrinter::emitAttributes() { else if (STI.hasVirtualization()) ATS.emitAttribute(ARMBuildAttrs::Virtualization_use, ARMBuildAttrs::AllowVirtualization); - - ATS.finishAttributeSection(); } //===----------------------------------------------------------------------===// diff --git a/lib/Target/ARM/ARMAsmPrinter.h b/lib/Target/ARM/ARMAsmPrinter.h index fb925f162f70..ed7be2de51ca 100644 --- a/lib/Target/ARM/ARMAsmPrinter.h +++ b/lib/Target/ARM/ARMAsmPrinter.h @@ -51,6 +51,11 @@ class LLVM_LIBRARY_VISIBILITY ARMAsmPrinter : public AsmPrinter { /// labels used for ARMv4t thumb code to make register indirect calls. 
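// A minimal sketch (hypothetical helper, not part of the patch) of the
// goal-merging rule used in runOnMachineFunction above: the module-wide value
// starts at -1, adopts the first function's goal, and collapses to 0 as soon
// as two functions disagree; EmitEndOfAsmFile only emits
// ABI_optimization_goals when the merged value is positive.
static int combineGoal(int ModuleGoals, unsigned FnGoal) {
  if (ModuleGoals == -1)                                 // uninitialized goals
    return FnGoal;
  return ModuleGoals == (int)FnGoal ? ModuleGoals : 0;   // 0 == conflicting
}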
SmallVector, 4> ThumbIndirectPads; + /// OptimizationGoals - Maintain a combined optimization goal for all + /// functions in a module: one of Tag_ABI_optimization_goals values, + /// -1 if uninitialized, 0 if conflicting goals + int OptimizationGoals; + public: explicit ARMAsmPrinter(TargetMachine &TM, std::unique_ptr Streamer); diff --git a/lib/Target/ARM/ARMCallingConv.h b/lib/Target/ARM/ARMCallingConv.h index 3d216c0ed04a..a731d00883a1 100644 --- a/lib/Target/ARM/ARMCallingConv.h +++ b/lib/Target/ARM/ARMCallingConv.h @@ -160,15 +160,15 @@ static bool RetCC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT, State); } -static const uint16_t RRegList[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 }; +static const MCPhysReg RRegList[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 }; -static const uint16_t SRegList[] = { ARM::S0, ARM::S1, ARM::S2, ARM::S3, - ARM::S4, ARM::S5, ARM::S6, ARM::S7, - ARM::S8, ARM::S9, ARM::S10, ARM::S11, - ARM::S12, ARM::S13, ARM::S14, ARM::S15 }; -static const uint16_t DRegList[] = { ARM::D0, ARM::D1, ARM::D2, ARM::D3, - ARM::D4, ARM::D5, ARM::D6, ARM::D7 }; -static const uint16_t QRegList[] = { ARM::Q0, ARM::Q1, ARM::Q2, ARM::Q3 }; +static const MCPhysReg SRegList[] = { ARM::S0, ARM::S1, ARM::S2, ARM::S3, + ARM::S4, ARM::S5, ARM::S6, ARM::S7, + ARM::S8, ARM::S9, ARM::S10, ARM::S11, + ARM::S12, ARM::S13, ARM::S14, ARM::S15 }; +static const MCPhysReg DRegList[] = { ARM::D0, ARM::D1, ARM::D2, ARM::D3, + ARM::D4, ARM::D5, ARM::D6, ARM::D7 }; +static const MCPhysReg QRegList[] = { ARM::Q0, ARM::Q1, ARM::Q2, ARM::Q3 }; // Allocate part of an AAPCS HFA or HVA. We assume that each member of the HA @@ -203,7 +203,7 @@ static bool CC_ARM_AAPCS_Custom_Aggregate(unsigned &ValNo, MVT &ValVT, unsigned StackAlign = DL.getStackAlignment(); unsigned Align = std::min(PendingMembers[0].getExtraInfo(), StackAlign); - ArrayRef RegList; + ArrayRef RegList; switch (LocVT.SimpleTy) { case MVT::i32: { RegList = RRegList; diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp index 175107450fc0..9bdf823c85bd 100644 --- a/lib/Target/ARM/ARMFastISel.cpp +++ b/lib/Target/ARM/ARMFastISel.cpp @@ -3036,7 +3036,7 @@ bool ARMFastISel::fastLowerArguments() { } - static const uint16_t GPRArgRegs[] = { + static const MCPhysReg GPRArgRegs[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 }; diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp index cd73021dfc62..024244092a34 100644 --- a/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -548,8 +548,11 @@ bool ARMDAGToDAGISel::SelectImmShifterOperand(SDValue N, unsigned PowerOfTwo = 0; SDValue NewMulConst; if (canExtractShiftFromMul(N, 31, PowerOfTwo, NewMulConst)) { + BaseReg = SDValue(Select(CurDAG->getNode(ISD::MUL, SDLoc(N), MVT::i32, + N.getOperand(0), NewMulConst) + .getNode()), + 0); replaceDAGValue(N.getOperand(1), NewMulConst); - BaseReg = N; Opc = CurDAG->getTargetConstant(ARM_AM::getSORegOpc(ARM_AM::lsl, PowerOfTwo), SDLoc(N), MVT::i32); diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index 33f74a3ba9fd..cc9656aa0b4f 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -4139,6 +4139,56 @@ static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl &Results, Results.push_back(Read.getOperand(0)); } +/// \p BC is a bitcast that is about to be turned into a VMOVDRR. 
+/// When \p DstVT, the destination type of \p BC, is on the vector +/// register bank and the source of bitcast, \p Op, operates on the same bank, +/// it might be possible to combine them, such that everything stays on the +/// vector register bank. +/// \p return The node that would replace \p BT, if the combine +/// is possible. +static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC, + SelectionDAG &DAG) { + SDValue Op = BC->getOperand(0); + EVT DstVT = BC->getValueType(0); + + // The only vector instruction that can produce a scalar (remember, + // since the bitcast was about to be turned into VMOVDRR, the source + // type is i64) from a vector is EXTRACT_VECTOR_ELT. + // Moreover, we can do this combine only if there is one use. + // Finally, if the destination type is not a vector, there is not + // much point on forcing everything on the vector bank. + if (!DstVT.isVector() || Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + !Op.hasOneUse()) + return SDValue(); + + // If the index is not constant, we will introduce an additional + // multiply that will stick. + // Give up in that case. + ConstantSDNode *Index = dyn_cast(Op.getOperand(1)); + if (!Index) + return SDValue(); + unsigned DstNumElt = DstVT.getVectorNumElements(); + + // Compute the new index. + const APInt &APIntIndex = Index->getAPIntValue(); + APInt NewIndex(APIntIndex.getBitWidth(), DstNumElt); + NewIndex *= APIntIndex; + // Check if the new constant index fits into i32. + if (NewIndex.getBitWidth() > 32) + return SDValue(); + + // vMTy bitcast(i64 extractelt vNi64 src, i32 index) -> + // vMTy extractsubvector vNxMTy (bitcast vNi64 src), i32 index*M) + SDLoc dl(Op); + SDValue ExtractSrc = Op.getOperand(0); + EVT VecVT = EVT::getVectorVT( + *DAG.getContext(), DstVT.getScalarType(), + ExtractSrc.getValueType().getVectorNumElements() * DstNumElt); + SDValue BitCast = DAG.getNode(ISD::BITCAST, dl, VecVT, ExtractSrc); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, BitCast, + DAG.getConstant(NewIndex.getZExtValue(), dl, MVT::i32)); +} + /// ExpandBITCAST - If the target supports VFP, this function is called to /// expand a bit convert where either the source or destination type is i64 to /// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64 @@ -4158,6 +4208,11 @@ static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG) { // Turn i64->f64 into VMOVDRR. if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) { + // Do not force values to GPRs (this is what VMOVDRR does for the inputs) + // if we can combine the bitcast with its source. + if (SDValue Val = CombineVMOVDRRCandidateWithVecOp(N, DAG)) + return Val; + SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op, DAG.getConstant(0, dl, MVT::i32)); SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op, @@ -11684,7 +11739,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.opc = ISD::INTRINSIC_W_CHAIN; // Conservatively set memVT to the entire set of vectors loaded. 
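// A worked example (hypothetical types, not part of the patch) for
// CombineVMOVDRRCandidateWithVecOp above: bitcasting
// (extractelement <2 x i64> %src, i32 1) to v2f32 becomes an
// EXTRACT_SUBVECTOR of (bitcast <2 x i64> %src to <4 x f32>) at the scaled
// index computed below.
unsigned DstNumElt = 2;                     // v2f32 destination
unsigned OldIndex = 1;                      // index into <2 x i64>
unsigned NewIndex = OldIndex * DstNumElt;   // == 2, index into v4f32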
auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); - uint64_t NumElts = DL.getTypeAllocSize(I.getType()) / 8; + uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64; Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; @@ -11710,7 +11765,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Type *ArgTy = I.getArgOperand(ArgI)->getType(); if (!ArgTy->isVectorTy()) break; - NumElts += DL.getTypeAllocSize(ArgTy) / 8; + NumElts += DL.getTypeSizeInBits(ArgTy) / 64; } Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); Info.ptrVal = I.getArgOperand(0); @@ -12053,8 +12108,8 @@ bool ARMTargetLowering::lowerInterleavedLoad( Type *EltTy = VecTy->getVectorElementType(); const DataLayout &DL = LI->getModule()->getDataLayout(); - unsigned VecSize = DL.getTypeAllocSizeInBits(VecTy); - bool EltIs64Bits = DL.getTypeAllocSizeInBits(EltTy) == 64; + unsigned VecSize = DL.getTypeSizeInBits(VecTy); + bool EltIs64Bits = DL.getTypeSizeInBits(EltTy) == 64; // Skip if we do not have NEON and skip illegal vector types and vector types // with i64/f64 elements (vldN doesn't support i64/f64 elements). @@ -12143,8 +12198,8 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI, VectorType *SubVecTy = VectorType::get(EltTy, NumSubElts); const DataLayout &DL = SI->getModule()->getDataLayout(); - unsigned SubVecSize = DL.getTypeAllocSizeInBits(SubVecTy); - bool EltIs64Bits = DL.getTypeAllocSizeInBits(EltTy) == 64; + unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy); + bool EltIs64Bits = DL.getTypeSizeInBits(EltTy) == 64; // Skip if we do not have NEON and skip illegal vector types and vector types // with i64/f64 elements (vstN doesn't support i64/f64 elements). diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td index 4c7107aee6a2..2aa9475e6f47 100644 --- a/lib/Target/ARM/ARMInstrInfo.td +++ b/lib/Target/ARM/ARMInstrInfo.td @@ -5655,16 +5655,16 @@ def : ARMInstAlias<"mvn${s}${p} $Rd, $imm", (MOVi rGPR:$Rd, mod_imm_not:$imm, pred:$p, cc_out:$s)>; // Same for AND <--> BIC def : ARMInstAlias<"bic${s}${p} $Rd, $Rn, $imm", - (ANDri rGPR:$Rd, rGPR:$Rn, mod_imm_not:$imm, + (ANDri GPR:$Rd, GPR:$Rn, mod_imm_not:$imm, pred:$p, cc_out:$s)>; def : ARMInstAlias<"bic${s}${p} $Rdn, $imm", - (ANDri rGPR:$Rdn, rGPR:$Rdn, mod_imm_not:$imm, + (ANDri GPR:$Rdn, GPR:$Rdn, mod_imm_not:$imm, pred:$p, cc_out:$s)>; def : ARMInstAlias<"and${s}${p} $Rd, $Rn, $imm", - (BICri rGPR:$Rd, rGPR:$Rn, mod_imm_not:$imm, + (BICri GPR:$Rd, GPR:$Rn, mod_imm_not:$imm, pred:$p, cc_out:$s)>; def : ARMInstAlias<"and${s}${p} $Rdn, $imm", - (BICri rGPR:$Rdn, rGPR:$Rdn, mod_imm_not:$imm, + (BICri GPR:$Rdn, GPR:$Rdn, mod_imm_not:$imm, pred:$p, cc_out:$s)>; // Likewise, "add Rd, mod_imm_neg" -> sub diff --git a/lib/Target/ARM/ARMInstrVFP.td b/lib/Target/ARM/ARMInstrVFP.td index 2aea73a6336e..050cd1a445ad 100644 --- a/lib/Target/ARM/ARMInstrVFP.td +++ b/lib/Target/ARM/ARMInstrVFP.td @@ -540,19 +540,23 @@ def VCVTSD : VFPAI<(outs SPR:$Sd), (ins DPR:$Dm), VFPUnaryFrm, // FIXME: Verify encoding after integrated assembler is working. 
def VCVTBHS: ASuI<0b11101, 0b11, 0b0010, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm), /* FIXME */ IIC_fpCVTSH, "vcvtb", ".f32.f16\t$Sd, $Sm", - [/* For disassembly only; pattern left blank */]>; + [/* For disassembly only; pattern left blank */]>, + Requires<[HasFP16]>; def VCVTBSH: ASuI<0b11101, 0b11, 0b0011, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm), /* FIXME */ IIC_fpCVTHS, "vcvtb", ".f16.f32\t$Sd, $Sm", - [/* For disassembly only; pattern left blank */]>; + [/* For disassembly only; pattern left blank */]>, + Requires<[HasFP16]>; def VCVTTHS: ASuI<0b11101, 0b11, 0b0010, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm), /* FIXME */ IIC_fpCVTSH, "vcvtt", ".f32.f16\t$Sd, $Sm", - [/* For disassembly only; pattern left blank */]>; + [/* For disassembly only; pattern left blank */]>, + Requires<[HasFP16]>; def VCVTTSH: ASuI<0b11101, 0b11, 0b0011, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm), /* FIXME */ IIC_fpCVTHS, "vcvtt", ".f16.f32\t$Sd, $Sm", - [/* For disassembly only; pattern left blank */]>; + [/* For disassembly only; pattern left blank */]>, + Requires<[HasFP16]>; def VCVTBHD : ADuI<0b11101, 0b11, 0b0010, 0b01, 0, (outs DPR:$Dd), (ins SPR:$Sm), diff --git a/lib/Target/ARM/ARMTargetTransformInfo.cpp b/lib/Target/ARM/ARMTargetTransformInfo.cpp index 582a057e9234..c1520119ef21 100644 --- a/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -478,12 +478,12 @@ int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, assert(isa(VecTy) && "Expect a vector type"); // vldN/vstN doesn't support vector types of i64/f64 element. - bool EltIs64Bits = DL.getTypeAllocSizeInBits(VecTy->getScalarType()) == 64; + bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64; if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits) { unsigned NumElts = VecTy->getVectorNumElements(); Type *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor); - unsigned SubVecSize = DL.getTypeAllocSizeInBits(SubVecTy); + unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy); // vldN/vstN only support legal vector types of size 64 or 128 in bits. 
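// A minimal sketch (not part of the patch) of why getTypeSizeInBits replaces
// getTypeAllocSizeInBits in these checks: the alloc size is rounded up to the
// type's ABI alignment and can over-state a vector's width. Assumes the usual
// llvm/IR headers and an ARM-style layout string; the exact padding depends
// on the DataLayout in use.
LLVMContext Ctx;
DataLayout DL("e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n8:16:32-S64");
Type *V3F32 = VectorType::get(Type::getFloatTy(Ctx), 3);
uint64_t Stored = DL.getTypeSizeInBits(V3F32);      // 96 bits of data
uint64_t Alloc = DL.getTypeAllocSizeInBits(V3F32);  // rounded up, typically 128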
if (NumElts % Factor == 0 && (SubVecSize == 64 || SubVecSize == 128)) diff --git a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp index 3e7da07b4aad..a94ca7d1c419 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp @@ -79,7 +79,7 @@ class ARMTargetAsmStreamer : public ARMTargetStreamer { void emitAttribute(unsigned Attribute, unsigned Value) override; void emitTextAttribute(unsigned Attribute, StringRef String) override; void emitIntTextAttribute(unsigned Attribute, unsigned IntValue, - StringRef StrinValue) override; + StringRef StringValue) override; void emitArch(unsigned Arch) override; void emitArchExtension(unsigned ArchExt) override; void emitObjectArch(unsigned Arch) override; @@ -243,7 +243,7 @@ void ARMTargetAsmStreamer::emitUnwindRaw(int64_t Offset, class ARMTargetELFStreamer : public ARMTargetStreamer { private: // This structure holds all attributes, accounting for - // their string/numeric value, so we can later emmit them + // their string/numeric value, so we can later emit them // in declaration order, keeping all in the same vector struct AttributeItem { enum { @@ -254,7 +254,7 @@ class ARMTargetELFStreamer : public ARMTargetStreamer { } Type; unsigned Tag; unsigned IntValue; - StringRef StringValue; + std::string StringValue; static bool LessTag(const AttributeItem &LHS, const AttributeItem &RHS) { // The conformance tag must be emitted first when serialised diff --git a/lib/Target/ARM/Thumb1FrameLowering.cpp b/lib/Target/ARM/Thumb1FrameLowering.cpp index fd96af6cb6e0..8771c68e5931 100644 --- a/lib/Target/ARM/Thumb1FrameLowering.cpp +++ b/lib/Target/ARM/Thumb1FrameLowering.cpp @@ -406,9 +406,6 @@ bool Thumb1FrameLowering::needPopSpecialFixUp(const MachineFunction &MF) const { if (AFI->getArgRegsSaveSize()) return true; - // FIXME: this doesn't make sense, and the following patch will remove it. - if (!STI.hasV4TOps()) return false; - // LR cannot be encoded with Thumb1, i.e., it requires a special fix-up. for (const CalleeSavedInfo &CSI : MF.getFrameInfo()->getCalleeSavedInfo()) if (CSI.getReg() == ARM::LR) @@ -532,10 +529,32 @@ bool Thumb1FrameLowering::emitPopSpecialFixUp(MachineBasicBlock &MBB, .addReg(PopReg, RegState::Kill)); } + bool AddBx = false; if (MBBI == MBB.end()) { MachineInstr& Pop = MBB.back(); assert(Pop.getOpcode() == ARM::tPOP); Pop.RemoveOperand(Pop.findRegisterDefOperandIdx(ARM::LR)); + } else if (MBBI->getOpcode() == ARM::tPOP_RET) { + // We couldn't use the direct restoration above, so + // perform the opposite conversion: tPOP_RET to tPOP. + MachineInstrBuilder MIB = + AddDefaultPred( + BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII.get(ARM::tPOP))); + unsigned Popped = 0; + for (auto MO: MBBI->operands()) + if (MO.isReg() && (MO.isImplicit() || MO.isDef()) && + MO.getReg() != ARM::PC) { + MIB.addOperand(MO); + if (!MO.isImplicit()) + Popped++; + } + // Is there anything left to pop? + if (!Popped) + MBB.erase(MIB.getInstr()); + // Erase the old instruction. 
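// A note on the AttributeItem change in ARMELFStreamer.cpp above, with a
// minimal sketch using made-up names (not part of the patch): StringRef does
// not own its characters, so keeping one in a container that outlives the
// parsed input risks a dangling reference, while std::string copies the
// bytes.
std::string OwnedValue;                        // safe: keeps its own copy
void rememberAttribute(StringRef S) { OwnedValue = S.str(); }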
+ MBB.erase(MBBI); + MBBI = MBB.end(); + AddBx = true; } assert(PopReg && "Do not know how to get LR"); @@ -554,14 +573,20 @@ bool Thumb1FrameLowering::emitPopSpecialFixUp(MachineBasicBlock &MBB, return true; } - AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr)) - .addReg(ARM::LR, RegState::Define) - .addReg(PopReg, RegState::Kill)); - + if (AddBx && !TemporaryReg) { + AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tBX)) + .addReg(PopReg, RegState::Kill)); + } else { + AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr)) + .addReg(ARM::LR, RegState::Define) + .addReg(PopReg, RegState::Kill)); + } if (TemporaryReg) { AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr)) .addReg(PopReg, RegState::Define) .addReg(TemporaryReg, RegState::Kill)); + if (AddBx) + AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tBX_RET))); } return true; @@ -628,7 +653,7 @@ restoreCalleeSavedRegisters(MachineBasicBlock &MBB, if (isVarArg) continue; // ARMv4T requires BX, see emitEpilogue - if (STI.hasV4TOps() && !STI.hasV5TOps()) + if (!STI.hasV5TOps()) continue; Reg = ARM::PC; (*MIB).setDesc(TII.get(ARM::tPOP_RET)); diff --git a/lib/Target/ARM/Thumb2SizeReduction.cpp b/lib/Target/ARM/Thumb2SizeReduction.cpp index f38fe1904055..bcd0e5751258 100644 --- a/lib/Target/ARM/Thumb2SizeReduction.cpp +++ b/lib/Target/ARM/Thumb2SizeReduction.cpp @@ -218,7 +218,7 @@ Thumb2SizeReduce::Thumb2SizeReduce(std::function Ftor) } static bool HasImplicitCPSRDef(const MCInstrDesc &MCID) { - for (const uint16_t *Regs = MCID.getImplicitDefs(); *Regs; ++Regs) + for (const MCPhysReg *Regs = MCID.getImplicitDefs(); *Regs; ++Regs) if (*Regs == ARM::CPSR) return true; return false; diff --git a/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp b/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp index 5e78762b994a..a8622a96527c 100644 --- a/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp +++ b/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp @@ -554,13 +554,13 @@ struct HexagonOperand : public MCParsedAsmOperand { void adds4_6ImmOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); const MCConstantExpr *CE = dyn_cast(getImm()); - Inst.addOperand(MCOperand::createImm(CE->getValue() << 6)); + Inst.addOperand(MCOperand::createImm(CE->getValue() * 64)); } void adds3_6ImmOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); const MCConstantExpr *CE = dyn_cast(getImm()); - Inst.addOperand(MCOperand::createImm(CE->getValue() << 6)); + Inst.addOperand(MCOperand::createImm(CE->getValue() * 64)); } StringRef getToken() const { @@ -1592,7 +1592,7 @@ int HexagonAsmParser::processInstruction(MCInst &Inst, // not use the other opcode as it is a legacy artifact of TD files. 
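// A sketch (made-up value, not part of the patch) of why the adds4_6/adds3_6
// scaling above is presumably written as a multiply: these immediates are
// signed, and left-shifting a negative value is undefined behaviour in C++,
// while the multiply is well-defined.
int64_t Value = -3;                 // a negative s4_6/s3_6 immediate
int64_t Scaled = Value * 64;        // == -192; "Value << 6" would be UB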
int64_t Value; if (MO.getExpr()->evaluateAsAbsolute(Value)) { - // if the the operand can fit within a 7:2 field + // if the operand can fit within a 7:2 field if (Value < (1 << 8) && Value >= -(1 << 8)) { SMLoc myLoc = Operands[2]->getStartLoc(); // # is left in startLoc in the case of ## diff --git a/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp b/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp index 1db59e1dd99d..4a9c3413cb29 100644 --- a/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp +++ b/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp @@ -471,7 +471,7 @@ extern const MCInstrDesc HexagonInsts[]; } static DecodeStatus DecodeRegisterClass(MCInst &Inst, unsigned RegNo, - ArrayRef Table) { + ArrayRef Table) { if (RegNo < Table.size()) { Inst.addOperand(MCOperand::createReg(Table[RegNo])); return MCDisassembler::Success; @@ -489,7 +489,7 @@ static DecodeStatus DecodeIntRegsLow8RegisterClass(MCInst &Inst, unsigned RegNo, static DecodeStatus DecodeIntRegsRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder) { - static const uint16_t IntRegDecoderTable[] = { + static const MCPhysReg IntRegDecoderTable[] = { Hexagon::R0, Hexagon::R1, Hexagon::R2, Hexagon::R3, Hexagon::R4, Hexagon::R5, Hexagon::R6, Hexagon::R7, Hexagon::R8, Hexagon::R9, Hexagon::R10, Hexagon::R11, Hexagon::R12, Hexagon::R13, Hexagon::R14, @@ -498,13 +498,13 @@ static DecodeStatus DecodeIntRegsRegisterClass(MCInst &Inst, unsigned RegNo, Hexagon::R25, Hexagon::R26, Hexagon::R27, Hexagon::R28, Hexagon::R29, Hexagon::R30, Hexagon::R31}; - return (DecodeRegisterClass(Inst, RegNo, IntRegDecoderTable)); + return DecodeRegisterClass(Inst, RegNo, IntRegDecoderTable); } static DecodeStatus DecodeVectorRegsRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t /*Address*/, const void *Decoder) { - static const uint16_t VecRegDecoderTable[] = { + static const MCPhysReg VecRegDecoderTable[] = { Hexagon::V0, Hexagon::V1, Hexagon::V2, Hexagon::V3, Hexagon::V4, Hexagon::V5, Hexagon::V6, Hexagon::V7, Hexagon::V8, Hexagon::V9, Hexagon::V10, Hexagon::V11, Hexagon::V12, Hexagon::V13, Hexagon::V14, @@ -513,25 +513,25 @@ static DecodeStatus DecodeVectorRegsRegisterClass(MCInst &Inst, unsigned RegNo, Hexagon::V25, Hexagon::V26, Hexagon::V27, Hexagon::V28, Hexagon::V29, Hexagon::V30, Hexagon::V31}; - return (DecodeRegisterClass(Inst, RegNo, VecRegDecoderTable)); + return DecodeRegisterClass(Inst, RegNo, VecRegDecoderTable); } static DecodeStatus DecodeDoubleRegsRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t /*Address*/, const void *Decoder) { - static const uint16_t DoubleRegDecoderTable[] = { + static const MCPhysReg DoubleRegDecoderTable[] = { Hexagon::D0, Hexagon::D1, Hexagon::D2, Hexagon::D3, Hexagon::D4, Hexagon::D5, Hexagon::D6, Hexagon::D7, Hexagon::D8, Hexagon::D9, Hexagon::D10, Hexagon::D11, Hexagon::D12, Hexagon::D13, Hexagon::D14, Hexagon::D15}; - return (DecodeRegisterClass(Inst, RegNo >> 1, DoubleRegDecoderTable)); + return DecodeRegisterClass(Inst, RegNo >> 1, DoubleRegDecoderTable); } static DecodeStatus DecodeVecDblRegsRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t /*Address*/, const void *Decoder) { - static const uint16_t VecDblRegDecoderTable[] = { + static const MCPhysReg VecDblRegDecoderTable[] = { Hexagon::W0, Hexagon::W1, Hexagon::W2, Hexagon::W3, Hexagon::W4, Hexagon::W5, Hexagon::W6, Hexagon::W7, Hexagon::W8, Hexagon::W9, Hexagon::W10, Hexagon::W11, @@ -543,25 +543,25 @@ static DecodeStatus DecodeVecDblRegsRegisterClass(MCInst &Inst, unsigned RegNo, 
static DecodeStatus DecodePredRegsRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t /*Address*/, const void *Decoder) { - static const uint16_t PredRegDecoderTable[] = {Hexagon::P0, Hexagon::P1, - Hexagon::P2, Hexagon::P3}; + static const MCPhysReg PredRegDecoderTable[] = {Hexagon::P0, Hexagon::P1, + Hexagon::P2, Hexagon::P3}; - return (DecodeRegisterClass(Inst, RegNo, PredRegDecoderTable)); + return DecodeRegisterClass(Inst, RegNo, PredRegDecoderTable); } static DecodeStatus DecodeVecPredRegsRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t /*Address*/, const void *Decoder) { - static const uint16_t VecPredRegDecoderTable[] = {Hexagon::Q0, Hexagon::Q1, - Hexagon::Q2, Hexagon::Q3}; + static const MCPhysReg VecPredRegDecoderTable[] = {Hexagon::Q0, Hexagon::Q1, + Hexagon::Q2, Hexagon::Q3}; - return (DecodeRegisterClass(Inst, RegNo, VecPredRegDecoderTable)); + return DecodeRegisterClass(Inst, RegNo, VecPredRegDecoderTable); } static DecodeStatus DecodeCtrRegsRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t /*Address*/, const void *Decoder) { - static const uint16_t CtrlRegDecoderTable[] = { + static const MCPhysReg CtrlRegDecoderTable[] = { Hexagon::SA0, Hexagon::LC0, Hexagon::SA1, Hexagon::LC1, Hexagon::P3_0, Hexagon::C5, Hexagon::C6, Hexagon::C7, Hexagon::USR, Hexagon::PC, Hexagon::UGP, Hexagon::GP, @@ -582,7 +582,7 @@ static DecodeStatus DecodeCtrRegsRegisterClass(MCInst &Inst, unsigned RegNo, static DecodeStatus DecodeCtrRegs64RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t /*Address*/, const void *Decoder) { - static const uint16_t CtrlReg64DecoderTable[] = { + static const MCPhysReg CtrlReg64DecoderTable[] = { Hexagon::C1_0, Hexagon::NoRegister, Hexagon::C3_2, Hexagon::NoRegister, Hexagon::C7_6, Hexagon::NoRegister, diff --git a/lib/Target/Hexagon/HexagonAsmPrinter.cpp b/lib/Target/Hexagon/HexagonAsmPrinter.cpp index 19769258ee89..8a6d6555d901 100644 --- a/lib/Target/Hexagon/HexagonAsmPrinter.cpp +++ b/lib/Target/Hexagon/HexagonAsmPrinter.cpp @@ -191,29 +191,23 @@ void HexagonAsmPrinter::EmitInstruction(const MachineInstr *MI) { MachineBasicBlock::const_instr_iterator MII = MI->getIterator(); unsigned IgnoreCount = 0; - for (++MII; MII != MBB->instr_end() && MII->isInsideBundle(); ++MII) { + for (++MII; MII != MBB->instr_end() && MII->isInsideBundle(); ++MII) if (MII->getOpcode() == TargetOpcode::DBG_VALUE || MII->getOpcode() == TargetOpcode::IMPLICIT_DEF) ++IgnoreCount; - else { + else HexagonLowerToMC(MCII, &*MII, MCB, *this); - } - } } - else { + else HexagonLowerToMC(MCII, MI, MCB, *this); - HexagonMCInstrInfo::padEndloop(OutStreamer->getContext(), MCB); - } - // Examine the packet and try to find instructions that can be converted - // to compounds. - HexagonMCInstrInfo::tryCompound(MCII, OutStreamer->getContext(), MCB); - // Examine the packet and convert pairs of instructions to duplex - // instructions when possible. 
- SmallVector possibleDuplexes; - possibleDuplexes = HexagonMCInstrInfo::getDuplexPossibilties(MCII, MCB); - HexagonMCShuffle(MCII, *Subtarget, OutStreamer->getContext(), MCB, - possibleDuplexes); - EmitToStreamer(*OutStreamer, MCB); + + bool Ok = HexagonMCInstrInfo::canonicalizePacket( + MCII, *Subtarget, OutStreamer->getContext(), MCB, nullptr); + assert(Ok); + (void)Ok; + if(HexagonMCInstrInfo::bundleSize(MCB) == 0) + return; + OutStreamer->EmitInstruction(MCB, getSubtargetInfo()); } extern "C" void LLVMInitializeHexagonAsmPrinter() { diff --git a/lib/Target/Hexagon/HexagonGenMux.cpp b/lib/Target/Hexagon/HexagonGenMux.cpp index b4ebd9140e75..c059d566709e 100644 --- a/lib/Target/Hexagon/HexagonGenMux.cpp +++ b/lib/Target/Hexagon/HexagonGenMux.cpp @@ -120,10 +120,10 @@ void HexagonGenMux::getDefsUses(const MachineInstr *MI, BitVector &Defs, // First, get the implicit defs and uses for this instruction. unsigned Opc = MI->getOpcode(); const MCInstrDesc &D = HII->get(Opc); - if (const uint16_t *R = D.ImplicitDefs) + if (const MCPhysReg *R = D.ImplicitDefs) while (*R) expandReg(*R++, Defs); - if (const uint16_t *R = D.ImplicitUses) + if (const MCPhysReg *R = D.ImplicitUses) while (*R) expandReg(*R++, Uses); diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp index 04f5b6649293..f82fe7699e84 100644 --- a/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -316,18 +316,18 @@ static bool CC_HexagonVector(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State) { - static const uint16_t VecLstS[] = { Hexagon::V0, Hexagon::V1, - Hexagon::V2, Hexagon::V3, - Hexagon::V4, Hexagon::V5, - Hexagon::V6, Hexagon::V7, - Hexagon::V8, Hexagon::V9, - Hexagon::V10, Hexagon::V11, - Hexagon::V12, Hexagon::V13, - Hexagon::V14, Hexagon::V15}; - static const uint16_t VecLstD[] = { Hexagon::W0, Hexagon::W1, - Hexagon::W2, Hexagon::W3, - Hexagon::W4, Hexagon::W5, - Hexagon::W6, Hexagon::W7}; + static const MCPhysReg VecLstS[] = { Hexagon::V0, Hexagon::V1, + Hexagon::V2, Hexagon::V3, + Hexagon::V4, Hexagon::V5, + Hexagon::V6, Hexagon::V7, + Hexagon::V8, Hexagon::V9, + Hexagon::V10, Hexagon::V11, + Hexagon::V12, Hexagon::V13, + Hexagon::V14, Hexagon::V15}; + static const MCPhysReg VecLstD[] = { Hexagon::W0, Hexagon::W1, + Hexagon::W2, Hexagon::W3, + Hexagon::W4, Hexagon::W5, + Hexagon::W6, Hexagon::W7}; auto &MF = State.getMachineFunction(); auto &HST = MF.getSubtarget(); bool UseHVX = HST.useHVXOps(); @@ -1520,7 +1520,9 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, Subtarget(ST) { bool IsV4 = !Subtarget.hasV5TOps(); auto &HRI = *Subtarget.getRegisterInfo(); - bool UseHVX = Subtarget.useHVXOps(), UseHVXDbl = Subtarget.useHVXDblOps(); + bool UseHVX = Subtarget.useHVXOps(); + bool UseHVXSgl = Subtarget.useHVXSglOps(); + bool UseHVXDbl = Subtarget.useHVXDblOps(); setPrefLoopAlignment(4); setPrefFunctionAlignment(4); @@ -1808,17 +1810,18 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i8, Custom); if (UseHVX) { - if(!UseHVXDbl) { - setOperationAction(ISD::CONCAT_VECTORS, MVT::v128i8, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i16, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i32, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i64, Custom); - } - else { - 
setOperationAction(ISD::CONCAT_VECTORS, MVT::v256i8, Custom); + if (UseHVXSgl) { + setOperationAction(ISD::CONCAT_VECTORS, MVT::v128i8, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i16, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i32, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i64, Custom); + } else if (UseHVXDbl) { + setOperationAction(ISD::CONCAT_VECTORS, MVT::v256i8, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v128i16, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i32, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i64, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i32, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i64, Custom); + } else { + llvm_unreachable("Unrecognized HVX mode"); } } // Subtarget-specific operation actions. @@ -2042,6 +2045,7 @@ const char* HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const { case HexagonISD::VCMPWEQ: return "HexagonISD::VCMPWEQ"; case HexagonISD::VCMPWGT: return "HexagonISD::VCMPWGT"; case HexagonISD::VCMPWGTU: return "HexagonISD::VCMPWGTU"; + case HexagonISD::VCOMBINE: return "HexagonISD::VCOMBINE"; case HexagonISD::VSHLH: return "HexagonISD::VSHLH"; case HexagonISD::VSHLW: return "HexagonISD::VSHLW"; case HexagonISD::VSPLATB: return "HexagonISD::VSPLTB"; @@ -2211,8 +2215,7 @@ HexagonTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { unsigned Size = VT.getSizeInBits(); - // A vector larger than 64 bits cannot be represented in Hexagon. - // Expand will split the vector. + // Only handle vectors of 64 bits or shorter. if (Size > 64) return SDValue(); @@ -2346,58 +2349,61 @@ SDValue HexagonTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); + bool UseHVX = Subtarget.useHVXOps(); EVT VT = Op.getValueType(); unsigned NElts = Op.getNumOperands(); - SDValue Vec = Op.getOperand(0); - EVT VecVT = Vec.getValueType(); - SDValue Width = DAG.getConstant(VecVT.getSizeInBits(), dl, MVT::i64); - SDValue Shifted = DAG.getNode(ISD::SHL, dl, MVT::i64, Width, - DAG.getConstant(32, dl, MVT::i64)); - SDValue ConstVal = DAG.getConstant(0, dl, MVT::i64); - - ConstantSDNode *W = dyn_cast(Width); - ConstantSDNode *S = dyn_cast(Shifted); - - if ((VecVT.getSimpleVT() == MVT::v2i16) && (NElts == 2) && W && S) { - if ((W->getZExtValue() == 32) && ((S->getZExtValue() >> 32) == 32)) { - // We are trying to concat two v2i16 to a single v4i16. - SDValue Vec0 = Op.getOperand(1); - SDValue Combined = DAG.getNode(HexagonISD::COMBINE, dl, VT, Vec0, Vec); - return DAG.getNode(ISD::BITCAST, dl, VT, Combined); + SDValue Vec0 = Op.getOperand(0); + EVT VecVT = Vec0.getValueType(); + unsigned Width = VecVT.getSizeInBits(); + + if (NElts == 2) { + MVT ST = VecVT.getSimpleVT(); + // We are trying to concat two v2i16 to a single v4i16, or two v4i8 + // into a single v8i8. + if (ST == MVT::v2i16 || ST == MVT::v4i8) + return DAG.getNode(HexagonISD::COMBINE, dl, VT, Op.getOperand(1), Vec0); + + if (UseHVX) { + assert((Width == 64*8 && Subtarget.useHVXSglOps()) || + (Width == 128*8 && Subtarget.useHVXDblOps())); + SDValue Vec1 = Op.getOperand(1); + MVT OpTy = Subtarget.useHVXSglOps() ? MVT::v16i32 : MVT::v32i32; + MVT ReTy = Subtarget.useHVXSglOps() ? 
MVT::v32i32 : MVT::v64i32; + SDValue B0 = DAG.getNode(ISD::BITCAST, dl, OpTy, Vec0); + SDValue B1 = DAG.getNode(ISD::BITCAST, dl, OpTy, Vec1); + SDValue VC = DAG.getNode(HexagonISD::VCOMBINE, dl, ReTy, B1, B0); + return DAG.getNode(ISD::BITCAST, dl, VT, VC); } } - if ((VecVT.getSimpleVT() == MVT::v4i8) && (NElts == 2) && W && S) { - if ((W->getZExtValue() == 32) && ((S->getZExtValue() >> 32) == 32)) { - // We are trying to concat two v4i8 to a single v8i8. - SDValue Vec0 = Op.getOperand(1); - SDValue Combined = DAG.getNode(HexagonISD::COMBINE, dl, VT, Vec0, Vec); - return DAG.getNode(ISD::BITCAST, dl, VT, Combined); - } - } + if (VT.getSizeInBits() != 32 && VT.getSizeInBits() != 64) + return SDValue(); + + SDValue C0 = DAG.getConstant(0, dl, MVT::i64); + SDValue C32 = DAG.getConstant(32, dl, MVT::i64); + SDValue W = DAG.getConstant(Width, dl, MVT::i64); + // Create the "width" part of the argument to insert_rp/insertp_rp. + SDValue S = DAG.getNode(ISD::SHL, dl, MVT::i64, W, C32); + SDValue V = C0; for (unsigned i = 0, e = NElts; i != e; ++i) { - unsigned OpIdx = NElts - i - 1; - SDValue Operand = Op.getOperand(OpIdx); + unsigned N = NElts-i-1; + SDValue OpN = Op.getOperand(N); - if (VT.getSizeInBits() == 64 && - Operand.getValueType().getSizeInBits() == 32) { + if (VT.getSizeInBits() == 64 && OpN.getValueType().getSizeInBits() == 32) { SDValue C = DAG.getConstant(0, dl, MVT::i32); - Operand = DAG.getNode(HexagonISD::COMBINE, dl, VT, C, Operand); + OpN = DAG.getNode(HexagonISD::COMBINE, dl, VT, C, OpN); } - - SDValue Idx = DAG.getConstant(OpIdx, dl, MVT::i64); - SDValue Offset = DAG.getNode(ISD::MUL, dl, MVT::i64, Idx, Width); - SDValue Combined = DAG.getNode(ISD::OR, dl, MVT::i64, Shifted, Offset); - const SDValue Ops[] = {ConstVal, Operand, Combined}; - + SDValue Idx = DAG.getConstant(N, dl, MVT::i64); + SDValue Offset = DAG.getNode(ISD::MUL, dl, MVT::i64, Idx, W); + SDValue Or = DAG.getNode(ISD::OR, dl, MVT::i64, S, Offset); if (VT.getSizeInBits() == 32) - ConstVal = DAG.getNode(HexagonISD::INSERTRP, dl, MVT::i32, Ops); + V = DAG.getNode(HexagonISD::INSERTRP, dl, MVT::i32, {V, OpN, Or}); else - ConstVal = DAG.getNode(HexagonISD::INSERTRP, dl, MVT::i64, Ops); + V = DAG.getNode(HexagonISD::INSERTRP, dl, MVT::i64, {V, OpN, Or}); } - return DAG.getNode(ISD::BITCAST, dl, VT, ConstVal); + return DAG.getNode(ISD::BITCAST, dl, VT, V); } SDValue diff --git a/lib/Target/Hexagon/HexagonISelLowering.h b/lib/Target/Hexagon/HexagonISelLowering.h index 64033d95ee3c..b6d39fe91728 100644 --- a/lib/Target/Hexagon/HexagonISelLowering.h +++ b/lib/Target/Hexagon/HexagonISelLowering.h @@ -80,6 +80,7 @@ bool isPositiveHalfWord(SDNode *N); INSERTRP, EXTRACTU, EXTRACTURP, + VCOMBINE, TC_RETURN, EH_RETURN, DCFETCH, diff --git a/lib/Target/Hexagon/HexagonInstrInfo.cpp b/lib/Target/Hexagon/HexagonInstrInfo.cpp index 3dc49337ecb5..2862468563c2 100644 --- a/lib/Target/Hexagon/HexagonInstrInfo.cpp +++ b/lib/Target/Hexagon/HexagonInstrInfo.cpp @@ -1475,7 +1475,7 @@ bool HexagonInstrInfo::isComplex(const MachineInstr *MI) const { } -// Return true if the the instruction is a compund branch instruction. +// Return true if the instruction is a compund branch instruction. 
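// A worked example (hypothetical values, not part of the patch) for the
// general path of LowerCONCAT_VECTORS above: each element is inserted with
// insert_rp, whose control operand packs the element width into the high 32
// bits and the bit offset into the low 32 bits. For the second 32-bit
// element of a 64-bit result:
uint64_t Width = 32;
uint64_t Offset = 1 * Width;                // element index 1
uint64_t Control = (Width << 32) | Offset;  // the value built from S and Or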
bool HexagonInstrInfo::isCompoundBranchInstr(const MachineInstr *MI) const { return (getType(MI) == HexagonII::TypeCOMPOUND && MI->isBranch()); } diff --git a/lib/Target/Hexagon/HexagonInstrInfoV60.td b/lib/Target/Hexagon/HexagonInstrInfoV60.td index 394df0fdd6e3..897ada081534 100644 --- a/lib/Target/Hexagon/HexagonInstrInfoV60.td +++ b/lib/Target/Hexagon/HexagonInstrInfoV60.td @@ -1535,6 +1535,20 @@ let isRegSequence = 1, Itinerary = CVI_VA_DV, Type = TypeCVI_VA_DV in defm V6_vcombine : T_HVX_alu_WV <"$dst = vcombine($src1,$src2)">, V6_vcombine_enc; +def SDTHexagonVCOMBINE: SDTypeProfile<1, 2, [SDTCisSameAs<1, 2>, + SDTCisSubVecOfVec<1, 0>]>; + +def HexagonVCOMBINE: SDNode<"HexagonISD::VCOMBINE", SDTHexagonVCOMBINE>; + +def: Pat<(v32i32 (HexagonVCOMBINE (v16i32 VectorRegs:$Vs), + (v16i32 VectorRegs:$Vt))), + (V6_vcombine VectorRegs:$Vs, VectorRegs:$Vt)>, + Requires<[UseHVXSgl]>; +def: Pat<(v64i32 (HexagonVCOMBINE (v32i32 VecDblRegs:$Vs), + (v32i32 VecDblRegs:$Vt))), + (V6_vcombine_128B VecDblRegs:$Vs, VecDblRegs:$Vt)>, + Requires<[UseHVXDbl]>; + let Itinerary = CVI_VINLANESAT, Type = TypeCVI_VINLANESAT in { defm V6_vsathub : T_HVX_alu_VV <"$dst.ub = vsat($src1.h,$src2.h)">, V6_vsathub_enc; @@ -1872,7 +1886,7 @@ defm V6_vasrhbrndsat : V6_vasrhbrndsat_enc; } -// Assemlber mapped -- alias? +// Assembler mapped -- alias? //defm V6_vtran2x2vdd : T_HVX_shift_VV <"">, V6_vtran2x2vdd_enc; let Itinerary = CVI_VP_VS_LONG, Type = TypeCVI_VP_VS in { defm V6_vshuffvdd : diff --git a/lib/Target/Hexagon/HexagonNewValueJump.cpp b/lib/Target/Hexagon/HexagonNewValueJump.cpp index 6415c6cc7906..20c4ab112b5f 100644 --- a/lib/Target/Hexagon/HexagonNewValueJump.cpp +++ b/lib/Target/Hexagon/HexagonNewValueJump.cpp @@ -342,6 +342,24 @@ static unsigned getNewValueJumpOpcode(MachineInstr *MI, int reg, return taken ? Hexagon::J4_cmpgtui_t_jumpnv_t : Hexagon::J4_cmpgtui_t_jumpnv_nt; + case Hexagon::C4_cmpneq: + return taken ? Hexagon::J4_cmpeq_f_jumpnv_t + : Hexagon::J4_cmpeq_f_jumpnv_nt; + + case Hexagon::C4_cmplte: + if (secondRegNewified) + return taken ? Hexagon::J4_cmplt_f_jumpnv_t + : Hexagon::J4_cmplt_f_jumpnv_nt; + return taken ? Hexagon::J4_cmpgt_f_jumpnv_t + : Hexagon::J4_cmpgt_f_jumpnv_nt; + + case Hexagon::C4_cmplteu: + if (secondRegNewified) + return taken ? Hexagon::J4_cmpltu_f_jumpnv_t + : Hexagon::J4_cmpltu_f_jumpnv_nt; + return taken ? Hexagon::J4_cmpgtu_f_jumpnv_t + : Hexagon::J4_cmpgtu_f_jumpnv_nt; + default: llvm_unreachable("Could not find matching New Value Jump instruction."); } diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h b/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h index af67481e4f47..47a6f8636276 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h @@ -90,12 +90,16 @@ namespace HexagonII { PostInc = 6 // Post increment addressing mode }; + // MemAccessSize is represented as 1+log2(N) where N is size in bits. enum class MemAccessSize { NoMemAccess = 0, // Not a memory acces instruction. ByteAccess = 1, // Byte access instruction (memb). HalfWordAccess = 2, // Half word access instruction (memh). WordAccess = 3, // Word access instruction (memw). - DoubleWordAccess = 4 // Double word access instruction (memd) + DoubleWordAccess = 4, // Double word access instruction (memd) + // 5, // We do not have a 16 byte vector access. + Vector64Access = 7, // 64 Byte vector access instruction (vmem). + Vector128Access = 8 // 128 Byte vector access instruction (vmem). 
}; // MCInstrDesc TSFlags @@ -175,7 +179,7 @@ namespace HexagonII { AddrModeMask = 0x7, // Access size for load/store instructions. MemAccessSizePos = 43, - MemAccesSizeMask = 0x7, + MemAccesSizeMask = 0xf, // Branch predicted taken. TakenPos = 47, diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp index c11abc1f42f6..46b7b41fec3b 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp @@ -85,32 +85,33 @@ void HexagonMCChecker::init(MCInst const& MCI) { } // Get implicit register definitions. - const uint16_t* ImpDefs = MCID.getImplicitDefs(); - for (unsigned i = 0; i < MCID.getNumImplicitDefs(); ++i) { - unsigned R = ImpDefs[i]; + if (const MCPhysReg *ImpDef = MCID.getImplicitDefs()) + for (; *ImpDef; ++ImpDef) { + unsigned R = *ImpDef; - if (Hexagon::R31 != R && MCID.isCall()) - // Any register other than the LR and the PC are actually volatile ones - // as defined by the ABI, not modified implicitly by the call insn. - continue; - if (Hexagon::PC == R) - // Branches are the only insns that can change the PC, - // otherwise a read-only register. - continue; + if (Hexagon::R31 != R && MCID.isCall()) + // Any register other than the LR and the PC are actually volatile ones + // as defined by the ABI, not modified implicitly by the call insn. + continue; + if (Hexagon::PC == R) + // Branches are the only insns that can change the PC, + // otherwise a read-only register. + continue; - if (Hexagon::USR_OVF == R) - // Many insns change the USR implicitly, but only one or another flag. - // The instruction table models the USR.OVF flag, which can be implicitly - // modified more than once, but cannot be modified in the same packet - // with an instruction that modifies is explicitly. Deal with such situ- - // ations individually. - SoftDefs.insert(R); - else if (isPredicateRegister(R) && HexagonMCInstrInfo::isPredicateLate(MCII, MCI)) - // Include implicit late predicates. - LatePreds.insert(R); - else - Defs[R].insert(PredSense(PredReg, isTrue)); - } + if (Hexagon::USR_OVF == R) + // Many insns change the USR implicitly, but only one or another flag. + // The instruction table models the USR.OVF flag, which can be implicitly + // modified more than once, but cannot be modified in the same packet + // with an instruction that modifies is explicitly. Deal with such situ- + // ations individually. + SoftDefs.insert(R); + else if (isPredicateRegister(R) && + HexagonMCInstrInfo::isPredicateLate(MCII, MCI)) + // Include implicit late predicates. + LatePreds.insert(R); + else + Defs[R].insert(PredSense(PredReg, isTrue)); + } // Figure out explicit register definitions. 
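// A hypothetical helper (not part of the patch) spelling out the encoding of
// the MemAccessSize values added in HexagonBaseInfo.h above: they are
// consistent with 1 + log2 of the access size in bytes (memb -> 1, memh -> 2,
// memw -> 3, memd -> 4, 64-byte vmem -> 7, 128-byte vmem -> 8), which is why
// MemAccesSizeMask grows from 0x7 to 0xf. Uses llvm::Log2_32 from
// llvm/Support/MathExtras.h.
static unsigned encodeAccessSize(unsigned Bytes) {
  return 1 + llvm::Log2_32(Bytes);   // e.g. encodeAccessSize(128) == 8
}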
for (unsigned i = 0; i < MCID.getNumDefs(); ++i) { diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp index 35e490afe41f..c2c6275e7e8d 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp @@ -334,7 +334,7 @@ static Hexagon::Fixups getFixupNoBits(MCInstrInfo const &MCII, const MCInst &MI, // The only relocs left should be GP relative: default: if (MCID.mayStore() || MCID.mayLoad()) { - for (const uint16_t *ImpUses = MCID.getImplicitUses(); *ImpUses; + for (const MCPhysReg *ImpUses = MCID.getImplicitUses(); *ImpUses; ++ImpUses) { if (*ImpUses == Hexagon::GP) { switch (HexagonMCInstrInfo::getAccessSize(MCII, MI)) { diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp index 7adc0ba51da3..e6842076db2a 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp @@ -45,8 +45,7 @@ void HexagonMCInstrInfo::addConstExtender(MCContext &Context, iterator_range HexagonMCInstrInfo::bundleInstructions(MCInst const &MCI) { assert(isBundle(MCI)); - return iterator_range( - MCI.begin() + bundleInstructionsOffset, MCI.end()); + return make_range(MCI.begin() + bundleInstructionsOffset, MCI.end()); } size_t HexagonMCInstrInfo::bundleSize(MCInst const &MCI) { diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp index 45e1909ede5a..6ceb848ba20c 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp @@ -95,6 +95,60 @@ unsigned HexagonResource::setWeight(unsigned s) { return (Weight); } +HexagonCVIResource::TypeUnitsAndLanes *HexagonCVIResource::TUL; + +bool HexagonCVIResource::SetUp = HexagonCVIResource::setup(); + +bool HexagonCVIResource::setup() { + assert(!TUL); + TUL = new (TypeUnitsAndLanes); + + (*TUL)[HexagonII::TypeCVI_VA] = + UnitsAndLanes(CVI_XLANE | CVI_SHIFT | CVI_MPY0 | CVI_MPY1, 1); + (*TUL)[HexagonII::TypeCVI_VA_DV] = UnitsAndLanes(CVI_XLANE | CVI_MPY0, 2); + (*TUL)[HexagonII::TypeCVI_VX] = UnitsAndLanes(CVI_MPY0 | CVI_MPY1, 1); + (*TUL)[HexagonII::TypeCVI_VX_DV] = UnitsAndLanes(CVI_MPY0, 2); + (*TUL)[HexagonII::TypeCVI_VP] = UnitsAndLanes(CVI_XLANE, 1); + (*TUL)[HexagonII::TypeCVI_VP_VS] = UnitsAndLanes(CVI_XLANE, 2); + (*TUL)[HexagonII::TypeCVI_VS] = UnitsAndLanes(CVI_SHIFT, 1); + (*TUL)[HexagonII::TypeCVI_VINLANESAT] = UnitsAndLanes(CVI_SHIFT, 1); + (*TUL)[HexagonII::TypeCVI_VM_LD] = + UnitsAndLanes(CVI_XLANE | CVI_SHIFT | CVI_MPY0 | CVI_MPY1, 1); + (*TUL)[HexagonII::TypeCVI_VM_TMP_LD] = UnitsAndLanes(CVI_NONE, 0); + (*TUL)[HexagonII::TypeCVI_VM_CUR_LD] = + UnitsAndLanes(CVI_XLANE | CVI_SHIFT | CVI_MPY0 | CVI_MPY1, 1); + (*TUL)[HexagonII::TypeCVI_VM_VP_LDU] = UnitsAndLanes(CVI_XLANE, 1); + (*TUL)[HexagonII::TypeCVI_VM_ST] = + UnitsAndLanes(CVI_XLANE | CVI_SHIFT | CVI_MPY0 | CVI_MPY1, 1); + (*TUL)[HexagonII::TypeCVI_VM_NEW_ST] = UnitsAndLanes(CVI_NONE, 0); + (*TUL)[HexagonII::TypeCVI_VM_STU] = UnitsAndLanes(CVI_XLANE, 1); + (*TUL)[HexagonII::TypeCVI_HIST] = UnitsAndLanes(CVI_XLANE, 4); + + return true; +} + +HexagonCVIResource::HexagonCVIResource(MCInstrInfo const &MCII, unsigned s, + MCInst const *id) + : HexagonResource(s) { + unsigned T = HexagonMCInstrInfo::getType(MCII, *id); + + if (TUL->count(T)) { + // For an HVX insn. 
+ Valid = true; + setUnits((*TUL)[T].first); + setLanes((*TUL)[T].second); + setLoad(HexagonMCInstrInfo::getDesc(MCII, *id).mayLoad()); + setStore(HexagonMCInstrInfo::getDesc(MCII, *id).mayStore()); + } else { + // For core insns. + Valid = false; + setUnits(0); + setLanes(0); + setLoad(false); + setStore(false); + } +} + HexagonShuffler::HexagonShuffler(MCInstrInfo const &MCII, MCSubtargetInfo const &STI) : MCII(MCII), STI(STI) { @@ -109,7 +163,7 @@ void HexagonShuffler::reset() { void HexagonShuffler::append(MCInst const *ID, MCInst const *Extender, unsigned S, bool X) { - HexagonInstr PI(ID, Extender, S, X); + HexagonInstr PI(MCII, ID, Extender, S, X); Packet.push_back(PI); } @@ -128,6 +182,8 @@ bool HexagonShuffler::check() { // Number of memory operations, loads, solo loads, stores, solo stores, single // stores. unsigned memory = 0, loads = 0, load0 = 0, stores = 0, store0 = 0, store1 = 0; + // Number of HVX loads, HVX stores. + unsigned CVIloads = 0, CVIstores = 0; // Number of duplex insns, solo insns. unsigned duplex = 0, solo = 0; // Number of insns restricting other insns in the packet to A and X types, @@ -170,6 +226,12 @@ bool HexagonShuffler::check() { case HexagonII::TypeJ: ++jumps; break; + case HexagonII::TypeCVI_VM_VP_LDU: + ++onlyNo1; + case HexagonII::TypeCVI_VM_LD: + case HexagonII::TypeCVI_VM_TMP_LD: + case HexagonII::TypeCVI_VM_CUR_LD: + ++CVIloads; case HexagonII::TypeLD: ++loads; ++memory; @@ -178,6 +240,11 @@ bool HexagonShuffler::check() { if (HexagonMCInstrInfo::getDesc(MCII, *ID).isReturn()) ++jumps, ++jump1; // DEALLOC_RETURN is of type LD. break; + case HexagonII::TypeCVI_VM_STU: + ++onlyNo1; + case HexagonII::TypeCVI_VM_ST: + case HexagonII::TypeCVI_VM_NEW_ST: + ++CVIstores; case HexagonII::TypeST: ++stores; ++memory; @@ -205,9 +272,9 @@ bool HexagonShuffler::check() { } // Check if the packet is legal. - if ((load0 > 1 || store0 > 1) || (duplex > 1 || (duplex && memory)) || - (solo && size() > 1) || (onlyAX && neitherAnorX > 1) || - (onlyAX && xtypeFloat)) { + if ((load0 > 1 || store0 > 1 || CVIloads > 1 || CVIstores > 1) || + (duplex > 1 || (duplex && memory)) || (solo && size() > 1) || + (onlyAX && neitherAnorX > 1) || (onlyAX && xtypeFloat)) { Error = SHUFFLE_ERROR_INVALID; return false; } @@ -338,6 +405,19 @@ bool HexagonShuffler::check() { return false; } } + // Verify the CVI slot subscriptions. + { + HexagonUnitAuction AuctionCVI; + + std::sort(begin(), end(), HexagonInstr::lessCVI); + + for (iterator I = begin(); I != end(); ++I) + for (unsigned i = 0; i < I->CVI.getLanes(); ++i) // TODO: I->CVI.isValid? + if (!AuctionCVI.bid(I->CVI.getUnits() << i)) { + Error = SHUFFLE_ERROR_SLOTS; + return false; + } + } Error = SHUFFLE_SUCCESS; return true; diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h b/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h index 6355c3275a38..174f10fb2580 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h @@ -51,6 +51,44 @@ class HexagonResource { }; }; +// HVX insn resources. +class HexagonCVIResource : public HexagonResource { + typedef std::pair UnitsAndLanes; + typedef llvm::DenseMap TypeUnitsAndLanes; + + // Available HVX slots. + enum { + CVI_NONE = 0, + CVI_XLANE = 1 << 0, + CVI_SHIFT = 1 << 1, + CVI_MPY0 = 1 << 2, + CVI_MPY1 = 1 << 3 + }; + + static bool SetUp; + static bool setup(); + static TypeUnitsAndLanes *TUL; + + // Count of adjacent slots that the insn requires to be executed. 
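// A worked example (not part of the patch) for the CVI slot auction above: a
// resource bids its unit mask once per lane, shifting it left each time, so a
// double-vector multiply (TypeCVI_VX_DV: CVI_MPY0 with 2 lanes) claims MPY0
// and then MPY1. The values mirror the enum above.
unsigned Units = 1u << 2;                 // CVI_MPY0
unsigned FirstBid = Units << 0;           // MPY0 (bit 2)
unsigned SecondBid = Units << 1;          // MPY1 (bit 3)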
+ unsigned Lanes; + // Flag whether the insn is a load or a store. + bool Load, Store; + // Flag whether the HVX resources are valid. + bool Valid; + + void setLanes(unsigned l) { Lanes = l; }; + void setLoad(bool f = true) { Load = f; }; + void setStore(bool f = true) { Store = f; }; + +public: + HexagonCVIResource(MCInstrInfo const &MCII, unsigned s, MCInst const *id); + + bool isValid() const { return (Valid); }; + unsigned getLanes() const { return (Lanes); }; + bool mayLoad() const { return (Load); }; + bool mayStore() const { return (Store); }; +}; + // Handle to an insn used by the shuffling algorithm. class HexagonInstr { friend class HexagonShuffler; @@ -58,12 +96,14 @@ class HexagonInstr { MCInst const *ID; MCInst const *Extender; HexagonResource Core; + HexagonCVIResource CVI; bool SoloException; public: - HexagonInstr(MCInst const *id, MCInst const *Extender, unsigned s, - bool x = false) - : ID(id), Extender(Extender), Core(s), SoloException(x){}; + HexagonInstr(MCInstrInfo const &MCII, MCInst const *id, + MCInst const *Extender, unsigned s, bool x = false) + : ID(id), Extender(Extender), Core(s), CVI(MCII, s, id), + SoloException(x){}; MCInst const *getDesc() const { return (ID); }; @@ -79,6 +119,10 @@ class HexagonInstr { static bool lessCore(const HexagonInstr &A, const HexagonInstr &B) { return (HexagonResource::lessUnits(A.Core, B.Core)); }; + // Check if the handles are in ascending order by HVX slots. + static bool lessCVI(const HexagonInstr &A, const HexagonInstr &B) { + return (HexagonResource::lessUnits(A.CVI, B.CVI)); + }; }; // Bundle shuffler. diff --git a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp index 44f665a86c66..14e63ad0c2be 100644 --- a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp +++ b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp @@ -3647,6 +3647,15 @@ bool MipsAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, case Match_UImm5_Lsl2: return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), "expected both 7-bit unsigned immediate and multiple of 4"); + case Match_UImm6_0: + return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), + "expected 6-bit unsigned immediate"); + case Match_UImm8_0: + return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), + "expected 8-bit unsigned immediate"); + case Match_UImm10_0: + return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), + "expected 10-bit unsigned immediate"); } llvm_unreachable("Implement any new match types added!"); diff --git a/lib/Target/Mips/MicroMips64r6InstrInfo.td b/lib/Target/Mips/MicroMips64r6InstrInfo.td index dfc54d75b361..f9e6a3c2f6ca 100644 --- a/lib/Target/Mips/MicroMips64r6InstrInfo.td +++ b/lib/Target/Mips/MicroMips64r6InstrInfo.td @@ -66,9 +66,9 @@ class EXTBITS_DESC_BASE; -class DEXTM_MMR6_DESC : EXTBITS_DESC_BASE<"dextm", GPR64Opnd, uimm6, +class DEXTM_MMR6_DESC : EXTBITS_DESC_BASE<"dextm", GPR64Opnd, uimm5, MipsExt>; -class DEXTU_MMR6_DESC : EXTBITS_DESC_BASE<"dextu", GPR64Opnd, uimm6, +class DEXTU_MMR6_DESC : EXTBITS_DESC_BASE<"dextu", GPR64Opnd, uimm5_plus32, MipsExt>; class DALIGN_DESC_BASE, ISA_MIPS32R6; // //===----------------------------------------------------------------------===// -// f32 comparisons supported via another comparison -def : MipsPat<(setone f32:$lhs, f32:$rhs), - (NOR (CMP_UEQ_S f32:$lhs, f32:$rhs), ZERO)>, ISA_MIPS32R6; -def : MipsPat<(seto f32:$lhs, f32:$rhs), - (NOR (CMP_UN_S f32:$lhs, f32:$rhs), ZERO)>, ISA_MIPS32R6; -def : MipsPat<(setune f32:$lhs, f32:$rhs), - (NOR (CMP_EQ_S f32:$lhs, f32:$rhs), 
ZERO)>, ISA_MIPS32R6; -def : MipsPat<(seteq f32:$lhs, f32:$rhs), (CMP_EQ_S f32:$lhs, f32:$rhs)>, - ISA_MIPS32R6; -def : MipsPat<(setgt f32:$lhs, f32:$rhs), (CMP_LE_S f32:$rhs, f32:$lhs)>, - ISA_MIPS32R6; -def : MipsPat<(setge f32:$lhs, f32:$rhs), (CMP_LT_S f32:$rhs, f32:$lhs)>, - ISA_MIPS32R6; -def : MipsPat<(setlt f32:$lhs, f32:$rhs), (CMP_LT_S f32:$lhs, f32:$rhs)>, - ISA_MIPS32R6; -def : MipsPat<(setle f32:$lhs, f32:$rhs), (CMP_LE_S f32:$lhs, f32:$rhs)>, - ISA_MIPS32R6; -def : MipsPat<(setne f32:$lhs, f32:$rhs), - (NOR (CMP_EQ_S f32:$lhs, f32:$rhs), ZERO)>, ISA_MIPS32R6; - -// f64 comparisons supported via another comparison -def : MipsPat<(setone f64:$lhs, f64:$rhs), - (NOR (CMP_UEQ_D f64:$lhs, f64:$rhs), ZERO)>, ISA_MIPS32R6; -def : MipsPat<(seto f64:$lhs, f64:$rhs), - (NOR (CMP_UN_D f64:$lhs, f64:$rhs), ZERO)>, ISA_MIPS32R6; -def : MipsPat<(setune f64:$lhs, f64:$rhs), - (NOR (CMP_EQ_D f64:$lhs, f64:$rhs), ZERO)>, ISA_MIPS32R6; -def : MipsPat<(seteq f64:$lhs, f64:$rhs), (CMP_EQ_D f64:$lhs, f64:$rhs)>, - ISA_MIPS32R6; -def : MipsPat<(setgt f64:$lhs, f64:$rhs), (CMP_LE_D f64:$rhs, f64:$lhs)>, - ISA_MIPS32R6; -def : MipsPat<(setge f64:$lhs, f64:$rhs), (CMP_LT_D f64:$rhs, f64:$lhs)>, - ISA_MIPS32R6; -def : MipsPat<(setlt f64:$lhs, f64:$rhs), (CMP_LT_D f64:$lhs, f64:$rhs)>, - ISA_MIPS32R6; -def : MipsPat<(setle f64:$lhs, f64:$rhs), (CMP_LE_D f64:$lhs, f64:$rhs)>, - ISA_MIPS32R6; -def : MipsPat<(setne f64:$lhs, f64:$rhs), - (NOR (CMP_EQ_D f64:$lhs, f64:$rhs), ZERO)>, ISA_MIPS32R6; +// comparisons supported via another comparison +multiclass Cmp_Pats { +def : MipsPat<(setone VT:$lhs, VT:$rhs), + (NOROp (!cast("CMP_UEQ_"#NAME) VT:$lhs, VT:$rhs), ZEROReg)>; +def : MipsPat<(seto VT:$lhs, VT:$rhs), + (NOROp (!cast("CMP_UN_"#NAME) VT:$lhs, VT:$rhs), ZEROReg)>; +def : MipsPat<(setune VT:$lhs, VT:$rhs), + (NOROp (!cast("CMP_EQ_"#NAME) VT:$lhs, VT:$rhs), ZEROReg)>; +def : MipsPat<(seteq VT:$lhs, VT:$rhs), + (!cast("CMP_EQ_"#NAME) VT:$lhs, VT:$rhs)>; +def : MipsPat<(setgt VT:$lhs, VT:$rhs), + (!cast("CMP_LE_"#NAME) VT:$rhs, VT:$lhs)>; +def : MipsPat<(setge VT:$lhs, VT:$rhs), + (!cast("CMP_LT_"#NAME) VT:$rhs, VT:$lhs)>; +def : MipsPat<(setlt VT:$lhs, VT:$rhs), + (!cast("CMP_LT_"#NAME) VT:$lhs, VT:$rhs)>; +def : MipsPat<(setle VT:$lhs, VT:$rhs), + (!cast("CMP_LE_"#NAME) VT:$lhs, VT:$rhs)>; +def : MipsPat<(setne VT:$lhs, VT:$rhs), + (NOROp (!cast("CMP_EQ_"#NAME) VT:$lhs, VT:$rhs), ZEROReg)>; +} + +defm S : Cmp_Pats, ISA_MIPS32R6; +defm D : Cmp_Pats, ISA_MIPS32R6; // i32 selects +multiclass SelectInt_Pats { +// reg, immz +def : MipsPat<(select (Opg (seteq RC:$cond, immz)), RC:$t, RC:$f), + (OROp (SELEQZOp RC:$t, RC:$cond), (SELNEZOp RC:$f, RC:$cond))>; +def : MipsPat<(select (Opg (setne RC:$cond, immz)), RC:$t, RC:$f), + (OROp (SELNEZOp RC:$t, RC:$cond), (SELEQZOp RC:$f, RC:$cond))>; + +// reg, immZExt16[_64] +def : MipsPat<(select (Opg (seteq RC:$cond, imm_type:$imm)), RC:$t, RC:$f), + (OROp (SELEQZOp RC:$t, (XORiOp RC:$cond, imm_type:$imm)), + (SELNEZOp RC:$f, (XORiOp RC:$cond, imm_type:$imm)))>; +def : MipsPat<(select (Opg (setne RC:$cond, imm_type:$imm)), RC:$t, RC:$f), + (OROp (SELNEZOp RC:$t, (XORiOp RC:$cond, imm_type:$imm)), + (SELEQZOp RC:$f, (XORiOp RC:$cond, imm_type:$imm)))>; + +// reg, immSExt16Plus1 +def : MipsPat<(select (Opg (setgt RC:$cond, immSExt16Plus1:$imm)), RC:$t, RC:$f), + (OROp (SELEQZOp RC:$t, (SLTiOp RC:$cond, (Plus1 imm:$imm))), + (SELNEZOp RC:$f, (SLTiOp RC:$cond, (Plus1 imm:$imm))))>; +def : MipsPat<(select (Opg (setugt RC:$cond, immSExt16Plus1:$imm)), RC:$t, RC:$f), + (OROp 
(SELEQZOp RC:$t, (SLTiuOp RC:$cond, (Plus1 imm:$imm))), + (SELNEZOp RC:$f, (SLTiuOp RC:$cond, (Plus1 imm:$imm))))>; + +def : MipsPat<(select (Opg (seteq RC:$cond, immz)), RC:$t, immz), + (SELEQZOp RC:$t, RC:$cond)>; +def : MipsPat<(select (Opg (setne RC:$cond, immz)), RC:$t, immz), + (SELNEZOp RC:$t, RC:$cond)>; +def : MipsPat<(select (Opg (seteq RC:$cond, immz)), immz, RC:$f), + (SELNEZOp RC:$f, RC:$cond)>; +def : MipsPat<(select (Opg (setne RC:$cond, immz)), immz, RC:$f), + (SELEQZOp RC:$f, RC:$cond)>; +} + +defm : SelectInt_Pats, ISA_MIPS32R6; + def : MipsPat<(select i32:$cond, i32:$t, i32:$f), - (OR (SELNEZ i32:$t, i32:$cond), (SELEQZ i32:$f, i32:$cond))>, - ISA_MIPS32R6; -def : MipsPat<(select (i32 (seteq i32:$cond, immz)), i32:$t, i32:$f), - (OR (SELEQZ i32:$t, i32:$cond), (SELNEZ i32:$f, i32:$cond))>, - ISA_MIPS32R6; -def : MipsPat<(select (i32 (setne i32:$cond, immz)), i32:$t, i32:$f), - (OR (SELNEZ i32:$t, i32:$cond), (SELEQZ i32:$f, i32:$cond))>, - ISA_MIPS32R6; -def : MipsPat<(select (i32 (seteq i32:$cond, immZExt16:$imm)), i32:$t, i32:$f), - (OR (SELEQZ i32:$t, (XORi i32:$cond, immZExt16:$imm)), - (SELNEZ i32:$f, (XORi i32:$cond, immZExt16:$imm)))>, - ISA_MIPS32R6; -def : MipsPat<(select (i32 (setne i32:$cond, immZExt16:$imm)), i32:$t, i32:$f), - (OR (SELNEZ i32:$t, (XORi i32:$cond, immZExt16:$imm)), - (SELEQZ i32:$f, (XORi i32:$cond, immZExt16:$imm)))>, + (OR (SELNEZ i32:$t, i32:$cond), + (SELEQZ i32:$f, i32:$cond))>, ISA_MIPS32R6; -def : MipsPat<(select (i32 (setgt i32:$cond, immSExt16Plus1:$imm)), i32:$t, - i32:$f), - (OR (SELEQZ i32:$t, (SLTi i32:$cond, (Plus1 imm:$imm))), - (SELNEZ i32:$f, (SLTi i32:$cond, (Plus1 imm:$imm))))>, - ISA_MIPS32R6; -def : MipsPat<(select (i32 (setugt i32:$cond, immSExt16Plus1:$imm)), - i32:$t, i32:$f), - (OR (SELEQZ i32:$t, (SLTiu i32:$cond, (Plus1 imm:$imm))), - (SELNEZ i32:$f, (SLTiu i32:$cond, (Plus1 imm:$imm))))>, - ISA_MIPS32R6; - def : MipsPat<(select i32:$cond, i32:$t, immz), - (SELNEZ i32:$t, i32:$cond)>, ISA_MIPS32R6; -def : MipsPat<(select (i32 (setne i32:$cond, immz)), i32:$t, immz), - (SELNEZ i32:$t, i32:$cond)>, ISA_MIPS32R6; -def : MipsPat<(select (i32 (seteq i32:$cond, immz)), i32:$t, immz), - (SELEQZ i32:$t, i32:$cond)>, ISA_MIPS32R6; + (SELNEZ i32:$t, i32:$cond)>, + ISA_MIPS32R6; def : MipsPat<(select i32:$cond, immz, i32:$f), - (SELEQZ i32:$f, i32:$cond)>, ISA_MIPS32R6; -def : MipsPat<(select (i32 (setne i32:$cond, immz)), immz, i32:$f), - (SELEQZ i32:$f, i32:$cond)>, ISA_MIPS32R6; -def : MipsPat<(select (i32 (seteq i32:$cond, immz)), immz, i32:$f), - (SELNEZ i32:$f, i32:$cond)>, ISA_MIPS32R6; + (SELEQZ i32:$f, i32:$cond)>, + ISA_MIPS32R6; diff --git a/lib/Target/Mips/Mips64InstrInfo.td b/lib/Target/Mips/Mips64InstrInfo.td index 0992186a087f..b0cf96dbd75d 100644 --- a/lib/Target/Mips/Mips64InstrInfo.td +++ b/lib/Target/Mips/Mips64InstrInfo.td @@ -275,11 +275,11 @@ def RDHWR64 : ReadHardware, RDHWR_FM; let AdditionalPredicates = [NotInMicroMips] in { def DEXT : ExtBase<"dext", GPR64Opnd, uimm6, MipsExt>, EXT_FM<3>; def DEXTM : ExtBase<"dextm", GPR64Opnd, uimm5>, EXT_FM<1>; - def DEXTU : ExtBase<"dextu", GPR64Opnd, uimm6>, EXT_FM<2>; + def DEXTU : ExtBase<"dextu", GPR64Opnd, uimm5_plus32>, EXT_FM<2>; } def DINS : InsBase<"dins", GPR64Opnd, uimm6, MipsIns>, EXT_FM<7>; -def DINSU : InsBase<"dinsu", GPR64Opnd, uimm6>, EXT_FM<6>; +def DINSU : InsBase<"dinsu", GPR64Opnd, uimm5_plus32>, EXT_FM<6>; def DINSM : InsBase<"dinsm", GPR64Opnd, uimm5>, EXT_FM<5>; let isCodeGenOnly = 1, rs = 0, shamt = 0 in { diff --git 
a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td index e0b317e9bf2d..c07edef86731 100644 --- a/lib/Target/Mips/MipsInstrInfo.td +++ b/lib/Target/Mips/MipsInstrInfo.td @@ -394,8 +394,17 @@ class ConstantUImmAsmOperandClass Supers = [], let DiagnosticType = "UImm" # Bits # "_" # Offset; } +def ConstantUImm10AsmOperandClass + : ConstantUImmAsmOperandClass<10, []>; +def ConstantUImm8AsmOperandClass + : ConstantUImmAsmOperandClass<8, [ConstantUImm10AsmOperandClass]>; +def ConstantUImm6AsmOperandClass + : ConstantUImmAsmOperandClass<6, [ConstantUImm8AsmOperandClass]>; +def ConstantUImm5Plus32AsmOperandClass + : ConstantUImmAsmOperandClass<5, [ConstantUImm6AsmOperandClass], 32>; def ConstantUImm5Plus32NormalizeAsmOperandClass - : ConstantUImmAsmOperandClass<5, [], 32> { + : ConstantUImmAsmOperandClass<5, [ConstantUImm6AsmOperandClass], 32> { + let Name = "ConstantUImm5_32_Norm"; // We must also subtract 32 when we render the operand. let RenderMethod = "addConstantUImmOperands<5, 32, -32>"; } @@ -403,19 +412,20 @@ def ConstantUImm5Lsl2AsmOperandClass : AsmOperandClass { let Name = "UImm5Lsl2"; let RenderMethod = "addImmOperands"; let PredicateMethod = "isScaledUImm<5, 2>"; - let SuperClasses = []; + let SuperClasses = [ConstantUImm6AsmOperandClass]; let DiagnosticType = "UImm5_Lsl2"; } def ConstantUImm5ReportUImm6AsmOperandClass - : ConstantUImmAsmOperandClass<5, []> { + : ConstantUImmAsmOperandClass<5, [ConstantUImm6AsmOperandClass]> { let Name = "ConstantUImm5_0_Report_UImm6"; let DiagnosticType = "UImm5_0_Report_UImm6"; } def ConstantUImm5AsmOperandClass - : ConstantUImmAsmOperandClass<5, []>; + : ConstantUImmAsmOperandClass<5, [ConstantUImm6AsmOperandClass]>; def ConstantUImm4AsmOperandClass : ConstantUImmAsmOperandClass< 4, [ConstantUImm5AsmOperandClass, + ConstantUImm5Plus32AsmOperandClass, ConstantUImm5Plus32NormalizeAsmOperandClass]>; def ConstantUImm3AsmOperandClass : ConstantUImmAsmOperandClass<3, [ConstantUImm4AsmOperandClass]>; @@ -484,17 +494,6 @@ def simm32 : Operand; def uimm20 : Operand { } -def MipsUImm10AsmOperand : AsmOperandClass { - let Name = "UImm10"; - let RenderMethod = "addImmOperands"; - let ParserMethod = "parseImm"; - let PredicateMethod = "isUImm<10>"; -} - -def uimm10 : Operand { - let ParserMatchClass = MipsUImm10AsmOperand; -} - def simm16_64 : Operand { let DecoderMethod = "DecodeSimm16"; } @@ -506,7 +505,7 @@ def uimmz : Operand { } // Unsigned Operands -foreach I = {1, 2, 3, 4, 5} in +foreach I = {1, 2, 3, 4, 5, 6, 8, 10} in def uimm # I : Operand { let PrintMethod = "printUnsignedImm"; let ParserMatchClass = @@ -520,6 +519,11 @@ def uimm2_plus1 : Operand { let ParserMatchClass = ConstantUImm2Plus1AsmOperandClass; } +def uimm5_plus32 : Operand { + let PrintMethod = "printUnsignedImm"; + let ParserMatchClass = ConstantUImm5Plus32AsmOperandClass; +} + def uimm5_plus32_normalize : Operand { let PrintMethod = "printUnsignedImm"; let ParserMatchClass = ConstantUImm5Plus32NormalizeAsmOperandClass; @@ -550,10 +554,6 @@ def uimm5_64_report_uimm6 : Operand { let ParserMatchClass = ConstantUImm5ReportUImm6AsmOperandClass; } -def uimm6 : Operand { - let PrintMethod = "printUnsignedImm"; -} - def uimm16 : Operand { let PrintMethod = "printUnsignedImm"; } diff --git a/lib/Target/Mips/MipsMSAInstrInfo.td b/lib/Target/Mips/MipsMSAInstrInfo.td index b74e967e6817..eacfcec78bc7 100644 --- a/lib/Target/Mips/MipsMSAInstrInfo.td +++ b/lib/Target/Mips/MipsMSAInstrInfo.td @@ -78,10 +78,6 @@ def uimm6_ptr : Operand { let PrintMethod = "printUnsignedImm8"; } -def 
uimm8 : Operand { - let PrintMethod = "printUnsignedImm8"; -} - def simm5 : Operand; def vsplat_uimm1 : Operand { @@ -1177,47 +1173,14 @@ class MSA_BIT_D_DESC_BASE { - dag OutOperandList = (outs ROWD:$wd); - dag InOperandList = (ins ROWS:$ws, uimm3:$m); - string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m"); - list Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, immZExt3:$m))]; - InstrItinClass Itinerary = itin; -} - -// This class is deprecated and will be removed soon. -class MSA_BIT_H_X_DESC_BASE { - dag OutOperandList = (outs ROWD:$wd); - dag InOperandList = (ins ROWS:$ws, uimm4:$m); - string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m"); - list Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, immZExt4:$m))]; - InstrItinClass Itinerary = itin; -} - -// This class is deprecated and will be removed soon. -class MSA_BIT_W_X_DESC_BASE { - dag OutOperandList = (outs ROWD:$wd); - dag InOperandList = (ins ROWS:$ws, uimm5:$m); - string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m"); - list Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, immZExt5:$m))]; - InstrItinClass Itinerary = itin; -} - -// This class is deprecated and will be removed soon. -class MSA_BIT_D_X_DESC_BASE { +class MSA_BIT_X_DESC_BASE { dag OutOperandList = (outs ROWD:$wd); - dag InOperandList = (ins ROWS:$ws, uimm6:$m); + dag InOperandList = (ins ROWS:$ws, ImmOp:$m); string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m"); - list Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, immZExt6:$m))]; + list Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, Imm:$m))]; InstrItinClass Itinerary = itin; } @@ -1502,13 +1465,14 @@ class MSA_INSERT_VIDX_PSEUDO_BASE { dag OutOperandList = (outs ROWD:$wd); - dag InOperandList = (ins ROWD:$wd_in, uimm6:$n, ROWS:$ws, uimmz:$n2); + dag InOperandList = (ins ROWD:$wd_in, ImmOp:$n, ROWS:$ws, uimmz:$n2); string AsmString = !strconcat(instr_asm, "\t$wd[$n], $ws[$n2]"); list Pattern = [(set ROWD:$wd, (OpNode ROWD:$wd_in, - immZExt6:$n, + Imm:$n, ROWS:$ws, immz:$n2))]; InstrItinClass Itinerary = itin; @@ -2327,13 +2291,13 @@ class INSERT_FW_VIDX64_PSEUDO_DESC : class INSERT_FD_VIDX64_PSEUDO_DESC : MSA_INSERT_VIDX_PSEUDO_BASE; -class INSVE_B_DESC : MSA_INSVE_DESC_BASE<"insve.b", insve_v16i8, +class INSVE_B_DESC : MSA_INSVE_DESC_BASE<"insve.b", insve_v16i8, uimm4, immZExt4, MSA128BOpnd>; -class INSVE_H_DESC : MSA_INSVE_DESC_BASE<"insve.h", insve_v8i16, +class INSVE_H_DESC : MSA_INSVE_DESC_BASE<"insve.h", insve_v8i16, uimm3, immZExt3, MSA128HOpnd>; -class INSVE_W_DESC : MSA_INSVE_DESC_BASE<"insve.w", insve_v4i32, +class INSVE_W_DESC : MSA_INSVE_DESC_BASE<"insve.w", insve_v4i32, uimm2, immZExt2, MSA128WOpnd>; -class INSVE_D_DESC : MSA_INSVE_DESC_BASE<"insve.d", insve_v2i64, +class INSVE_D_DESC : MSA_INSVE_DESC_BASE<"insve.d", insve_v2i64, uimm1, immZExt1, MSA128DOpnd>; class LD_DESC_BASE; class PCNT_W_DESC : MSA_2R_DESC_BASE<"pcnt.w", ctpop, MSA128WOpnd>; class PCNT_D_DESC : MSA_2R_DESC_BASE<"pcnt.d", ctpop, MSA128DOpnd>; -class SAT_S_B_DESC : MSA_BIT_B_X_DESC_BASE<"sat_s.b", int_mips_sat_s_b, - MSA128BOpnd>; -class SAT_S_H_DESC : MSA_BIT_H_X_DESC_BASE<"sat_s.h", int_mips_sat_s_h, - MSA128HOpnd>; -class SAT_S_W_DESC : MSA_BIT_W_X_DESC_BASE<"sat_s.w", int_mips_sat_s_w, - MSA128WOpnd>; -class SAT_S_D_DESC : MSA_BIT_D_X_DESC_BASE<"sat_s.d", int_mips_sat_s_d, - MSA128DOpnd>; - -class SAT_U_B_DESC : MSA_BIT_B_X_DESC_BASE<"sat_u.b", int_mips_sat_u_b, - MSA128BOpnd>; -class SAT_U_H_DESC : MSA_BIT_H_X_DESC_BASE<"sat_u.h", int_mips_sat_u_h, - MSA128HOpnd>; -class SAT_U_W_DESC : MSA_BIT_W_X_DESC_BASE<"sat_u.w", 
int_mips_sat_u_w, - MSA128WOpnd>; -class SAT_U_D_DESC : MSA_BIT_D_X_DESC_BASE<"sat_u.d", int_mips_sat_u_d, - MSA128DOpnd>; +class SAT_S_B_DESC : MSA_BIT_X_DESC_BASE<"sat_s.b", int_mips_sat_s_b, uimm3, + immZExt3, MSA128BOpnd>; +class SAT_S_H_DESC : MSA_BIT_X_DESC_BASE<"sat_s.h", int_mips_sat_s_h, uimm4, + immZExt4, MSA128HOpnd>; +class SAT_S_W_DESC : MSA_BIT_X_DESC_BASE<"sat_s.w", int_mips_sat_s_w, uimm5, + immZExt5, MSA128WOpnd>; +class SAT_S_D_DESC : MSA_BIT_X_DESC_BASE<"sat_s.d", int_mips_sat_s_d, uimm6, + immZExt6, MSA128DOpnd>; + +class SAT_U_B_DESC : MSA_BIT_X_DESC_BASE<"sat_u.b", int_mips_sat_u_b, uimm3, + immZExt3, MSA128BOpnd>; +class SAT_U_H_DESC : MSA_BIT_X_DESC_BASE<"sat_u.h", int_mips_sat_u_h, uimm4, + immZExt4, MSA128HOpnd>; +class SAT_U_W_DESC : MSA_BIT_X_DESC_BASE<"sat_u.w", int_mips_sat_u_w, uimm5, + immZExt5, MSA128WOpnd>; +class SAT_U_D_DESC : MSA_BIT_X_DESC_BASE<"sat_u.d", int_mips_sat_u_d, uimm6, + immZExt6, MSA128DOpnd>; class SHF_B_DESC : MSA_I8_SHF_DESC_BASE<"shf.b", MSA128BOpnd>; class SHF_H_DESC : MSA_I8_SHF_DESC_BASE<"shf.h", MSA128HOpnd>; @@ -2633,14 +2597,14 @@ class SRAR_H_DESC : MSA_3R_DESC_BASE<"srar.h", int_mips_srar_h, MSA128HOpnd>; class SRAR_W_DESC : MSA_3R_DESC_BASE<"srar.w", int_mips_srar_w, MSA128WOpnd>; class SRAR_D_DESC : MSA_3R_DESC_BASE<"srar.d", int_mips_srar_d, MSA128DOpnd>; -class SRARI_B_DESC : MSA_BIT_B_X_DESC_BASE<"srari.b", int_mips_srari_b, - MSA128BOpnd>; -class SRARI_H_DESC : MSA_BIT_H_X_DESC_BASE<"srari.h", int_mips_srari_h, - MSA128HOpnd>; -class SRARI_W_DESC : MSA_BIT_W_X_DESC_BASE<"srari.w", int_mips_srari_w, - MSA128WOpnd>; -class SRARI_D_DESC : MSA_BIT_D_X_DESC_BASE<"srari.d", int_mips_srari_d, - MSA128DOpnd>; +class SRARI_B_DESC : MSA_BIT_X_DESC_BASE<"srari.b", int_mips_srari_b, uimm3, + immZExt3, MSA128BOpnd>; +class SRARI_H_DESC : MSA_BIT_X_DESC_BASE<"srari.h", int_mips_srari_h, uimm4, + immZExt4, MSA128HOpnd>; +class SRARI_W_DESC : MSA_BIT_X_DESC_BASE<"srari.w", int_mips_srari_w, uimm5, + immZExt5, MSA128WOpnd>; +class SRARI_D_DESC : MSA_BIT_X_DESC_BASE<"srari.d", int_mips_srari_d, uimm6, + immZExt6, MSA128DOpnd>; class SRL_B_DESC : MSA_3R_DESC_BASE<"srl.b", srl, MSA128BOpnd>; class SRL_H_DESC : MSA_3R_DESC_BASE<"srl.h", srl, MSA128HOpnd>; @@ -2661,14 +2625,14 @@ class SRLR_H_DESC : MSA_3R_DESC_BASE<"srlr.h", int_mips_srlr_h, MSA128HOpnd>; class SRLR_W_DESC : MSA_3R_DESC_BASE<"srlr.w", int_mips_srlr_w, MSA128WOpnd>; class SRLR_D_DESC : MSA_3R_DESC_BASE<"srlr.d", int_mips_srlr_d, MSA128DOpnd>; -class SRLRI_B_DESC : MSA_BIT_B_X_DESC_BASE<"srlri.b", int_mips_srlri_b, - MSA128BOpnd>; -class SRLRI_H_DESC : MSA_BIT_H_X_DESC_BASE<"srlri.h", int_mips_srlri_h, - MSA128HOpnd>; -class SRLRI_W_DESC : MSA_BIT_W_X_DESC_BASE<"srlri.w", int_mips_srlri_w, - MSA128WOpnd>; -class SRLRI_D_DESC : MSA_BIT_D_X_DESC_BASE<"srlri.d", int_mips_srlri_d, - MSA128DOpnd>; +class SRLRI_B_DESC : MSA_BIT_X_DESC_BASE<"srlri.b", int_mips_srlri_b, uimm3, + immZExt3, MSA128BOpnd>; +class SRLRI_H_DESC : MSA_BIT_X_DESC_BASE<"srlri.h", int_mips_srlri_h, uimm4, + immZExt4, MSA128HOpnd>; +class SRLRI_W_DESC : MSA_BIT_X_DESC_BASE<"srlri.w", int_mips_srlri_w, uimm5, + immZExt5, MSA128WOpnd>; +class SRLRI_D_DESC : MSA_BIT_X_DESC_BASE<"srlri.d", int_mips_srlri_d, uimm6, + immZExt6, MSA128DOpnd>; class ST_DESC_BASE findAllDefs(Value *V) { + SmallPtrSet Defs; + SmallVector WorkList; + WorkList.push_back(V); + Defs.insert(V); + while (!WorkList.empty()) { + Value *Curr = WorkList.back(); + WorkList.pop_back(); + if (User *CurrUser = dyn_cast(Curr)) + for (auto &Op : 
CurrUser->operands()) + if (Defs.insert(Op).second) + WorkList.push_back(Op); + } + return Defs; + } + + // Translate a i1 value to an equivalent i32 value: + static Value *translate(Value *V) { + Type *Int32Ty = Type::getInt32Ty(V->getContext()); + if (Constant *C = dyn_cast(V)) + return ConstantExpr::getZExt(C, Int32Ty); + if (PHINode *P = dyn_cast(V)) { + // Temporarily set the operands to 0. We'll fix this later in + // runOnUse. + Value *Zero = Constant::getNullValue(Int32Ty); + PHINode *Q = + PHINode::Create(Int32Ty, P->getNumIncomingValues(), P->getName(), P); + for (unsigned i = 0; i < P->getNumOperands(); ++i) + Q->addIncoming(Zero, P->getIncomingBlock(i)); + return Q; + } + + Argument *A = dyn_cast(V); + Instruction *I = dyn_cast(V); + assert((A || I) && "Unknown value type"); + + auto InstPt = + A ? &*A->getParent()->getEntryBlock().begin() : I->getNextNode(); + return new ZExtInst(V, Int32Ty, "", InstPt); + } + + typedef SmallPtrSet PHINodeSet; + + // A PHINode is Promotable if: + // 1. Its type is i1 AND + // 2. All of its uses are ReturnInt, CallInst, PHINode, or DbgInfoIntrinsic + // AND + // 3. All of its operands are Constant or Argument or + // CallInst or PHINode AND + // 4. All of its PHINode uses are Promotable AND + // 5. All of its PHINode operands are Promotable + static PHINodeSet getPromotablePHINodes(const Function &F) { + PHINodeSet Promotable; + // Condition 1 + for (auto &BB : F) + for (auto &I : BB) + if (const PHINode *P = dyn_cast(&I)) + if (P->getType()->isIntegerTy(1)) + Promotable.insert(P); + + SmallVector ToRemove; + for (const auto &P : Promotable) { + // Condition 2 and 3 + auto IsValidUser = [] (const Value *V) -> bool { + return isa(V) || isa(V) || isa(V) || + isa(V); + }; + auto IsValidOperand = [] (const Value *V) -> bool { + return isa(V) || isa(V) || isa(V) || + isa(V); + }; + const auto &Users = P->users(); + const auto &Operands = P->operands(); + if (!std::all_of(Users.begin(), Users.end(), IsValidUser) || + !std::all_of(Operands.begin(), Operands.end(), IsValidOperand)) + ToRemove.push_back(P); + } + + // Iterate to convergence + auto IsPromotable = [&Promotable] (const Value *V) -> bool { + const PHINode *Phi = dyn_cast(V); + return !Phi || Promotable.count(Phi); + }; + while (!ToRemove.empty()) { + for (auto &User : ToRemove) + Promotable.erase(User); + ToRemove.clear(); + + for (const auto &P : Promotable) { + // Condition 4 and 5 + const auto &Users = P->users(); + const auto &Operands = P->operands(); + if (!std::all_of(Users.begin(), Users.end(), IsPromotable) || + !std::all_of(Operands.begin(), Operands.end(), IsPromotable)) + ToRemove.push_back(P); + } + } + + return Promotable; + } + + typedef DenseMap B2IMap; + + public: + static char ID; + PPCBoolRetToInt() : FunctionPass(ID) { + initializePPCBoolRetToIntPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) { + PHINodeSet PromotablePHINodes = getPromotablePHINodes(F); + B2IMap Bool2IntMap; + bool Changed = false; + for (auto &BB : F) { + for (auto &I : BB) { + if (ReturnInst *R = dyn_cast(&I)) + if (F.getReturnType()->isIntegerTy(1)) + Changed |= + runOnUse(R->getOperandUse(0), PromotablePHINodes, Bool2IntMap); + + if (CallInst *CI = dyn_cast(&I)) + for (auto &U : CI->operands()) + if (U->getType()->isIntegerTy(1)) + Changed |= runOnUse(U, PromotablePHINodes, Bool2IntMap); + } + } + + return Changed; + } + + static bool runOnUse(Use &U, const PHINodeSet &PromotablePHINodes, + B2IMap &BoolToIntMap) { + auto Defs = findAllDefs(U); + + // If the values are 
all Constants or Arguments, don't bother + if (!std::any_of(Defs.begin(), Defs.end(), isa)) + return false; + + // Presently, we only know how to handle PHINode, Constant, and Arguments. + // Potentially, bitwise operations (AND, OR, XOR, NOT) and sign extension + // could also be handled in the future. + for (const auto &V : Defs) + if (!isa(V) && !isa(V) && !isa(V)) + return false; + + for (const auto &V : Defs) + if (const PHINode *P = dyn_cast(V)) + if (!PromotablePHINodes.count(P)) + return false; + + if (isa(U.getUser())) + ++NumBoolRetPromotion; + if (isa(U.getUser())) + ++NumBoolCallPromotion; + ++NumBoolToIntPromotion; + + for (const auto &V : Defs) + if (!BoolToIntMap.count(V)) + BoolToIntMap[V] = translate(V); + + // Replace the operands of the translated instructions. There were set to + // zero in the translate function. + for (auto &Pair : BoolToIntMap) { + User *First = dyn_cast(Pair.first); + User *Second = dyn_cast(Pair.second); + assert((!First || Second) && "translated from user to non-user!?"); + if (First) + for (unsigned i = 0; i < First->getNumOperands(); ++i) + Second->setOperand(i, BoolToIntMap[First->getOperand(i)]); + } + + Value *IntRetVal = BoolToIntMap[U]; + Type *Int1Ty = Type::getInt1Ty(U->getContext()); + Instruction *I = cast(U.getUser()); + Value *BackToBool = new TruncInst(IntRetVal, Int1Ty, "backToBool", I); + U.set(BackToBool); + + return true; + } + + void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addPreserved(); + FunctionPass::getAnalysisUsage(AU); + } +}; +} + +char PPCBoolRetToInt::ID = 0; +INITIALIZE_PASS(PPCBoolRetToInt, "bool-ret-to-int", + "Convert i1 constants to i32 if they are returned", + false, false) + +FunctionPass *llvm::createPPCBoolRetToIntPass() { return new PPCBoolRetToInt(); } diff --git a/lib/Target/PowerPC/PPCFrameLowering.cpp b/lib/Target/PowerPC/PPCFrameLowering.cpp index 174deb88bc5c..beab844c6025 100644 --- a/lib/Target/PowerPC/PPCFrameLowering.cpp +++ b/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -30,7 +30,7 @@ using namespace llvm; /// VRRegNo - Map from a numbered VR register to its enum value. /// -static const uint16_t VRRegNo[] = { +static const MCPhysReg VRRegNo[] = { PPC::V0 , PPC::V1 , PPC::V2 , PPC::V3 , PPC::V4 , PPC::V5 , PPC::V6 , PPC::V7 , PPC::V8 , PPC::V9 , PPC::V10, PPC::V11, PPC::V12, PPC::V13, PPC::V14, PPC::V15, PPC::V16, PPC::V17, PPC::V18, PPC::V19, PPC::V20, PPC::V21, PPC::V22, PPC::V23, diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 2261b71c5aa9..4dfa1650c1ad 100644 --- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -4180,16 +4180,24 @@ void PPCDAGToDAGISel::PeepholePPC64() { break; } - // If this is a load or store with a zero offset, we may be able to - // fold an add-immediate into the memory operation. - if (!isa(N->getOperand(FirstOp)) || - N->getConstantOperandVal(FirstOp) != 0) + // If this is a load or store with a zero offset, or within the alignment, + // we may be able to fold an add-immediate into the memory operation. + // The check against alignment is below, as it can't occur until we check + // the arguments to N + if (!isa(N->getOperand(FirstOp))) continue; SDValue Base = N->getOperand(FirstOp + 1); if (!Base.isMachineOpcode()) continue; + // On targets with fusion, we don't want this to fire and remove a fusion + // opportunity, unless a) it results in another fusion opportunity or + // b) optimizing for size. 
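As a minimal illustration of the displacement rule this peephole adds (a sketch, not code from the patch; the function and parameter names are invented for the example): an offset already carried by the load or store may only be absorbed into the global-address operand while it stays within the referenced global's alignment, so the combined address remains inside the same aligned block.

// Illustrative sketch only, assuming GVAlignment is the global's alignment in
// bytes (0 when the base is not a suitably aligned global, which keeps the old
// zero-offset-only behaviour).
#include <cstdint>

static bool canFoldDisplacement(int64_t Offset, uint64_t GVAlignment) {
  int64_t MaxDisplacement =
      GVAlignment ? static_cast<int64_t>(GVAlignment) - 1 : 0;
  return Offset >= 0 && Offset <= MaxDisplacement;
}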
+ if (PPCSubTarget->hasFusion() && + (!MF->getFunction()->optForSize() && !Base.hasOneUse())) + continue; + unsigned Flags = 0; bool ReplaceFlags = true; @@ -4233,6 +4241,17 @@ void PPCDAGToDAGISel::PeepholePPC64() { break; } + SDValue ImmOpnd = Base.getOperand(1); + int MaxDisplacement = 0; + if (GlobalAddressSDNode *GA = dyn_cast(ImmOpnd)) { + const GlobalValue *GV = GA->getGlobal(); + MaxDisplacement = GV->getAlignment() - 1; + } + + int Offset = N->getConstantOperandVal(FirstOp); + if (Offset < 0 || Offset > MaxDisplacement) + continue; + // We found an opportunity. Reverse the operands from the add // immediate and substitute them into the load or store. If // needed, update the target flags for the immediate operand to @@ -4243,8 +4262,6 @@ void PPCDAGToDAGISel::PeepholePPC64() { DEBUG(N->dump(CurDAG)); DEBUG(dbgs() << "\n"); - SDValue ImmOpnd = Base.getOperand(1); - // If the relocation information isn't already present on the // immediate operand, add it now. if (ReplaceFlags) { @@ -4255,17 +4272,17 @@ void PPCDAGToDAGISel::PeepholePPC64() { // is insufficient for the instruction encoding. if (GV->getAlignment() < 4 && (StorageOpcode == PPC::LD || StorageOpcode == PPC::STD || - StorageOpcode == PPC::LWA)) { + StorageOpcode == PPC::LWA || (Offset % 4) != 0)) { DEBUG(dbgs() << "Rejected this candidate for alignment.\n\n"); continue; } - ImmOpnd = CurDAG->getTargetGlobalAddress(GV, dl, MVT::i64, 0, Flags); + ImmOpnd = CurDAG->getTargetGlobalAddress(GV, dl, MVT::i64, Offset, Flags); } else if (ConstantPoolSDNode *CP = dyn_cast(ImmOpnd)) { const Constant *C = CP->getConstVal(); ImmOpnd = CurDAG->getTargetConstantPool(C, MVT::i64, CP->getAlignment(), - 0, Flags); + Offset, Flags); } } diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp index 51c85f6ea673..c17603a7718a 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -1748,13 +1748,13 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr *CmpInstr, MI->setDesc(NewDesc); if (NewDesc.ImplicitDefs) - for (const uint16_t *ImpDefs = NewDesc.getImplicitDefs(); + for (const MCPhysReg *ImpDefs = NewDesc.getImplicitDefs(); *ImpDefs; ++ImpDefs) if (!MI->definesRegister(*ImpDefs)) MI->addOperand(*MI->getParent()->getParent(), MachineOperand::CreateReg(*ImpDefs, true, true)); if (NewDesc.ImplicitUses) - for (const uint16_t *ImpUses = NewDesc.getImplicitUses(); + for (const MCPhysReg *ImpUses = NewDesc.getImplicitUses(); *ImpUses; ++ImpUses) if (!MI->readsRegister(*ImpUses)) MI->addOperand(*MI->getParent()->getParent(), diff --git a/lib/Target/PowerPC/PPCInstrVSX.td b/lib/Target/PowerPC/PPCInstrVSX.td index 2e54ef2ac0e3..0c3788c55c70 100644 --- a/lib/Target/PowerPC/PPCInstrVSX.td +++ b/lib/Target/PowerPC/PPCInstrVSX.td @@ -1266,16 +1266,14 @@ def MovesToVSR { dag LE_DWORD_0 = (XXPERMDI LE_DWORD_1, LE_DWORD_1, 2); } -/* Direct moves of various widths from VSR's to GPR's. Each moves the - respective element out of the VSR and ensures that it is lined up - to the right side of the GPR. In addition to the extraction from positions - specified by a constant, a pattern for extracting from a variable position - is provided. This is useful when the element number is not known at - compile time. +/* Patterns for extracting elements out of vectors. Integer elements are + extracted using direct move operations. Patterns for extracting elements + whose indices are not available at compile time are also provided with + various _VARIABLE_ patterns. 
The numbering for the DAG's is for LE, but when used on BE, the correct LE element can just be used (i.e. LE_BYTE_2 == BE_BYTE_13). */ -def MovesFromVSR { +def VectorExtractions { // Doubleword extraction dag LE_DWORD_0 = (MFVSRD @@ -1371,24 +1369,6 @@ def MovesFromVSR { dag LE_VARIABLE_BYTE = (EXTRACT_SUBREG (SRD LE_MV_VBYTE, LE_VBYTE_SHIFT), sub_32); - /* BE variable byte - The algorithm here is the same as the LE variable byte except: - - The shift in the VMX register is by 0/8 for opposite element numbers so - we simply AND the element number with 0x8 - - The order of elements after the move to GPR is reversed, so we invert - the bits of the index prior to truncating to the range 0-7 - */ - dag BE_VBYTE_PERM_VEC = (LVSL ZERO8, (ANDIo8 $Idx, 8)); - dag BE_VBYTE_PERMUTE = (VPERM $S, $S, BE_VBYTE_PERM_VEC); - dag BE_MV_VBYTE = (MFVSRD - (EXTRACT_SUBREG - (v2i64 (COPY_TO_REGCLASS BE_VBYTE_PERMUTE, VSRC)), - sub_64)); - dag BE_VBYTE_SHIFT = (EXTRACT_SUBREG (RLDICR (ANDC8 (LI8 7), $Idx), 3, 60), - sub_32); - dag BE_VARIABLE_BYTE = (EXTRACT_SUBREG (SRD BE_MV_VBYTE, BE_VBYTE_SHIFT), - sub_32); - /* LE variable halfword Number 1. above: - For elements 0-3, we shift left by 8 since they're on the right @@ -1421,6 +1401,88 @@ def MovesFromVSR { dag LE_VARIABLE_HALF = (EXTRACT_SUBREG (SRD LE_MV_VHALF, LE_VHALF_SHIFT), sub_32); + /* LE variable word + Number 1. above: + - For elements 0-1, we shift left by 8 since they're on the right + - For elements 2-3, we need not shift + */ + dag LE_VWORD_PERM_VEC = (LVSL ZERO8, (RLDICR (ANDC8 (LI8 2), $Idx), 2, 61)); + + // Number 2. above: + // - Now that we set up the shift amount, we shift in the VMX register + dag LE_VWORD_PERMUTE = (VPERM $S, $S, LE_VWORD_PERM_VEC); + + // Number 3. above: + // - The doubleword containing our element is moved to a GPR + dag LE_MV_VWORD = (MFVSRD + (EXTRACT_SUBREG + (v2i64 (COPY_TO_REGCLASS LE_VWORD_PERMUTE, VSRC)), + sub_64)); + + /* Number 4. above: + - Truncate the element number to the range 0-1 (2-3 are symmetrical + and out of range values are truncated accordingly) + - Multiply by 32 as we need to shift right by the number of bits + - Shift right in the GPR by the calculated value + */ + dag LE_VWORD_SHIFT = (EXTRACT_SUBREG (RLDICR (AND8 (LI8 1), $Idx), 5, 58), + sub_32); + dag LE_VARIABLE_WORD = (EXTRACT_SUBREG (SRD LE_MV_VWORD, LE_VWORD_SHIFT), + sub_32); + + /* LE variable doubleword + Number 1. above: + - For element 0, we shift left by 8 since it's on the right + - For element 1, we need not shift + */ + dag LE_VDWORD_PERM_VEC = (LVSL ZERO8, (RLDICR (ANDC8 (LI8 1), $Idx), 3, 60)); + + // Number 2. above: + // - Now that we set up the shift amount, we shift in the VMX register + dag LE_VDWORD_PERMUTE = (VPERM $S, $S, LE_VDWORD_PERM_VEC); + + // Number 3. above: + // - The doubleword containing our element is moved to a GPR + // - Number 4. is not needed for the doubleword as the value is 64-bits + dag LE_VARIABLE_DWORD = + (MFVSRD (EXTRACT_SUBREG + (v2i64 (COPY_TO_REGCLASS LE_VDWORD_PERMUTE, VSRC)), + sub_64)); + + /* LE variable float + - Shift the vector to line up the desired element to BE Word 0 + - Convert 32-bit float to a 64-bit single precision float + */ + dag LE_VFLOAT_PERM_VEC = (LVSL ZERO8, (RLDICR (XOR8 (LI8 3), $Idx), 2, 61)); + dag LE_VFLOAT_PERMUTE = (VPERM $S, $S, LE_VFLOAT_PERM_VEC); + dag LE_VARIABLE_FLOAT = (XSCVSPDPN LE_VFLOAT_PERMUTE); + + /* LE variable double + Same as the LE doubleword except there is no move. 
+ */ + dag LE_VDOUBLE_PERMUTE = (VPERM (COPY_TO_REGCLASS $S, VRRC), + (COPY_TO_REGCLASS $S, VRRC), + LE_VDWORD_PERM_VEC); + dag LE_VARIABLE_DOUBLE = (COPY_TO_REGCLASS LE_VDOUBLE_PERMUTE, VSRC); + + /* BE variable byte + The algorithm here is the same as the LE variable byte except: + - The shift in the VMX register is by 0/8 for opposite element numbers so + we simply AND the element number with 0x8 + - The order of elements after the move to GPR is reversed, so we invert + the bits of the index prior to truncating to the range 0-7 + */ + dag BE_VBYTE_PERM_VEC = (LVSL ZERO8, (ANDIo8 $Idx, 8)); + dag BE_VBYTE_PERMUTE = (VPERM $S, $S, BE_VBYTE_PERM_VEC); + dag BE_MV_VBYTE = (MFVSRD + (EXTRACT_SUBREG + (v2i64 (COPY_TO_REGCLASS BE_VBYTE_PERMUTE, VSRC)), + sub_64)); + dag BE_VBYTE_SHIFT = (EXTRACT_SUBREG (RLDICR (ANDC8 (LI8 7), $Idx), 3, 60), + sub_32); + dag BE_VARIABLE_BYTE = (EXTRACT_SUBREG (SRD BE_MV_VBYTE, BE_VBYTE_SHIFT), + sub_32); + /* BE variable halfword The algorithm here is the same as the LE variable halfword except: - The shift in the VMX register is by 0/8 for opposite element numbers so @@ -1434,10 +1496,54 @@ def MovesFromVSR { (EXTRACT_SUBREG (v2i64 (COPY_TO_REGCLASS BE_VHALF_PERMUTE, VSRC)), sub_64)); - dag BE_VHALF_SHIFT = (EXTRACT_SUBREG (RLDICR (ANDC8 (LI8 3), $Idx), 4, 60), + dag BE_VHALF_SHIFT = (EXTRACT_SUBREG (RLDICR (ANDC8 (LI8 3), $Idx), 4, 59), sub_32); dag BE_VARIABLE_HALF = (EXTRACT_SUBREG (SRD BE_MV_VHALF, BE_VHALF_SHIFT), sub_32); + + /* BE variable word + The algorithm is the same as the LE variable word except: + - The shift in the VMX register happens for opposite element numbers + - The order of elements after the move to GPR is reversed, so we invert + the bits of the index prior to truncating to the range 0-1 + */ + dag BE_VWORD_PERM_VEC = (LVSL ZERO8, (RLDICR (ANDIo8 $Idx, 2), 2, 61)); + dag BE_VWORD_PERMUTE = (VPERM $S, $S, BE_VWORD_PERM_VEC); + dag BE_MV_VWORD = (MFVSRD + (EXTRACT_SUBREG + (v2i64 (COPY_TO_REGCLASS BE_VWORD_PERMUTE, VSRC)), + sub_64)); + dag BE_VWORD_SHIFT = (EXTRACT_SUBREG (RLDICR (ANDC8 (LI8 1), $Idx), 5, 58), + sub_32); + dag BE_VARIABLE_WORD = (EXTRACT_SUBREG (SRD BE_MV_VWORD, BE_VWORD_SHIFT), + sub_32); + + /* BE variable doubleword + Same as the LE doubleword except we shift in the VMX register for opposite + element indices. + */ + dag BE_VDWORD_PERM_VEC = (LVSL ZERO8, (RLDICR (ANDIo8 $Idx, 1), 3, 60)); + dag BE_VDWORD_PERMUTE = (VPERM $S, $S, BE_VDWORD_PERM_VEC); + dag BE_VARIABLE_DWORD = + (MFVSRD (EXTRACT_SUBREG + (v2i64 (COPY_TO_REGCLASS BE_VDWORD_PERMUTE, VSRC)), + sub_64)); + + /* BE variable float + - Shift the vector to line up the desired element to BE Word 0 + - Convert 32-bit float to a 64-bit single precision float + */ + dag BE_VFLOAT_PERM_VEC = (LVSL ZERO8, (RLDICR $Idx, 2, 61)); + dag BE_VFLOAT_PERMUTE = (VPERM $S, $S, BE_VFLOAT_PERM_VEC); + dag BE_VARIABLE_FLOAT = (XSCVSPDPN BE_VFLOAT_PERMUTE); + + /* BE variable double + Same as the BE doubleword except there is no move. 
+ */ + dag BE_VDOUBLE_PERMUTE = (VPERM (COPY_TO_REGCLASS $S, VRRC), + (COPY_TO_REGCLASS $S, VRRC), + BE_VDWORD_PERM_VEC); + dag BE_VARIABLE_DOUBLE = (COPY_TO_REGCLASS BE_VDOUBLE_PERMUTE, VSRC); } // v4f32 scalar <-> vector conversions (BE) @@ -1452,8 +1558,15 @@ let Predicates = [IsBigEndian, HasP8Vector] in { (f32 (XSCVSPDPN (XXSLDWI $S, $S, 2)))>; def : Pat<(f32 (vector_extract v4f32:$S, 3)), (f32 (XSCVSPDPN (XXSLDWI $S, $S, 3)))>; + def : Pat<(f32 (vector_extract v4f32:$S, i64:$Idx)), + (f32 VectorExtractions.BE_VARIABLE_FLOAT)>; } // IsBigEndian, HasP8Vector +// Variable index vector_extract for v2f64 does not require P8Vector +let Predicates = [IsBigEndian, HasVSX] in + def : Pat<(f64 (vector_extract v2f64:$S, i64:$Idx)), + (f64 VectorExtractions.BE_VARIABLE_DOUBLE)>; + let Predicates = [IsBigEndian, HasDirectMove] in { // v16i8 scalar <-> vector conversions (BE) def : Pat<(v16i8 (scalar_to_vector i32:$A)), @@ -1465,75 +1578,79 @@ let Predicates = [IsBigEndian, HasDirectMove] in { def : Pat<(v2i64 (scalar_to_vector i64:$A)), (v2i64 (SUBREG_TO_REG (i64 1), MovesToVSR.BE_DWORD_0, sub_64))>; def : Pat<(i32 (vector_extract v16i8:$S, 0)), - (i32 MovesFromVSR.LE_BYTE_15)>; + (i32 VectorExtractions.LE_BYTE_15)>; def : Pat<(i32 (vector_extract v16i8:$S, 1)), - (i32 MovesFromVSR.LE_BYTE_14)>; + (i32 VectorExtractions.LE_BYTE_14)>; def : Pat<(i32 (vector_extract v16i8:$S, 2)), - (i32 MovesFromVSR.LE_BYTE_13)>; + (i32 VectorExtractions.LE_BYTE_13)>; def : Pat<(i32 (vector_extract v16i8:$S, 3)), - (i32 MovesFromVSR.LE_BYTE_12)>; + (i32 VectorExtractions.LE_BYTE_12)>; def : Pat<(i32 (vector_extract v16i8:$S, 4)), - (i32 MovesFromVSR.LE_BYTE_11)>; + (i32 VectorExtractions.LE_BYTE_11)>; def : Pat<(i32 (vector_extract v16i8:$S, 5)), - (i32 MovesFromVSR.LE_BYTE_10)>; + (i32 VectorExtractions.LE_BYTE_10)>; def : Pat<(i32 (vector_extract v16i8:$S, 6)), - (i32 MovesFromVSR.LE_BYTE_9)>; + (i32 VectorExtractions.LE_BYTE_9)>; def : Pat<(i32 (vector_extract v16i8:$S, 7)), - (i32 MovesFromVSR.LE_BYTE_8)>; + (i32 VectorExtractions.LE_BYTE_8)>; def : Pat<(i32 (vector_extract v16i8:$S, 8)), - (i32 MovesFromVSR.LE_BYTE_7)>; + (i32 VectorExtractions.LE_BYTE_7)>; def : Pat<(i32 (vector_extract v16i8:$S, 9)), - (i32 MovesFromVSR.LE_BYTE_6)>; + (i32 VectorExtractions.LE_BYTE_6)>; def : Pat<(i32 (vector_extract v16i8:$S, 10)), - (i32 MovesFromVSR.LE_BYTE_5)>; + (i32 VectorExtractions.LE_BYTE_5)>; def : Pat<(i32 (vector_extract v16i8:$S, 11)), - (i32 MovesFromVSR.LE_BYTE_4)>; + (i32 VectorExtractions.LE_BYTE_4)>; def : Pat<(i32 (vector_extract v16i8:$S, 12)), - (i32 MovesFromVSR.LE_BYTE_3)>; + (i32 VectorExtractions.LE_BYTE_3)>; def : Pat<(i32 (vector_extract v16i8:$S, 13)), - (i32 MovesFromVSR.LE_BYTE_2)>; + (i32 VectorExtractions.LE_BYTE_2)>; def : Pat<(i32 (vector_extract v16i8:$S, 14)), - (i32 MovesFromVSR.LE_BYTE_1)>; + (i32 VectorExtractions.LE_BYTE_1)>; def : Pat<(i32 (vector_extract v16i8:$S, 15)), - (i32 MovesFromVSR.LE_BYTE_0)>; + (i32 VectorExtractions.LE_BYTE_0)>; def : Pat<(i32 (vector_extract v16i8:$S, i64:$Idx)), - (i32 MovesFromVSR.BE_VARIABLE_BYTE)>; + (i32 VectorExtractions.BE_VARIABLE_BYTE)>; // v8i16 scalar <-> vector conversions (BE) def : Pat<(i32 (vector_extract v8i16:$S, 0)), - (i32 MovesFromVSR.LE_HALF_7)>; + (i32 VectorExtractions.LE_HALF_7)>; def : Pat<(i32 (vector_extract v8i16:$S, 1)), - (i32 MovesFromVSR.LE_HALF_6)>; + (i32 VectorExtractions.LE_HALF_6)>; def : Pat<(i32 (vector_extract v8i16:$S, 2)), - (i32 MovesFromVSR.LE_HALF_5)>; + (i32 VectorExtractions.LE_HALF_5)>; def : Pat<(i32 
(vector_extract v8i16:$S, 3)), - (i32 MovesFromVSR.LE_HALF_4)>; + (i32 VectorExtractions.LE_HALF_4)>; def : Pat<(i32 (vector_extract v8i16:$S, 4)), - (i32 MovesFromVSR.LE_HALF_3)>; + (i32 VectorExtractions.LE_HALF_3)>; def : Pat<(i32 (vector_extract v8i16:$S, 5)), - (i32 MovesFromVSR.LE_HALF_2)>; + (i32 VectorExtractions.LE_HALF_2)>; def : Pat<(i32 (vector_extract v8i16:$S, 6)), - (i32 MovesFromVSR.LE_HALF_1)>; + (i32 VectorExtractions.LE_HALF_1)>; def : Pat<(i32 (vector_extract v8i16:$S, 7)), - (i32 MovesFromVSR.LE_HALF_0)>; + (i32 VectorExtractions.LE_HALF_0)>; def : Pat<(i32 (vector_extract v8i16:$S, i64:$Idx)), - (i32 MovesFromVSR.BE_VARIABLE_HALF)>; + (i32 VectorExtractions.BE_VARIABLE_HALF)>; // v4i32 scalar <-> vector conversions (BE) def : Pat<(i32 (vector_extract v4i32:$S, 0)), - (i32 MovesFromVSR.LE_WORD_3)>; + (i32 VectorExtractions.LE_WORD_3)>; def : Pat<(i32 (vector_extract v4i32:$S, 1)), - (i32 MovesFromVSR.LE_WORD_2)>; + (i32 VectorExtractions.LE_WORD_2)>; def : Pat<(i32 (vector_extract v4i32:$S, 2)), - (i32 MovesFromVSR.LE_WORD_1)>; + (i32 VectorExtractions.LE_WORD_1)>; def : Pat<(i32 (vector_extract v4i32:$S, 3)), - (i32 MovesFromVSR.LE_WORD_0)>; + (i32 VectorExtractions.LE_WORD_0)>; + def : Pat<(i32 (vector_extract v4i32:$S, i64:$Idx)), + (i32 VectorExtractions.BE_VARIABLE_WORD)>; // v2i64 scalar <-> vector conversions (BE) def : Pat<(i64 (vector_extract v2i64:$S, 0)), - (i64 MovesFromVSR.LE_DWORD_1)>; + (i64 VectorExtractions.LE_DWORD_1)>; def : Pat<(i64 (vector_extract v2i64:$S, 1)), - (i64 MovesFromVSR.LE_DWORD_0)>; + (i64 VectorExtractions.LE_DWORD_0)>; + def : Pat<(i64 (vector_extract v2i64:$S, i64:$Idx)), + (i64 VectorExtractions.BE_VARIABLE_DWORD)>; } // IsBigEndian, HasDirectMove // v4f32 scalar <-> vector conversions (LE) @@ -1548,8 +1665,15 @@ let Predicates = [IsLittleEndian, HasP8Vector] in { (f32 (XSCVSPDPN (XXSLDWI $S, $S, 1)))>; def : Pat<(f32 (vector_extract v4f32:$S, 3)), (f32 (XSCVSPDPN $S))>; + def : Pat<(f32 (vector_extract v4f32:$S, i64:$Idx)), + (f32 VectorExtractions.LE_VARIABLE_FLOAT)>; } // IsLittleEndian, HasP8Vector +// Variable index vector_extract for v2f64 does not require P8Vector +let Predicates = [IsLittleEndian, HasVSX] in + def : Pat<(f64 (vector_extract v2f64:$S, i64:$Idx)), + (f64 VectorExtractions.LE_VARIABLE_DOUBLE)>; + let Predicates = [IsLittleEndian, HasDirectMove] in { // v16i8 scalar <-> vector conversions (LE) def : Pat<(v16i8 (scalar_to_vector i32:$A)), @@ -1561,73 +1685,77 @@ let Predicates = [IsLittleEndian, HasDirectMove] in { def : Pat<(v2i64 (scalar_to_vector i64:$A)), (v2i64 MovesToVSR.LE_DWORD_0)>; def : Pat<(i32 (vector_extract v16i8:$S, 0)), - (i32 MovesFromVSR.LE_BYTE_0)>; + (i32 VectorExtractions.LE_BYTE_0)>; def : Pat<(i32 (vector_extract v16i8:$S, 1)), - (i32 MovesFromVSR.LE_BYTE_1)>; + (i32 VectorExtractions.LE_BYTE_1)>; def : Pat<(i32 (vector_extract v16i8:$S, 2)), - (i32 MovesFromVSR.LE_BYTE_2)>; + (i32 VectorExtractions.LE_BYTE_2)>; def : Pat<(i32 (vector_extract v16i8:$S, 3)), - (i32 MovesFromVSR.LE_BYTE_3)>; + (i32 VectorExtractions.LE_BYTE_3)>; def : Pat<(i32 (vector_extract v16i8:$S, 4)), - (i32 MovesFromVSR.LE_BYTE_4)>; + (i32 VectorExtractions.LE_BYTE_4)>; def : Pat<(i32 (vector_extract v16i8:$S, 5)), - (i32 MovesFromVSR.LE_BYTE_5)>; + (i32 VectorExtractions.LE_BYTE_5)>; def : Pat<(i32 (vector_extract v16i8:$S, 6)), - (i32 MovesFromVSR.LE_BYTE_6)>; + (i32 VectorExtractions.LE_BYTE_6)>; def : Pat<(i32 (vector_extract v16i8:$S, 7)), - (i32 MovesFromVSR.LE_BYTE_7)>; + (i32 VectorExtractions.LE_BYTE_7)>; def 
: Pat<(i32 (vector_extract v16i8:$S, 8)), - (i32 MovesFromVSR.LE_BYTE_8)>; + (i32 VectorExtractions.LE_BYTE_8)>; def : Pat<(i32 (vector_extract v16i8:$S, 9)), - (i32 MovesFromVSR.LE_BYTE_9)>; + (i32 VectorExtractions.LE_BYTE_9)>; def : Pat<(i32 (vector_extract v16i8:$S, 10)), - (i32 MovesFromVSR.LE_BYTE_10)>; + (i32 VectorExtractions.LE_BYTE_10)>; def : Pat<(i32 (vector_extract v16i8:$S, 11)), - (i32 MovesFromVSR.LE_BYTE_11)>; + (i32 VectorExtractions.LE_BYTE_11)>; def : Pat<(i32 (vector_extract v16i8:$S, 12)), - (i32 MovesFromVSR.LE_BYTE_12)>; + (i32 VectorExtractions.LE_BYTE_12)>; def : Pat<(i32 (vector_extract v16i8:$S, 13)), - (i32 MovesFromVSR.LE_BYTE_13)>; + (i32 VectorExtractions.LE_BYTE_13)>; def : Pat<(i32 (vector_extract v16i8:$S, 14)), - (i32 MovesFromVSR.LE_BYTE_14)>; + (i32 VectorExtractions.LE_BYTE_14)>; def : Pat<(i32 (vector_extract v16i8:$S, 15)), - (i32 MovesFromVSR.LE_BYTE_15)>; + (i32 VectorExtractions.LE_BYTE_15)>; def : Pat<(i32 (vector_extract v16i8:$S, i64:$Idx)), - (i32 MovesFromVSR.LE_VARIABLE_BYTE)>; + (i32 VectorExtractions.LE_VARIABLE_BYTE)>; // v8i16 scalar <-> vector conversions (LE) def : Pat<(i32 (vector_extract v8i16:$S, 0)), - (i32 MovesFromVSR.LE_HALF_0)>; + (i32 VectorExtractions.LE_HALF_0)>; def : Pat<(i32 (vector_extract v8i16:$S, 1)), - (i32 MovesFromVSR.LE_HALF_1)>; + (i32 VectorExtractions.LE_HALF_1)>; def : Pat<(i32 (vector_extract v8i16:$S, 2)), - (i32 MovesFromVSR.LE_HALF_2)>; + (i32 VectorExtractions.LE_HALF_2)>; def : Pat<(i32 (vector_extract v8i16:$S, 3)), - (i32 MovesFromVSR.LE_HALF_3)>; + (i32 VectorExtractions.LE_HALF_3)>; def : Pat<(i32 (vector_extract v8i16:$S, 4)), - (i32 MovesFromVSR.LE_HALF_4)>; + (i32 VectorExtractions.LE_HALF_4)>; def : Pat<(i32 (vector_extract v8i16:$S, 5)), - (i32 MovesFromVSR.LE_HALF_5)>; + (i32 VectorExtractions.LE_HALF_5)>; def : Pat<(i32 (vector_extract v8i16:$S, 6)), - (i32 MovesFromVSR.LE_HALF_6)>; + (i32 VectorExtractions.LE_HALF_6)>; def : Pat<(i32 (vector_extract v8i16:$S, 7)), - (i32 MovesFromVSR.LE_HALF_7)>; + (i32 VectorExtractions.LE_HALF_7)>; def : Pat<(i32 (vector_extract v8i16:$S, i64:$Idx)), - (i32 MovesFromVSR.LE_VARIABLE_HALF)>; + (i32 VectorExtractions.LE_VARIABLE_HALF)>; // v4i32 scalar <-> vector conversions (LE) def : Pat<(i32 (vector_extract v4i32:$S, 0)), - (i32 MovesFromVSR.LE_WORD_0)>; + (i32 VectorExtractions.LE_WORD_0)>; def : Pat<(i32 (vector_extract v4i32:$S, 1)), - (i32 MovesFromVSR.LE_WORD_1)>; + (i32 VectorExtractions.LE_WORD_1)>; def : Pat<(i32 (vector_extract v4i32:$S, 2)), - (i32 MovesFromVSR.LE_WORD_2)>; + (i32 VectorExtractions.LE_WORD_2)>; def : Pat<(i32 (vector_extract v4i32:$S, 3)), - (i32 MovesFromVSR.LE_WORD_3)>; + (i32 VectorExtractions.LE_WORD_3)>; + def : Pat<(i32 (vector_extract v4i32:$S, i64:$Idx)), + (i32 VectorExtractions.LE_VARIABLE_WORD)>; // v2i64 scalar <-> vector conversions (LE) def : Pat<(i64 (vector_extract v2i64:$S, 0)), - (i64 MovesFromVSR.LE_DWORD_0)>; + (i64 VectorExtractions.LE_DWORD_0)>; def : Pat<(i64 (vector_extract v2i64:$S, 1)), - (i64 MovesFromVSR.LE_DWORD_1)>; + (i64 VectorExtractions.LE_DWORD_1)>; + def : Pat<(i64 (vector_extract v2i64:$S, i64:$Idx)), + (i64 VectorExtractions.LE_VARIABLE_DWORD)>; } // IsLittleEndian, HasDirectMove diff --git a/lib/Target/PowerPC/PPCTargetMachine.cpp b/lib/Target/PowerPC/PPCTargetMachine.cpp index 24a9ef0ef077..946e0f10cddd 100644 --- a/lib/Target/PowerPC/PPCTargetMachine.cpp +++ b/lib/Target/PowerPC/PPCTargetMachine.cpp @@ -71,6 +71,9 @@ extern "C" void LLVMInitializePowerPCTarget() { RegisterTargetMachine 
A(ThePPC32Target); RegisterTargetMachine B(ThePPC64Target); RegisterTargetMachine C(ThePPC64LETarget); + + PassRegistry &PR = *PassRegistry::getPassRegistry(); + initializePPCBoolRetToIntPass(PR); } /// Return the datalayout string of a subtarget. @@ -286,6 +289,8 @@ TargetPassConfig *PPCTargetMachine::createPassConfig(PassManagerBase &PM) { } void PPCPassConfig::addIRPasses() { + if (TM->getOptLevel() != CodeGenOpt::None) + addPass(createPPCBoolRetToIntPass()); addPass(createAtomicExpandPass(&getPPCTargetMachine())); // For the BG/Q (or if explicitly requested), add explicit data prefetch diff --git a/lib/Target/PowerPC/PPCVSXFMAMutate.cpp b/lib/Target/PowerPC/PPCVSXFMAMutate.cpp index 0bd2bd84db06..6b19a2f7118b 100644 --- a/lib/Target/PowerPC/PPCVSXFMAMutate.cpp +++ b/lib/Target/PowerPC/PPCVSXFMAMutate.cpp @@ -220,6 +220,14 @@ namespace { if (OldFMAReg == KilledProdReg) continue; + // If there isn't a class that fits, we can't perform the transform. + // This is needed for correctness with a mixture of VSX and Altivec + // instructions to make sure that a low VSX register is not assigned to + // the Altivec instruction. + if (!MRI.constrainRegClass(KilledProdReg, + MRI.getRegClass(OldFMAReg))) + continue; + assert(OldFMAReg == AddendMI->getOperand(0).getReg() && "Addend copy not tied to old FMA output!"); @@ -262,8 +270,7 @@ namespace { if (UseMI == AddendMI) continue; - UseMO.setReg(KilledProdReg); - UseMO.setSubReg(KilledProdSubReg); + UseMO.substVirtReg(KilledProdReg, KilledProdSubReg, *TRI); } // Extend the live intervals of the killed product operand to hold the diff --git a/lib/Target/SystemZ/SystemZInstrInfo.cpp b/lib/Target/SystemZ/SystemZInstrInfo.cpp index 85aa983da3a6..e6b5fc8e6235 100644 --- a/lib/Target/SystemZ/SystemZInstrInfo.cpp +++ b/lib/Target/SystemZ/SystemZInstrInfo.cpp @@ -676,7 +676,8 @@ SystemZInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, LiveVariables *LV) const { MachineInstr *MI = MBBI; MachineBasicBlock *MBB = MI->getParent(); - MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + MachineFunction *MF = MBB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); unsigned Opcode = MI->getOpcode(); unsigned NumOps = MI->getNumOperands(); @@ -703,14 +704,19 @@ SystemZInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, } int ThreeOperandOpcode = SystemZ::getThreeOperandOpcode(Opcode); if (ThreeOperandOpcode >= 0) { - MachineInstrBuilder MIB = - BuildMI(*MBB, MBBI, MI->getDebugLoc(), get(ThreeOperandOpcode)) - .addOperand(Dest); + // Create three address instruction without adding the implicit + // operands. Those will instead be copied over from the original + // instruction by the loop below. + MachineInstrBuilder MIB(*MF, + MF->CreateMachineInstr(get(ThreeOperandOpcode), + MI->getDebugLoc(), /*NoImplicit=*/true)); + MIB.addOperand(Dest); // Keep the kill state, but drop the tied flag. MIB.addReg(Src.getReg(), getKillRegState(Src.isKill()), Src.getSubReg()); // Keep the remaining operands as-is. for (unsigned I = 2; I < NumOps; ++I) MIB.addOperand(MI->getOperand(I)); + MBB->insert(MI, MIB); return finishConvertToThreeAddress(MI, MIB, LV); } } diff --git a/lib/Target/SystemZ/SystemZTargetMachine.cpp b/lib/Target/SystemZ/SystemZTargetMachine.cpp index 22beaad2ab73..f305e85f6cfe 100644 --- a/lib/Target/SystemZ/SystemZTargetMachine.cpp +++ b/lib/Target/SystemZ/SystemZTargetMachine.cpp @@ -16,6 +16,7 @@ using namespace llvm; +extern cl::opt MISchedPostRA; extern "C" void LLVMInitializeSystemZTarget() { // Register the target. 
RegisterTargetMachine X(TheSystemZTarget); @@ -163,6 +164,16 @@ void SystemZPassConfig::addPreEmitPass() { if (getOptLevel() != CodeGenOpt::None) addPass(createSystemZElimComparePass(getSystemZTargetMachine()), false); addPass(createSystemZLongBranchPass(getSystemZTargetMachine())); + + // Do final scheduling after all other optimizations, to get an + // optimal input for the decoder (branch relaxation must happen + // after block placement). + if (getOptLevel() != CodeGenOpt::None) { + if (MISchedPostRA) + addPass(&PostMachineSchedulerID); + else + addPass(&PostRASchedulerID); + } } TargetPassConfig *SystemZTargetMachine::createPassConfig(PassManagerBase &PM) { diff --git a/lib/Target/SystemZ/SystemZTargetMachine.h b/lib/Target/SystemZ/SystemZTargetMachine.h index 0a81e1f9fdf9..1a8f1f7f3aaa 100644 --- a/lib/Target/SystemZ/SystemZTargetMachine.h +++ b/lib/Target/SystemZ/SystemZTargetMachine.h @@ -43,6 +43,9 @@ class SystemZTargetMachine : public LLVMTargetMachine { TargetLoweringObjectFile *getObjFileLowering() const override { return TLOF.get(); } + + bool targetSchedulesPostRAScheduling() const override { return true; }; + }; } // end namespace llvm diff --git a/lib/Target/WebAssembly/CMakeLists.txt b/lib/Target/WebAssembly/CMakeLists.txt index 6a5894958e32..b5177265704c 100644 --- a/lib/Target/WebAssembly/CMakeLists.txt +++ b/lib/Target/WebAssembly/CMakeLists.txt @@ -11,6 +11,7 @@ add_public_tablegen_target(WebAssemblyCommonTableGen) add_llvm_target(WebAssemblyCodeGen Relooper.cpp + WebAssemblyArgumentMove.cpp WebAssemblyAsmPrinter.cpp WebAssemblyCFGStackify.cpp WebAssemblyFastISel.cpp @@ -18,6 +19,7 @@ add_llvm_target(WebAssemblyCodeGen WebAssemblyISelDAGToDAG.cpp WebAssemblyISelLowering.cpp WebAssemblyInstrInfo.cpp + WebAssemblyLowerBrUnless.cpp WebAssemblyMachineFunctionInfo.cpp WebAssemblyMCInstLower.cpp WebAssemblyOptimizeReturned.cpp diff --git a/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp b/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp index 9b94806c9533..3a151dec16f3 100644 --- a/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp +++ b/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp @@ -98,22 +98,7 @@ void WebAssemblyInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, case WebAssembly::PARAM: case WebAssembly::RESULT: case WebAssembly::LOCAL: - switch (Op.getImm()) { - case MVT::i32: - O << "i32"; - break; - case MVT::i64: - O << "i64"; - break; - case MVT::f32: - O << "f32"; - break; - case MVT::f64: - O << "f64"; - break; - default: - llvm_unreachable("unexpected type"); - } + O << WebAssembly::TypeToString(MVT::SimpleValueType(Op.getImm())); break; default: O << Op.getImm(); @@ -126,3 +111,18 @@ void WebAssemblyInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, Op.getExpr()->print(O, &MAI); } } + +const char *llvm::WebAssembly::TypeToString(MVT Ty) { + switch (Ty.SimpleTy) { + case MVT::i32: + return "i32"; + case MVT::i64: + return "i64"; + case MVT::f32: + return "f32"; + case MVT::f64: + return "f64"; + default: + llvm_unreachable("unsupported type"); + } +} diff --git a/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h b/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h index 319c8ee1d5d9..c585d44f0150 100644 --- a/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h +++ b/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h @@ -16,7 +16,7 @@ #define LLVM_LIB_TARGET_WEBASSEMBLY_INSTPRINTER_WEBASSEMBLYINSTPRINTER_H #include "llvm/MC/MCInstPrinter.h" -#include 
"llvm/Support/raw_ostream.h" +#include "llvm/CodeGen/MachineValueType.h" namespace llvm { @@ -40,6 +40,12 @@ class WebAssemblyInstPrinter final : public MCInstPrinter { static const char *getRegisterName(unsigned RegNo); }; +namespace WebAssembly { + +const char *TypeToString(MVT Ty); + +} // end namespace WebAssembly + } // end namespace llvm #endif diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h index 6ef01ffade43..b13dd148adf3 100644 --- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h +++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h @@ -16,7 +16,6 @@ #define LLVM_LIB_TARGET_WEBASSEMBLY_MCTARGETDESC_WEBASSEMBLYMCTARGETDESC_H #include "llvm/Support/DataTypes.h" -#include namespace llvm { diff --git a/lib/Target/WebAssembly/README.txt b/lib/Target/WebAssembly/README.txt index bfb124d504eb..0e7aa23ac3fe 100644 --- a/lib/Target/WebAssembly/README.txt +++ b/lib/Target/WebAssembly/README.txt @@ -12,6 +12,16 @@ binary encoding of WebAssembly itself: * https://github.com/WebAssembly/design/blob/master/AstSemantics.md * https://github.com/WebAssembly/design/blob/master/BinaryEncoding.md +The backend is built, tested and archived on the following waterfall: + https://build.chromium.org/p/client.wasm.llvm/console + +The backend's bringup is done using the GCC torture test suite first since it +doesn't require C library support. Current known failures are in +known_gcc_test_failures.txt, all other tests should pass. The waterfall will +turn red if not. Once most of these pass, further testing will use LLVM's own +test suite. The tests can be run locally using: + github.com/WebAssembly/experimental/blob/master/buildbot/torture_test.py + Interesting work that remains to be done: * Write a pass to restructurize irreducible control flow. This needs to be done before register allocation to be efficient, because it may duplicate basic @@ -38,3 +48,46 @@ expression stack across the jump (sometimes). We should (a) model this, and (b) extend the stackifier to utilize it. //===---------------------------------------------------------------------===// + +The min/max operators aren't exactly a(); + AU.addPreservedID(MachineDominatorsID); + MachineFunctionPass::getAnalysisUsage(AU); + } + + bool runOnMachineFunction(MachineFunction &MF) override; +}; +} // end anonymous namespace + +char WebAssemblyArgumentMove::ID = 0; +FunctionPass *llvm::createWebAssemblyArgumentMove() { + return new WebAssemblyArgumentMove(); +} + +/// Test whether the given instruction is an ARGUMENT. +static bool IsArgument(const MachineInstr *MI) { + switch (MI->getOpcode()) { + case WebAssembly::ARGUMENT_I32: + case WebAssembly::ARGUMENT_I64: + case WebAssembly::ARGUMENT_F32: + case WebAssembly::ARGUMENT_F64: + return true; + default: + return false; + } +} + +bool WebAssemblyArgumentMove::runOnMachineFunction(MachineFunction &MF) { + DEBUG({ + dbgs() << "********** Argument Move **********\n" + << "********** Function: " << MF.getName() << '\n'; + }); + + bool Changed = false; + MachineBasicBlock &EntryMBB = MF.front(); + MachineBasicBlock::iterator InsertPt = EntryMBB.end(); + + // Look for the first NonArg instruction. + for (auto MII = EntryMBB.begin(), MIE = EntryMBB.end(); MII != MIE; ++MII) { + MachineInstr *MI = MII; + if (!IsArgument(MI)) { + InsertPt = MII; + break; + } + } + + // Now move any argument instructions later in the block + // to before our first NonArg instruction. 
+ for (auto I = InsertPt, E = EntryMBB.end(); I != E; ++I) { + MachineInstr *MI = I; + if (IsArgument(MI)) { + EntryMBB.insert(InsertPt, MI->removeFromParent()); + Changed = true; + } + } + + return Changed; +} diff --git a/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp index cfabc21ea4d9..fc2928079d2c 100644 --- a/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp +++ b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp @@ -57,10 +57,6 @@ class WebAssemblyAsmPrinter final : public AsmPrinter { // MachineFunctionPass Implementation. //===------------------------------------------------------------------===// - void getAnalysisUsage(AnalysisUsage &AU) const override { - AsmPrinter::getAnalysisUsage(AU); - } - bool runOnMachineFunction(MachineFunction &MF) override { MRI = &MF.getRegInfo(); MFI = MF.getInfo(); @@ -115,21 +111,7 @@ std::string WebAssemblyAsmPrinter::regToString(const MachineOperand &MO) { } const char *WebAssemblyAsmPrinter::toString(MVT VT) const { - switch (VT.SimpleTy) { - default: - break; - case MVT::f32: - return "f32"; - case MVT::f64: - return "f64"; - case MVT::i32: - return "i32"; - case MVT::i64: - return "i64"; - } - DEBUG(dbgs() << "Invalid type " << EVT(VT).getEVTString() << '\n'); - llvm_unreachable("invalid type"); - return ""; + return WebAssembly::TypeToString(VT); } //===----------------------------------------------------------------------===// diff --git a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 2485df1ab5d2..4ef0846d311a 100644 --- a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -118,6 +118,13 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering( setOperationAction(ISD::ExternalSymbol, MVTPtr, Custom); setOperationAction(ISD::JumpTable, MVTPtr, Custom); + // Take the default expansion for va_arg, va_copy, and va_end. There is no + // default action for va_start, so we do that custom. + setOperationAction(ISD::VASTART, MVT::Other, Custom); + setOperationAction(ISD::VAARG, MVT::Other, Expand); + setOperationAction(ISD::VACOPY, MVT::Other, Expand); + setOperationAction(ISD::VAEND, MVT::Other, Expand); + for (auto T : {MVT::f32, MVT::f64}) { // Don't expand the floating-point types to constant pools. setOperationAction(ISD::ConstantFP, T, Legal); @@ -126,7 +133,8 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering( ISD::SETULT, ISD::SETULE, ISD::SETUGT, ISD::SETUGE}) setCondCodeAction(CC, T, Expand); // Expand floating-point library function operators. - for (auto Op : {ISD::FSIN, ISD::FCOS, ISD::FSINCOS, ISD::FPOWI, ISD::FPOW}) + for (auto Op : {ISD::FSIN, ISD::FCOS, ISD::FSINCOS, ISD::FPOWI, ISD::FPOW, + ISD::FREM, ISD::FMA}) setOperationAction(Op, T, Expand); // Note supported floating-point library function operators that otherwise // default to expand. @@ -151,7 +159,7 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering( // As a special case, these operators use the type to mean the type to // sign-extend from. - for (auto T : {MVT::i1, MVT::i8, MVT::i16}) + for (auto T : {MVT::i1, MVT::i8, MVT::i16, MVT::i32}) setOperationAction(ISD::SIGN_EXTEND_INREG, T, Expand); // Dynamic stack allocation: use the default expansion. @@ -171,7 +179,7 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering( // - Floating-point extending loads. // - Floating-point truncating stores. // - i1 extending loads. 
- setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f64, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand); setTruncStoreAction(MVT::f64, MVT::f32, Expand); for (auto T : MVT::integer_valuetypes()) for (auto Ext : {ISD::EXTLOAD, ISD::ZEXTLOAD, ISD::SEXTLOAD}) @@ -188,14 +196,19 @@ FastISel *WebAssemblyTargetLowering::createFastISel( bool WebAssemblyTargetLowering::isOffsetFoldingLegal( const GlobalAddressSDNode * /*GA*/) const { - // The WebAssembly target doesn't support folding offsets into global - // addresses. - return false; + // All offsets can be folded. + return true; } MVT WebAssemblyTargetLowering::getScalarShiftAmountTy(const DataLayout & /*DL*/, EVT VT) const { - return VT.getSimpleVT(); + unsigned BitWidth = NextPowerOf2(VT.getSizeInBits() - 1); + if (BitWidth > 1 && BitWidth < 8) + BitWidth = 8; + MVT Result = MVT::getIntegerVT(BitWidth); + assert(Result != MVT::INVALID_SIMPLE_VALUE_TYPE && + "Unable to represent scalar shift amount type"); + return Result; } const char * @@ -220,10 +233,13 @@ WebAssemblyTargetLowering::getRegForInlineAsmConstraint( if (Constraint.size() == 1) { switch (Constraint[0]) { case 'r': - if (VT == MVT::i32) - return std::make_pair(0U, &WebAssembly::I32RegClass); - if (VT == MVT::i64) - return std::make_pair(0U, &WebAssembly::I64RegClass); + assert(VT != MVT::iPTR && "Pointer MVT not expected here"); + if (VT.isInteger() && !VT.isVector()) { + if (VT.getSizeInBits() <= 32) + return std::make_pair(0U, &WebAssembly::I32RegClass); + if (VT.getSizeInBits() <= 64) + return std::make_pair(0U, &WebAssembly::I64RegClass); + } break; default: break; @@ -257,6 +273,19 @@ static void fail(SDLoc DL, SelectionDAG &DAG, const char *msg) { DiagnosticInfoUnsupported(DL, *MF.getFunction(), msg, SDValue())); } +// Test whether the given calling convention is supported. +static bool CallingConvSupported(CallingConv::ID CallConv) { + // We currently support the language-independent target-independent + // conventions. We don't yet have a way to annotate calls with properties like + // "cold", and we don't have any call-clobbered registers, so these are mostly + // all handled the same. 
+ return CallConv == CallingConv::C || CallConv == CallingConv::Fast || + CallConv == CallingConv::Cold || + CallConv == CallingConv::PreserveMost || + CallConv == CallingConv::PreserveAll || + CallConv == CallingConv::CXX_FAST_TLS; +} + SDValue WebAssemblyTargetLowering::LowerCall(CallLoweringInfo &CLI, SmallVectorImpl &InVals) const { @@ -267,8 +296,7 @@ WebAssemblyTargetLowering::LowerCall(CallLoweringInfo &CLI, MachineFunction &MF = DAG.getMachineFunction(); CallingConv::ID CallConv = CLI.CallConv; - if (CallConv != CallingConv::C && CallConv != CallingConv::Fast && - CallConv != CallingConv::Cold) + if (!CallingConvSupported(CallConv)) fail(DL, DAG, "WebAssembly doesn't support language-specific or target-specific " "calling conventions yet"); @@ -289,28 +317,97 @@ WebAssemblyTargetLowering::LowerCall(CallLoweringInfo &CLI, if (Ins.size() > 1) fail(DL, DAG, "WebAssembly doesn't support more than 1 returned value yet"); + SmallVectorImpl &Outs = CLI.Outs; + for (const ISD::OutputArg &Out : Outs) { + if (Out.Flags.isByVal()) + fail(DL, DAG, "WebAssembly hasn't implemented byval arguments"); + if (Out.Flags.isNest()) + fail(DL, DAG, "WebAssembly hasn't implemented nest arguments"); + if (Out.Flags.isInAlloca()) + fail(DL, DAG, "WebAssembly hasn't implemented inalloca arguments"); + if (Out.Flags.isInConsecutiveRegs()) + fail(DL, DAG, "WebAssembly hasn't implemented cons regs arguments"); + if (Out.Flags.isInConsecutiveRegsLast()) + fail(DL, DAG, "WebAssembly hasn't implemented cons regs last arguments"); + } + bool IsVarArg = CLI.IsVarArg; - if (IsVarArg) - fail(DL, DAG, "WebAssembly doesn't support varargs yet"); + unsigned NumFixedArgs = CLI.NumFixedArgs; + auto PtrVT = getPointerTy(MF.getDataLayout()); // Analyze operands of the call, assigning locations to each operand. SmallVector ArgLocs; CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); - unsigned NumBytes = CCInfo.getNextStackOffset(); - auto PtrVT = getPointerTy(MF.getDataLayout()); - auto Zero = DAG.getConstant(0, DL, PtrVT, true); + if (IsVarArg) { + // Outgoing non-fixed arguments are placed at the top of the stack. First + // compute their offsets and the total amount of argument stack space + // needed. + for (SDValue Arg : + make_range(OutVals.begin() + NumFixedArgs, OutVals.end())) { + EVT VT = Arg.getValueType(); + assert(VT != MVT::iPTR && "Legalized args should be concrete"); + Type *Ty = VT.getTypeForEVT(*DAG.getContext()); + unsigned Offset = + CCInfo.AllocateStack(MF.getDataLayout().getTypeAllocSize(Ty), + MF.getDataLayout().getABITypeAlignment(Ty)); + CCInfo.addLoc(CCValAssign::getMem(ArgLocs.size(), VT.getSimpleVT(), + Offset, VT.getSimpleVT(), + CCValAssign::Full)); + } + } + + unsigned NumBytes = CCInfo.getAlignedCallFrameSize(); + auto NB = DAG.getConstant(NumBytes, DL, PtrVT, true); Chain = DAG.getCALLSEQ_START(Chain, NB, DL); + if (IsVarArg) { + // For non-fixed arguments, next emit stores to store the argument values + // to the stack at the offsets computed above. 
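// For illustration (assuming the usual wasm32 layout: i32 is 4 bytes with
// 4-byte alignment, f64 is 8 bytes with 8-byte alignment), a call such as
// printf("%d %f", i, d) has two non-fixed arguments; the AllocateStack loop
// above assigns them offsets 0 (the i32) and 8 (the f64, rounded up for
// alignment), and the loop below then stores them at SP+0 and SP+8 within
// the NumBytes-sized call frame.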
+ SDValue SP = DAG.getCopyFromReg( + Chain, DL, getStackPointerRegisterToSaveRestore(), PtrVT); + unsigned ValNo = 0; + SmallVector Chains; + for (SDValue Arg : + make_range(OutVals.begin() + NumFixedArgs, OutVals.end())) { + assert(ArgLocs[ValNo].getValNo() == ValNo && + "ArgLocs should remain in order and only hold varargs args"); + unsigned Offset = ArgLocs[ValNo++].getLocMemOffset(); + SDValue Add = DAG.getNode(ISD::ADD, DL, PtrVT, SP, + DAG.getConstant(Offset, DL, PtrVT)); + Chains.push_back(DAG.getStore(Chain, DL, Arg, Add, + MachinePointerInfo::getStack(MF, Offset), + false, false, 0)); + } + if (!Chains.empty()) + Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); + } + + // Compute the operands for the CALLn node. SmallVector Ops; Ops.push_back(Chain); Ops.push_back(Callee); - Ops.append(OutVals.begin(), OutVals.end()); + + // Add all fixed arguments. Note that for non-varargs calls, NumFixedArgs + // isn't reliable. + Ops.append(OutVals.begin(), + IsVarArg ? OutVals.begin() + NumFixedArgs : OutVals.end()); SmallVector Tys; - for (const auto &In : Ins) + for (const auto &In : Ins) { + assert(!In.Flags.isByVal() && "byval is not valid for return values"); + assert(!In.Flags.isNest() && "nest is not valid for return values"); + if (In.Flags.isInAlloca()) + fail(DL, DAG, "WebAssembly hasn't implemented inalloca return values"); + if (In.Flags.isInConsecutiveRegs()) + fail(DL, DAG, "WebAssembly hasn't implemented cons regs return values"); + if (In.Flags.isInConsecutiveRegsLast()) + fail(DL, DAG, "WebAssembly hasn't implemented cons regs last return values"); + // Ignore In.getOrigAlign() because all our arguments are passed in + // registers. Tys.push_back(In.VT); + } Tys.push_back(MVT::Other); SDVTList TyList = DAG.getVTList(Tys); SDValue Res = @@ -323,7 +420,8 @@ WebAssemblyTargetLowering::LowerCall(CallLoweringInfo &CLI, Chain = Res.getValue(1); } - Chain = DAG.getCALLSEQ_END(Chain, NB, Zero, SDValue(), DL); + SDValue Unused = DAG.getUNDEF(PtrVT); + Chain = DAG.getCALLSEQ_END(Chain, NB, Unused, SDValue(), DL); return Chain; } @@ -337,15 +435,13 @@ bool WebAssemblyTargetLowering::CanLowerReturn( } SDValue WebAssemblyTargetLowering::LowerReturn( - SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, + SDValue Chain, CallingConv::ID CallConv, bool /*IsVarArg*/, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, SDLoc DL, SelectionDAG &DAG) const { assert(Outs.size() <= 1 && "WebAssembly can only return up to one value"); - if (CallConv != CallingConv::C) + if (!CallingConvSupported(CallConv)) fail(DL, DAG, "WebAssembly doesn't support non-C calling conventions"); - if (IsVarArg) - fail(DL, DAG, "WebAssembly doesn't support varargs yet"); SmallVector RetOps(1, Chain); RetOps.append(OutVals.begin(), OutVals.end()); @@ -355,29 +451,26 @@ SDValue WebAssemblyTargetLowering::LowerReturn( for (const ISD::OutputArg &Out : Outs) { assert(!Out.Flags.isByVal() && "byval is not valid for return values"); assert(!Out.Flags.isNest() && "nest is not valid for return values"); + assert(Out.IsFixed && "non-fixed return value is not valid"); if (Out.Flags.isInAlloca()) fail(DL, DAG, "WebAssembly hasn't implemented inalloca results"); if (Out.Flags.isInConsecutiveRegs()) fail(DL, DAG, "WebAssembly hasn't implemented cons regs results"); if (Out.Flags.isInConsecutiveRegsLast()) fail(DL, DAG, "WebAssembly hasn't implemented cons regs last results"); - if (!Out.IsFixed) - fail(DL, DAG, "WebAssembly doesn't support non-fixed results yet"); } return Chain; } SDValue 
WebAssemblyTargetLowering::LowerFormalArguments( - SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, + SDValue Chain, CallingConv::ID CallConv, bool /*IsVarArg*/, const SmallVectorImpl &Ins, SDLoc DL, SelectionDAG &DAG, SmallVectorImpl &InVals) const { MachineFunction &MF = DAG.getMachineFunction(); - if (CallConv != CallingConv::C) + if (!CallingConvSupported(CallConv)) fail(DL, DAG, "WebAssembly doesn't support non-C calling conventions"); - if (IsVarArg) - fail(DL, DAG, "WebAssembly doesn't support varargs yet"); // Set up the incoming ARGUMENTS value, which serves to represent the liveness // of the incoming values before they're represented by virtual registers. @@ -400,12 +493,15 @@ SDValue WebAssemblyTargetLowering::LowerFormalArguments( In.Used ? DAG.getNode(WebAssemblyISD::ARGUMENT, DL, In.VT, DAG.getTargetConstant(InVals.size(), DL, MVT::i32)) - : DAG.getNode(ISD::UNDEF, DL, In.VT)); + : DAG.getUNDEF(In.VT)); // Record the number and types of arguments. MF.getInfo()->addParam(In.VT); } + // Incoming varargs arguments are on the stack and will be accessed through + // va_arg, so we don't need to do anything for them here. + return Chain; } @@ -427,6 +523,8 @@ SDValue WebAssemblyTargetLowering::LowerOperation(SDValue Op, return LowerJumpTable(Op, DAG); case ISD::BR_JT: return LowerBR_JT(Op, DAG); + case ISD::VASTART: + return LowerVASTART(Op, DAG); } } @@ -435,13 +533,12 @@ SDValue WebAssemblyTargetLowering::LowerGlobalAddress(SDValue Op, SDLoc DL(Op); const auto *GA = cast(Op); EVT VT = Op.getValueType(); - assert(GA->getOffset() == 0 && - "offsets on global addresses are forbidden by isOffsetFoldingLegal"); assert(GA->getTargetFlags() == 0 && "WebAssembly doesn't set target flags"); if (GA->getAddressSpace() != 0) fail(DL, DAG, "WebAssembly only expects the 0 address space"); return DAG.getNode(WebAssemblyISD::Wrapper, DL, VT, - DAG.getTargetGlobalAddress(GA->getGlobal(), DL, VT)); + DAG.getTargetGlobalAddress(GA->getGlobal(), DL, VT, + GA->getOffset())); } SDValue @@ -492,6 +589,23 @@ SDValue WebAssemblyTargetLowering::LowerBR_JT(SDValue Op, return DAG.getNode(WebAssemblyISD::TABLESWITCH, DL, MVT::Other, Ops); } +SDValue WebAssemblyTargetLowering::LowerVASTART(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT PtrVT = getPointerTy(DAG.getMachineFunction().getDataLayout()); + + // The incoming non-fixed arguments are placed on the top of the stack, with + // natural alignment, at the point of the call, so the base pointer is just + // the current frame pointer. 
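// For illustration: in a callee such as
//   int sum(int n, ...) { va_list ap; va_start(ap, n); /* ... */ }
// the va_start lowers to a single store of the frame pointer into 'ap',
// because the caller (see LowerCall above) wrote the variadic values at the
// top of its stack at the call site, which is exactly where this frame begins.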
+ DAG.getMachineFunction().getFrameInfo()->setFrameAddressIsTaken(true); + unsigned FP = + Subtarget->getRegisterInfo()->getFrameRegister(DAG.getMachineFunction()); + SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), DL, FP, PtrVT); + const Value *SV = cast(Op.getOperand(2))->getValue(); + return DAG.getStore(Op.getOperand(0), DL, FrameAddr, Op.getOperand(1), + MachinePointerInfo(SV), false, false, 0); +} + //===----------------------------------------------------------------------===// // WebAssembly Optimization Hooks //===----------------------------------------------------------------------===// diff --git a/lib/Target/WebAssembly/WebAssemblyISelLowering.h b/lib/Target/WebAssembly/WebAssemblyISelLowering.h index af5eab671f27..b6b54bb13ea6 100644 --- a/lib/Target/WebAssembly/WebAssemblyISelLowering.h +++ b/lib/Target/WebAssembly/WebAssemblyISelLowering.h @@ -77,6 +77,7 @@ class WebAssemblyTargetLowering final : public TargetLowering { SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const; }; namespace WebAssembly { diff --git a/lib/Target/WebAssembly/WebAssemblyInstrCall.td b/lib/Target/WebAssembly/WebAssemblyInstrCall.td index 4028460bd231..0587c0b6613e 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrCall.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrCall.td @@ -12,20 +12,22 @@ /// //===----------------------------------------------------------------------===// +// TODO: addr64: These currently assume the callee address is 32-bit. + let Defs = [ARGUMENTS] in { -// The call sequence start/end LLVM-isms isn't useful to WebAssembly since it's -// a virtual ISA. +// Call sequence markers. These have an immediate which represents the amount of +// stack space to allocate or free, which is used for varargs lowering. let isCodeGenOnly = 1 in { -def : I<(outs), (ins i64imm:$amt), - [(WebAssemblycallseq_start timm:$amt)]>; -def : I<(outs), (ins i64imm:$amt1, i64imm:$amt2), - [(WebAssemblycallseq_end timm:$amt1, timm:$amt2)]>; +def ADJCALLSTACKDOWN : I<(outs), (ins i32imm:$amt), + [(WebAssemblycallseq_start timm:$amt)]>; +def ADJCALLSTACKUP : I<(outs), (ins i32imm:$amt), + [(WebAssemblycallseq_end timm:$amt, undef)]>; } // isCodeGenOnly = 1 multiclass CALL { - def CALL_#vt : I<(outs vt:$dst), (ins global:$callee, variable_ops), - [(set vt:$dst, (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee)))], + def CALL_#vt : I<(outs vt:$dst), (ins i32imm:$callee, variable_ops), + [(set vt:$dst, (WebAssemblycall1 (i32 imm:$callee)))], "call \t$dst, $callee">; def CALL_INDIRECT_#vt : I<(outs vt:$dst), (ins I32:$callee, variable_ops), [(set vt:$dst, (WebAssemblycall1 I32:$callee))], @@ -37,8 +39,8 @@ let Uses = [SP32, SP64], isCall = 1 in { defm : CALL; defm : CALL; - def CALL_VOID : I<(outs), (ins global:$callee, variable_ops), - [(WebAssemblycall0 (WebAssemblywrapper tglobaladdr:$callee))], + def CALL_VOID : I<(outs), (ins i32imm:$callee, variable_ops), + [(WebAssemblycall0 (i32 imm:$callee))], "call \t$callee">; def CALL_INDIRECT_VOID : I<(outs), (ins I32:$callee, variable_ops), [(WebAssemblycall0 I32:$callee)], @@ -46,3 +48,27 @@ let Uses = [SP32, SP64], isCall = 1 in { } // Uses = [SP32,SP64], isCall = 1 } // Defs = [ARGUMENTS] + +// Patterns for matching a direct call to a global address. 
+def : Pat<(i32 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))), + (CALL_I32 tglobaladdr:$callee)>; +def : Pat<(i64 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))), + (CALL_I64 tglobaladdr:$callee)>; +def : Pat<(f32 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))), + (CALL_F32 tglobaladdr:$callee)>; +def : Pat<(f64 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))), + (CALL_F64 tglobaladdr:$callee)>; +def : Pat<(WebAssemblycall0 (WebAssemblywrapper tglobaladdr:$callee)), + (CALL_VOID tglobaladdr:$callee)>; + +// Patterns for matching a direct call to an external symbol. +def : Pat<(i32 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))), + (CALL_I32 texternalsym:$callee)>; +def : Pat<(i64 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))), + (CALL_I64 texternalsym:$callee)>; +def : Pat<(f32 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))), + (CALL_F32 texternalsym:$callee)>; +def : Pat<(f64 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))), + (CALL_F64 texternalsym:$callee)>; +def : Pat<(WebAssemblycall0 (WebAssemblywrapper texternalsym:$callee)), + (CALL_VOID texternalsym:$callee)>; diff --git a/lib/Target/WebAssembly/WebAssemblyInstrControl.td b/lib/Target/WebAssembly/WebAssemblyInstrControl.td index 840f7d669314..9a9468bb3909 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrControl.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrControl.td @@ -15,9 +15,13 @@ let Defs = [ARGUMENTS] in { let isBranch = 1, isTerminator = 1, hasCtrlDep = 1 in { -def BR_IF : I<(outs), (ins I32:$a, bb_op:$dst), - [(brcond I32:$a, bb:$dst)], - "br_if \t$a, $dst">; +// The condition operand is a boolean value which WebAssembly represents as i32. +def BR_IF : I<(outs), (ins I32:$cond, bb_op:$dst), + [(brcond I32:$cond, bb:$dst)], + "br_if \t$cond, $dst">; +let isCodeGenOnly = 1 in +def BR_UNLESS : I<(outs), (ins I32:$cond, bb_op:$dst), [], + "br_unless\t$cond, $dst">; let isBarrier = 1 in { def BR : I<(outs), (ins bb_op:$dst), [(br bb:$dst)], @@ -25,16 +29,25 @@ def BR : I<(outs), (ins bb_op:$dst), } // isBarrier = 1 } // isBranch = 1, isTerminator = 1, hasCtrlDep = 1 +} // Defs = [ARGUMENTS] + +def : Pat<(brcond (i32 (setne I32:$cond, 0)), bb:$dst), + (BR_IF I32:$cond, bb_op:$dst)>; +def : Pat<(brcond (i32 (seteq I32:$cond, 0)), bb:$dst), + (BR_UNLESS I32:$cond, bb_op:$dst)>; + +let Defs = [ARGUMENTS] in { + // TODO: SelectionDAG's lowering insists on using a pointer as the index for // jump tables, so in practice we don't ever use TABLESWITCH_I64 in wasm32 mode // currently. let isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 in { -def TABLESWITCH_I32 : I<(outs), (ins I32:$index, variable_ops), - [(WebAssemblytableswitch I32:$index)], - "tableswitch\t$index">; -def TABLESWITCH_I64 : I<(outs), (ins I64:$index, variable_ops), - [(WebAssemblytableswitch I64:$index)], - "tableswitch\t$index">; +def TABLESWITCH_I32 : I<(outs), (ins I32:$index, bb_op:$default, variable_ops), + [(WebAssemblytableswitch I32:$index, bb:$default)], + "tableswitch\t$index, $default">; +def TABLESWITCH_I64 : I<(outs), (ins I64:$index, bb_op:$default, variable_ops), + [(WebAssemblytableswitch I64:$index, bb:$default)], + "tableswitch\t$index, $default">; } // isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 // Placemarkers to indicate the start of a block or loop scope. 
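For illustration, the BR_IF/BR_UNLESS selection patterns above amount to:
  // brcond (setne %c, 0), %bb  ->  BR_IF %c, %bb
  // brcond (seteq %c, 0), %bb  ->  BR_UNLESS %c, %bb
so a branch on "x == 0" doesn't have to materialize an inverted comparison at
selection time; BR_UNLESS itself is rewritten into BR_IF by the LowerBrUnless
pass added later in this patch.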
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrConv.td b/lib/Target/WebAssembly/WebAssemblyInstrConv.td index 4926b8fd8725..931f4a913d0f 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrConv.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrConv.td @@ -26,6 +26,15 @@ def I64_EXTEND_U_I32 : I<(outs I64:$dst), (ins I32:$src), [(set I64:$dst, (zext I32:$src))], "i64.extend_u/i32\t$dst, $src">; +} // defs = [ARGUMENTS] + +// Expand a "don't care" extend into zero-extend (chosen over sign-extend +// somewhat arbitrarily, although it favors popular hardware architectures +// and is conceptually a simpler operation). +def : Pat<(i64 (anyext I32:$src)), (I64_EXTEND_U_I32 I32:$src)>; + +let Defs = [ARGUMENTS] in { + // Conversion from floating point to integer traps on overflow and invalid. let hasSideEffects = 1 in { def I32_TRUNC_S_F32 : I<(outs I32:$dst), (ins F32:$src), diff --git a/lib/Target/WebAssembly/WebAssemblyInstrFloat.td b/lib/Target/WebAssembly/WebAssemblyInstrFloat.td index d966380e6b0c..5520c6de6732 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrFloat.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrFloat.td @@ -38,6 +38,12 @@ defm NEAREST : UnaryFP; } // Defs = [ARGUMENTS] +// DAGCombine oddly folds casts into the rhs of copysign. Unfold them. +def : Pat<(fcopysign F64:$lhs, F32:$rhs), + (COPYSIGN_F64 F64:$lhs, (F64_PROMOTE_F32 F32:$rhs))>; +def : Pat<(fcopysign F32:$lhs, F64:$rhs), + (COPYSIGN_F32 F32:$lhs, (F32_DEMOTE_F64 F64:$rhs))>; + // WebAssembly doesn't expose inexact exceptions, so map frint to fnearbyint. def : Pat<(frint f32:$src), (NEAREST_F32 f32:$src)>; def : Pat<(frint f64:$src), (NEAREST_F64 f64:$src)>; diff --git a/lib/Target/WebAssembly/WebAssemblyInstrFormats.td b/lib/Target/WebAssembly/WebAssemblyInstrFormats.td index 62a108c0d400..8008dd32353a 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrFormats.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrFormats.td @@ -1,4 +1,4 @@ -// WebAssemblyInstrFormats.td - WebAssembly Instruction Formats -*- tblgen -*-// +//=- WebAssemblyInstrFormats.td - WebAssembly Instr. 
Formats -*- tablegen -*-=// // // The LLVM Compiler Infrastructure // diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp b/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp index 82296b3cdace..3b219f4a901a 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp +++ b/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp @@ -28,7 +28,9 @@ using namespace llvm; #include "WebAssemblyGenInstrInfo.inc" WebAssemblyInstrInfo::WebAssemblyInstrInfo(const WebAssemblySubtarget &STI) - : RI(STI.getTargetTriple()) {} + : WebAssemblyGenInstrInfo(WebAssembly::ADJCALLSTACKDOWN, + WebAssembly::ADJCALLSTACKUP), + RI(STI.getTargetTriple()) {} void WebAssemblyInstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, @@ -69,6 +71,15 @@ bool WebAssemblyInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, case WebAssembly::BR_IF: if (HaveCond) return true; + Cond.push_back(MachineOperand::CreateImm(true)); + Cond.push_back(MI.getOperand(0)); + TBB = MI.getOperand(1).getMBB(); + HaveCond = true; + break; + case WebAssembly::BR_UNLESS: + if (HaveCond) + return true; + Cond.push_back(MachineOperand::CreateImm(false)); Cond.push_back(MI.getOperand(0)); TBB = MI.getOperand(1).getMBB(); HaveCond = true; @@ -111,8 +122,6 @@ unsigned WebAssemblyInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *FBB, ArrayRef Cond, DebugLoc DL) const { - assert(Cond.size() <= 1); - if (Cond.empty()) { if (!TBB) return 0; @@ -121,7 +130,17 @@ unsigned WebAssemblyInstrInfo::InsertBranch(MachineBasicBlock &MBB, return 1; } - BuildMI(&MBB, DL, get(WebAssembly::BR_IF)).addOperand(Cond[0]).addMBB(TBB); + assert(Cond.size() == 2 && "Expected a flag and a successor block"); + + if (Cond[0].getImm()) { + BuildMI(&MBB, DL, get(WebAssembly::BR_IF)) + .addOperand(Cond[1]) + .addMBB(TBB); + } else { + BuildMI(&MBB, DL, get(WebAssembly::BR_UNLESS)) + .addOperand(Cond[1]) + .addMBB(TBB); + } if (!FBB) return 1; @@ -131,10 +150,7 @@ unsigned WebAssemblyInstrInfo::InsertBranch(MachineBasicBlock &MBB, bool WebAssemblyInstrInfo::ReverseBranchCondition( SmallVectorImpl &Cond) const { - assert(Cond.size() == 1); - - // TODO: Add branch reversal here... And re-enable MachineBlockPlacementID - // when we do. - - return true; + assert(Cond.size() == 2 && "Expected a flag and a successor block"); + Cond.front() = MachineOperand::CreateImm(!Cond.front().getImm()); + return false; } diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td index caffac1bc52b..5cf8664eba8d 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td @@ -67,8 +67,6 @@ def WebAssemblywrapper : SDNode<"WebAssemblyISD::Wrapper", //===----------------------------------------------------------------------===// def bb_op : Operand; -def tjumptable_op : Operand; -def global : Operand; //===----------------------------------------------------------------------===// // WebAssembly Instruction Format Definitions. @@ -136,13 +134,11 @@ def : Pat<(i32 (WebAssemblywrapper tglobaladdr:$dst)), (CONST_I32 tglobaladdr:$dst)>; def : Pat<(i32 (WebAssemblywrapper texternalsym:$dst)), (CONST_I32 texternalsym:$dst)>; +def : Pat<(i32 (WebAssemblywrapper tjumptable:$dst)), + (CONST_I32 tjumptable:$dst)>; let Defs = [ARGUMENTS] in { -def JUMP_TABLE : I<(outs I32:$dst), (ins tjumptable_op:$addr), - [(set I32:$dst, (WebAssemblywrapper tjumptable:$addr))], - "jump_table\t$dst, $addr">; - // Function signature and local variable declaration "instructions". 
def PARAM : I<(outs), (ins variable_ops), [], ".param \t">; def RESULT : I<(outs), (ins variable_ops), [], ".result \t">; @@ -158,7 +154,7 @@ include "WebAssemblyInstrMemory.td" include "WebAssemblyInstrCall.td" include "WebAssemblyInstrControl.td" include "WebAssemblyInstrInteger.td" -include "WebAssemblyInstrFloat.td" include "WebAssemblyInstrConv.td" +include "WebAssemblyInstrFloat.td" include "WebAssemblyInstrAtomics.td" include "WebAssemblyInstrSIMD.td" diff --git a/lib/Target/WebAssembly/WebAssemblyInstrMemory.td b/lib/Target/WebAssembly/WebAssemblyInstrMemory.td index 700a196fa29c..fbb3df2f7b82 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrMemory.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrMemory.td @@ -25,59 +25,63 @@ let Defs = [ARGUMENTS] in { // Basic load. -def LOAD_I32 : I<(outs I32:$dst), (ins I32:$addr), - [(set I32:$dst, (load I32:$addr))], - "i32.load\t$dst, $addr">; -def LOAD_I64 : I<(outs I64:$dst), (ins I32:$addr), - [(set I64:$dst, (load I32:$addr))], - "i64.load\t$dst, $addr">; -def LOAD_F32 : I<(outs F32:$dst), (ins I32:$addr), - [(set F32:$dst, (load I32:$addr))], - "f32.load\t$dst, $addr">; -def LOAD_F64 : I<(outs F64:$dst), (ins I32:$addr), - [(set F64:$dst, (load I32:$addr))], - "f64.load\t$dst, $addr">; +def LOAD_I32 : I<(outs I32:$dst), (ins I32:$off, I32:$addr), [], + "i32.load\t$dst, $off($addr)">; +def LOAD_I64 : I<(outs I64:$dst), (ins I32:$off, I32:$addr), [], + "i64.load\t$dst, $off($addr)">; +def LOAD_F32 : I<(outs F32:$dst), (ins I32:$off, I32:$addr), [], + "f32.load\t$dst, $off($addr)">; +def LOAD_F64 : I<(outs F64:$dst), (ins I32:$off, I32:$addr), [], + "f64.load\t$dst, $off($addr)">; // Extending load. -def LOAD8_S_I32 : I<(outs I32:$dst), (ins I32:$addr), - [(set I32:$dst, (sextloadi8 I32:$addr))], - "i32.load8_s\t$dst, $addr">; -def LOAD8_U_I32 : I<(outs I32:$dst), (ins I32:$addr), - [(set I32:$dst, (zextloadi8 I32:$addr))], - "i32.load8_u\t$dst, $addr">; -def LOAD16_S_I32 : I<(outs I32:$dst), (ins I32:$addr), - [(set I32:$dst, (sextloadi16 I32:$addr))], - "i32.load16_s\t$dst, $addr">; -def LOAD16_U_I32 : I<(outs I32:$dst), (ins I32:$addr), - [(set I32:$dst, (zextloadi16 I32:$addr))], - "i32.load16_u\t$dst, $addr">; -def LOAD8_S_I64 : I<(outs I64:$dst), (ins I32:$addr), - [(set I64:$dst, (sextloadi8 I32:$addr))], - "i64.load8_s\t$dst, $addr">; -def LOAD8_U_I64 : I<(outs I64:$dst), (ins I32:$addr), - [(set I64:$dst, (zextloadi8 I32:$addr))], - "i64.load8_u\t$dst, $addr">; -def LOAD16_S_I64 : I<(outs I64:$dst), (ins I32:$addr), - [(set I64:$dst, (sextloadi16 I32:$addr))], - "i64.load16_s\t$dst, $addr">; -def LOAD16_U_I64 : I<(outs I64:$dst), (ins I32:$addr), - [(set I64:$dst, (zextloadi16 I32:$addr))], - "i64.load16_u\t$dst, $addr">; -def LOAD32_S_I64 : I<(outs I64:$dst), (ins I32:$addr), - [(set I64:$dst, (sextloadi32 I32:$addr))], - "i64.load32_s\t$dst, $addr">; -def LOAD32_U_I64 : I<(outs I64:$dst), (ins I32:$addr), - [(set I64:$dst, (zextloadi32 I32:$addr))], - "i64.load32_u\t$dst, $addr">; +def LOAD8_S_I32 : I<(outs I32:$dst), (ins I32:$off, I32:$addr), [], + "i32.load8_s\t$dst, $off($addr)">; +def LOAD8_U_I32 : I<(outs I32:$dst), (ins I32:$off, I32:$addr), [], + "i32.load8_u\t$dst, $off($addr)">; +def LOAD16_S_I32 : I<(outs I32:$dst), (ins I32:$off, I32:$addr), [], + "i32.load16_s\t$dst, $off($addr)">; +def LOAD16_U_I32 : I<(outs I32:$dst), (ins I32:$off, I32:$addr), [], + "i32.load16_u\t$dst, $off($addr)">; +def LOAD8_S_I64 : I<(outs I64:$dst), (ins I32:$off, I32:$addr), [], + "i64.load8_s\t$dst, $off($addr)">; +def LOAD8_U_I64 : 
I<(outs I64:$dst), (ins I32:$off, I32:$addr), [], + "i64.load8_u\t$dst, $off($addr)">; +def LOAD16_S_I64 : I<(outs I64:$dst), (ins I32:$off, I32:$addr), [], + "i64.load16_s\t$dst, $off($addr)">; +def LOAD16_U_I64 : I<(outs I64:$dst), (ins I32:$off, I32:$addr), [], + "i64.load16_u\t$dst, $off($addr)">; +def LOAD32_S_I64 : I<(outs I64:$dst), (ins I32:$off, I32:$addr), [], + "i64.load32_s\t$dst, $off($addr)">; +def LOAD32_U_I64 : I<(outs I64:$dst), (ins I32:$off, I32:$addr), [], + "i64.load32_u\t$dst, $off($addr)">; } // Defs = [ARGUMENTS] +// Select loads with no constant offset. +def : Pat<(i32 (load I32:$addr)), (LOAD_I32 0, $addr)>; +def : Pat<(i64 (load I32:$addr)), (LOAD_I64 0, $addr)>; +def : Pat<(f32 (load I32:$addr)), (LOAD_F32 0, $addr)>; +def : Pat<(f64 (load I32:$addr)), (LOAD_F64 0, $addr)>; + +// Select extending loads with no constant offset. +def : Pat<(i32 (sextloadi8 I32:$addr)), (LOAD8_S_I32 0, $addr)>; +def : Pat<(i32 (zextloadi8 I32:$addr)), (LOAD8_U_I32 0, $addr)>; +def : Pat<(i32 (sextloadi16 I32:$addr)), (LOAD16_S_I32 0, $addr)>; +def : Pat<(i32 (zextloadi16 I32:$addr)), (LOAD16_U_I32 0, $addr)>; +def : Pat<(i64 (sextloadi8 I32:$addr)), (LOAD8_S_I64 0, $addr)>; +def : Pat<(i64 (zextloadi8 I32:$addr)), (LOAD8_U_I64 0, $addr)>; +def : Pat<(i64 (sextloadi16 I32:$addr)), (LOAD16_S_I64 0, $addr)>; +def : Pat<(i64 (zextloadi16 I32:$addr)), (LOAD16_U_I64 0, $addr)>; +def : Pat<(i64 (sextloadi32 I32:$addr)), (LOAD32_S_I64 0, $addr)>; +def : Pat<(i64 (zextloadi32 I32:$addr)), (LOAD32_U_I64 0, $addr)>; + // "Don't care" extending load become zero-extending load. -def : Pat<(i32 (extloadi8 I32:$addr)), (LOAD8_U_I32 $addr)>; -def : Pat<(i32 (extloadi16 I32:$addr)), (LOAD16_U_I32 $addr)>; -def : Pat<(i64 (extloadi8 I32:$addr)), (LOAD8_U_I64 $addr)>; -def : Pat<(i64 (extloadi16 I32:$addr)), (LOAD16_U_I64 $addr)>; -def : Pat<(i64 (extloadi32 I32:$addr)), (LOAD32_U_I64 $addr)>; +def : Pat<(i32 (extloadi8 I32:$addr)), (LOAD8_U_I32 0, $addr)>; +def : Pat<(i32 (extloadi16 I32:$addr)), (LOAD16_U_I32 0, $addr)>; +def : Pat<(i64 (extloadi8 I32:$addr)), (LOAD8_U_I64 0, $addr)>; +def : Pat<(i64 (extloadi16 I32:$addr)), (LOAD16_U_I64 0, $addr)>; +def : Pat<(i64 (extloadi32 I32:$addr)), (LOAD32_U_I64 0, $addr)>; let Defs = [ARGUMENTS] in { @@ -87,48 +91,55 @@ let Defs = [ARGUMENTS] in { // instruction definition patterns that don't reference all of the output // operands. // Note: WebAssembly inverts SelectionDAG's usual operand order. 
-def STORE_I32 : I<(outs I32:$dst), (ins I32:$addr, I32:$val), [], - "i32.store\t$dst, $addr, $val">; -def STORE_I64 : I<(outs I64:$dst), (ins I32:$addr, I64:$val), [], - "i64.store\t$dst, $addr, $val">; -def STORE_F32 : I<(outs F32:$dst), (ins I32:$addr, F32:$val), [], - "f32.store\t$dst, $addr, $val">; -def STORE_F64 : I<(outs F64:$dst), (ins I32:$addr, F64:$val), [], - "f64.store\t$dst, $addr, $val">; +def STORE_I32 : I<(outs I32:$dst), (ins I32:$off, I32:$addr, I32:$val), [], + "i32.store\t$dst, $off($addr), $val">; +def STORE_I64 : I<(outs I64:$dst), (ins I32:$off, I32:$addr, I64:$val), [], + "i64.store\t$dst, $off($addr), $val">; +def STORE_F32 : I<(outs F32:$dst), (ins I32:$off, I32:$addr, F32:$val), [], + "f32.store\t$dst, $off($addr), $val">; +def STORE_F64 : I<(outs F64:$dst), (ins I32:$off, I32:$addr, F64:$val), [], + "f64.store\t$dst, $off($addr), $val">; } // Defs = [ARGUMENTS] -def : Pat<(store I32:$val, I32:$addr), (STORE_I32 I32:$addr, I32:$val)>; -def : Pat<(store I64:$val, I32:$addr), (STORE_I64 I32:$addr, I64:$val)>; -def : Pat<(store F32:$val, I32:$addr), (STORE_F32 I32:$addr, F32:$val)>; -def : Pat<(store F64:$val, I32:$addr), (STORE_F64 I32:$addr, F64:$val)>; +def : Pat<(store I32:$val, I32:$addr), (STORE_I32 0, I32:$addr, I32:$val)>; +def : Pat<(store I64:$val, I32:$addr), (STORE_I64 0, I32:$addr, I64:$val)>; +def : Pat<(store F32:$val, I32:$addr), (STORE_F32 0, I32:$addr, F32:$val)>; +def : Pat<(store F64:$val, I32:$addr), (STORE_F64 0, I32:$addr, F64:$val)>; + +// FIXME: This pattern matches an immediate to actually use the offset field +// in the store instruction; however only unsigned offsets are supported in +// wasm, so we need to constrain the immediate we match. This may require +// custom code rather than a simple pattern. +// def : Pat<(store I32:$val, (add I32:$addr, (i32 imm:$off))), +// (STORE_I32 imm:$off, I32:$addr, I32:$val)>; let Defs = [ARGUMENTS] in { // Truncating store. 
-def STORE8_I32 : I<(outs I32:$dst), (ins I32:$addr, I32:$val), [], - "i32.store8\t$dst, $addr, $val">; -def STORE16_I32 : I<(outs I32:$dst), (ins I32:$addr, I32:$val), [], - "i32.store16\t$dst, $addr, $val">; -def STORE8_I64 : I<(outs I64:$dst), (ins I32:$addr, I64:$val), [], - "i64.store8\t$dst, $addr, $val">; -def STORE16_I64 : I<(outs I64:$dst), (ins I32:$addr, I64:$val), [], - "i64.store16\t$dst, $addr, $val">; -def STORE32_I64 : I<(outs I64:$dst), (ins I32:$addr, I64:$val), [], - "i64.store32\t$dst, $addr, $val">; +def STORE8_I32 : I<(outs I32:$dst), (ins I32:$off, I32:$addr, I32:$val), [], + "i32.store8\t$dst, $off($addr), $val">; +def STORE16_I32 : I<(outs I32:$dst), (ins I32:$off, I32:$addr, I32:$val), [], + "i32.store16\t$dst, $off($addr), $val">; +def STORE8_I64 : I<(outs I64:$dst), (ins I32:$off, I32:$addr, I64:$val), [], + "i64.store8\t$dst, $off($addr), $val">; +def STORE16_I64 : I<(outs I64:$dst), (ins I32:$off, I32:$addr, I64:$val), [], + "i64.store16\t$dst, $off($addr), $val">; +def STORE32_I64 : I<(outs I64:$dst), (ins I32:$off, I32:$addr, I64:$val), [], + "i64.store32\t$dst, $off($addr), $val">; } // Defs = [ARGUMENTS] def : Pat<(truncstorei8 I32:$val, I32:$addr), - (STORE8_I32 I32:$addr, I32:$val)>; + (STORE8_I32 0, I32:$addr, I32:$val)>; def : Pat<(truncstorei16 I32:$val, I32:$addr), - (STORE16_I32 I32:$addr, I32:$val)>; + (STORE16_I32 0, I32:$addr, I32:$val)>; def : Pat<(truncstorei8 I64:$val, I32:$addr), - (STORE8_I64 I32:$addr, I64:$val)>; + (STORE8_I64 0, I32:$addr, I64:$val)>; def : Pat<(truncstorei16 I64:$val, I32:$addr), - (STORE16_I64 I32:$addr, I64:$val)>; + (STORE16_I64 0, I32:$addr, I64:$val)>; def : Pat<(truncstorei32 I64:$val, I32:$addr), - (STORE32_I64 I32:$addr, I64:$val)>; + (STORE32_I64 0, I32:$addr, I64:$val)>; let Defs = [ARGUMENTS] in { diff --git a/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp b/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp new file mode 100644 index 000000000000..846f6eb1e5cf --- /dev/null +++ b/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp @@ -0,0 +1,133 @@ +//===-- WebAssemblyLowerBrUnless.cpp - Lower br_unless --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file lowers br_unless into br_if with an inverted condition. +/// +/// br_unless is not currently in the spec, but it's very convenient for LLVM +/// to use. This pass allows LLVM to use it, for now. 
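/// For example (illustrative): when the BR_UNLESS condition is a stackified
/// register whose defining instruction is one of the comparisons, that
/// compare is flipped in place (e.g. i32.lt_s becomes i32.ge_s) and the
/// branch becomes a BR_IF; otherwise an explicit equality test against zero
/// is inserted and the BR_IF branches on its result.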
+/// +//===----------------------------------------------------------------------===// + +#include "WebAssembly.h" +#include "WebAssemblyMachineFunctionInfo.h" +#include "WebAssemblySubtarget.h" +#include "MCTargetDesc/WebAssemblyMCTargetDesc.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +#define DEBUG_TYPE "wasm-lower-br_unless" + +namespace { +class WebAssemblyLowerBrUnless final : public MachineFunctionPass { + const char *getPassName() const override { + return "WebAssembly Lower br_unless"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + +public: + static char ID; // Pass identification, replacement for typeid + WebAssemblyLowerBrUnless() : MachineFunctionPass(ID) {} +}; +} // end anonymous namespace + +char WebAssemblyLowerBrUnless::ID = 0; +FunctionPass *llvm::createWebAssemblyLowerBrUnless() { + return new WebAssemblyLowerBrUnless(); +} + +bool WebAssemblyLowerBrUnless::runOnMachineFunction(MachineFunction &MF) { + DEBUG(dbgs() << "********** Lowering br_unless **********\n" + "********** Function: " + << MF.getName() << '\n'); + + auto &MFI = *MF.getInfo(); + const auto &TII = *MF.getSubtarget().getInstrInfo(); + auto &MRI = MF.getRegInfo(); + + for (auto &MBB : MF) { + for (auto MII = MBB.begin(); MII != MBB.end(); ) { + MachineInstr *MI = &*MII++; + if (MI->getOpcode() != WebAssembly::BR_UNLESS) + continue; + + unsigned Cond = MI->getOperand(0).getReg(); + bool Inverted = false; + + // Attempt to invert the condition in place. + if (MFI.isVRegStackified(Cond)) { + assert(MRI.hasOneDef(Cond)); + MachineInstr *Def = MRI.getVRegDef(Cond); + switch (Def->getOpcode()) { + using namespace WebAssembly; + case EQ_I32: Def->setDesc(TII.get(NE_I32)); Inverted = true; break; + case NE_I32: Def->setDesc(TII.get(EQ_I32)); Inverted = true; break; + case GT_S_I32: Def->setDesc(TII.get(LE_S_I32)); Inverted = true; break; + case GE_S_I32: Def->setDesc(TII.get(LT_S_I32)); Inverted = true; break; + case LT_S_I32: Def->setDesc(TII.get(GE_S_I32)); Inverted = true; break; + case LE_S_I32: Def->setDesc(TII.get(GT_S_I32)); Inverted = true; break; + case GT_U_I32: Def->setDesc(TII.get(LE_U_I32)); Inverted = true; break; + case GE_U_I32: Def->setDesc(TII.get(LT_U_I32)); Inverted = true; break; + case LT_U_I32: Def->setDesc(TII.get(GE_U_I32)); Inverted = true; break; + case LE_U_I32: Def->setDesc(TII.get(GT_U_I32)); Inverted = true; break; + case EQ_I64: Def->setDesc(TII.get(NE_I64)); Inverted = true; break; + case NE_I64: Def->setDesc(TII.get(EQ_I64)); Inverted = true; break; + case GT_S_I64: Def->setDesc(TII.get(LE_S_I64)); Inverted = true; break; + case GE_S_I64: Def->setDesc(TII.get(LT_S_I64)); Inverted = true; break; + case LT_S_I64: Def->setDesc(TII.get(GE_S_I64)); Inverted = true; break; + case LE_S_I64: Def->setDesc(TII.get(GT_S_I64)); Inverted = true; break; + case GT_U_I64: Def->setDesc(TII.get(LE_U_I64)); Inverted = true; break; + case GE_U_I64: Def->setDesc(TII.get(LT_U_I64)); Inverted = true; break; + case LT_U_I64: Def->setDesc(TII.get(GE_U_I64)); Inverted = true; break; + case LE_U_I64: Def->setDesc(TII.get(GT_U_I64)); Inverted = true; break; + case EQ_F32: Def->setDesc(TII.get(NE_F32)); Inverted = true; break; + case NE_F32: Def->setDesc(TII.get(EQ_F32)); Inverted = true; break; + case EQ_F64: 
Def->setDesc(TII.get(NE_F64)); Inverted = true; break; + case NE_F64: Def->setDesc(TII.get(EQ_F64)); Inverted = true; break; + default: break; + } + } + + // If we weren't able to invert the condition in place. Insert an + // expression to invert it. + if (!Inverted) { + unsigned ZeroReg = MRI.createVirtualRegister(&WebAssembly::I32RegClass); + MFI.stackifyVReg(ZeroReg); + BuildMI(MBB, MI, MI->getDebugLoc(), TII.get(WebAssembly::CONST_I32), ZeroReg) + .addImm(0); + unsigned Tmp = MRI.createVirtualRegister(&WebAssembly::I32RegClass); + MFI.stackifyVReg(Tmp); + BuildMI(MBB, MI, MI->getDebugLoc(), TII.get(WebAssembly::EQ_I32), Tmp) + .addReg(Cond) + .addReg(ZeroReg); + Cond = Tmp; + Inverted = true; + } + + // The br_unless condition has now been inverted. Insert a br_if and + // delete the br_unless. + assert(Inverted); + BuildMI(MBB, MI, MI->getDebugLoc(), TII.get(WebAssembly::BR_IF)) + .addReg(Cond) + .addMBB(MI->getOperand(1).getMBB()); + MBB.erase(MI); + } + } + + return true; +} diff --git a/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp b/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp index dd9358035a88..2d2adeb93d2d 100644 --- a/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp +++ b/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp @@ -39,11 +39,16 @@ MCSymbol *WebAssemblyMCInstLower::GetExternalSymbolSymbol( MCOperand WebAssemblyMCInstLower::LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const { + assert(MO.getTargetFlags() == 0 && "WebAssembly does not use target flags"); const MCExpr *Expr = MCSymbolRefExpr::create(Sym, Ctx); - if (!MO.isJTI() && MO.getOffset()) - llvm_unreachable("unknown symbol op"); + int64_t Offset = MO.getOffset(); + if (Offset != 0) { + assert(!MO.isJTI() && "Unexpected offset with jump table index"); + Expr = + MCBinaryExpr::createAdd(Expr, MCConstantExpr::create(Offset, Ctx), Ctx); + } return MCOperand::createExpr(Expr); } diff --git a/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h b/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h index 62c5f33cfad7..af4dabb2c6c3 100644 --- a/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h +++ b/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h @@ -16,8 +16,6 @@ #ifndef LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYMACHINEFUNCTIONINFO_H #define LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYMACHINEFUNCTIONINFO_H -#include "WebAssemblyRegisterInfo.h" -#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineRegisterInfo.h" namespace llvm { diff --git a/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp b/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp index dea419c5975c..4dc401a2c7cc 100644 --- a/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp +++ b/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp @@ -57,6 +57,9 @@ void OptimizeReturned::visitCallSite(CallSite CS) { if (CS.paramHasAttr(1 + i, Attribute::Returned)) { Instruction *Inst = CS.getInstruction(); Value *Arg = CS.getArgOperand(i); + // Ignore constants, globals, undef, etc. + if (isa(Arg)) + continue; // Like replaceDominatedUsesWith but using Instruction/Use dominance. 
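// For illustration: given "%r = call i8* @f(i8* returned %p)", every use of
// %p that is dominated by the call may be rewritten to use %r instead; the
// loop below performs that rewrite, and the check above now skips constant
// and global arguments, whose uses aren't candidates for this replacement.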
for (auto UI = Arg->use_begin(), UE = Arg->use_end(); UI != UE;) { Use &U = *UI++; diff --git a/lib/Target/WebAssembly/WebAssemblyPeephole.cpp b/lib/Target/WebAssembly/WebAssemblyPeephole.cpp index e149d9cce719..ab539e1c2870 100644 --- a/lib/Target/WebAssembly/WebAssemblyPeephole.cpp +++ b/lib/Target/WebAssembly/WebAssemblyPeephole.cpp @@ -26,6 +26,11 @@ class WebAssemblyPeephole final : public MachineFunctionPass { return "WebAssembly late peephole optimizer"; } + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } + bool runOnMachineFunction(MachineFunction &MF) override; public: @@ -64,7 +69,8 @@ bool WebAssemblyPeephole::runOnMachineFunction(MachineFunction &MF) { // can use $discard instead. MachineOperand &MO = MI.getOperand(0); unsigned OldReg = MO.getReg(); - if (OldReg == MI.getOperand(2).getReg()) { + if (OldReg == MI.getOperand(3).getReg()) { + Changed = true; unsigned NewReg = MRI.createVirtualRegister(MRI.getRegClass(OldReg)); MO.setReg(NewReg); MO.setIsDead(); diff --git a/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp b/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp index bdccc8577c5e..d890310ac501 100644 --- a/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp +++ b/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp @@ -15,7 +15,7 @@ /// are then marked as "stackified", meaning references to them are replaced by /// "push" and "pop" from the stack. /// -/// This is primarily a code size optimiation, since temporary values on the +/// This is primarily a code size optimization, since temporary values on the /// expression don't need to be named. /// //===----------------------------------------------------------------------===// @@ -61,15 +61,41 @@ FunctionPass *llvm::createWebAssemblyRegStackify() { } // Decorate the given instruction with implicit operands that enforce the -// expression stack ordering constraints. -static void ImposeStackOrdering(MachineInstr *MI) { - // Read and write the opaque EXPR_STACK register. - MI->addOperand(MachineOperand::CreateReg(WebAssembly::EXPR_STACK, - /*isDef=*/true, - /*isImp=*/true)); +// expression stack ordering constraints needed for an instruction which is +// consumed by an instruction using the expression stack. +static void ImposeStackInputOrdering(MachineInstr *MI) { + // Write the opaque EXPR_STACK register. + if (!MI->definesRegister(WebAssembly::EXPR_STACK)) + MI->addOperand(MachineOperand::CreateReg(WebAssembly::EXPR_STACK, + /*isDef=*/true, + /*isImp=*/true)); +} + +// Decorate the given instruction with implicit operands that enforce the +// expression stack ordering constraints for an instruction which is on +// the expression stack. +static void ImposeStackOrdering(MachineInstr *MI, MachineRegisterInfo &MRI) { + ImposeStackInputOrdering(MI); + + // Also read the opaque EXPR_STACK register. MI->addOperand(MachineOperand::CreateReg(WebAssembly::EXPR_STACK, /*isDef=*/false, /*isImp=*/true)); + + // Also, mark any inputs to this instruction as being consumed by an + // instruction on the expression stack. + // TODO: Find a lighter way to describe the appropriate constraints. + for (MachineOperand &MO : MI->uses()) { + if (!MO.isReg()) + continue; + unsigned Reg = MO.getReg(); + if (!TargetRegisterInfo::isVirtualRegister(Reg)) + continue; + MachineInstr *Def = MRI.getVRegDef(Reg); + if (Def->getOpcode() == TargetOpcode::PHI) + continue; + ImposeStackInputOrdering(Def); + } } // Test whether it's safe to move Def to just before Insert. 
Note that this @@ -81,6 +107,7 @@ static void ImposeStackOrdering(MachineInstr *MI) { // more precise. static bool IsSafeToMove(const MachineInstr *Def, const MachineInstr *Insert, AliasAnalysis &AA) { + assert(Def->getParent() == Insert->getParent()); bool SawStore = false, SawSideEffects = false; MachineBasicBlock::const_iterator D(Def), I(Insert); for (--I; I != D; --I) @@ -100,6 +127,8 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) { WebAssemblyFunctionInfo &MFI = *MF.getInfo(); AliasAnalysis &AA = getAnalysis().getAAResults(); + assert(MRI.isSSA() && "RegStackify depends on SSA form"); + // Walk the instructions from the bottom up. Currently we don't look past // block boundaries, and the blocks aren't ordered so the block visitation // order isn't significant, but we may want to change this in the future. @@ -125,8 +154,15 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) { continue; unsigned Reg = Op.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(Reg)) + if (TargetRegisterInfo::isPhysicalRegister(Reg)) { + // An instruction with a physical register. Conservatively mark it as + // an expression stack input so that it isn't reordered with anything + // in an expression stack which might use it (physical registers + // aren't in SSA form so it's not trivial to determine this). + // TODO: Be less conservative. + ImposeStackInputOrdering(Insert); continue; + } // Only consider registers with a single definition. // TODO: Eventually we may relax this, to stackify phi transfers. @@ -155,17 +191,15 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) { Def->getOpcode() == WebAssembly::ARGUMENT_F64) continue; - // Single-use expression trees require defs that have one use, or that - // they be trivially clonable. + // Single-use expression trees require defs that have one use. // TODO: Eventually we'll relax this, to take advantage of set_local // returning its result. if (!MRI.hasOneUse(Reg)) continue; - // For now, be conservative and don't look across block boundaries, - // unless we have something trivially clonable. + // For now, be conservative and don't look across block boundaries. // TODO: Be more aggressive. - if (Def->getParent() != &MBB && !Def->isMoveImmediate()) + if (Def->getParent() != &MBB) continue; // Don't move instructions that have side effects or memory dependencies @@ -179,11 +213,11 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) { MBB.insert(MachineBasicBlock::instr_iterator(Insert), Def->removeFromParent()); MFI.stackifyVReg(Reg); - ImposeStackOrdering(Def); + ImposeStackOrdering(Def, MRI); Insert = Def; } if (AnyStackified) - ImposeStackOrdering(&MI); + ImposeStackOrdering(&MI, MRI); } } @@ -205,6 +239,10 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) { continue; unsigned VReg = MO.getReg(); + // Don't stackify physregs like SP or FP. 
+ if (!TargetRegisterInfo::isVirtualRegister(VReg)) + continue; + if (MFI.isVRegStackified(VReg)) { if (MO.isDef()) Stack.push_back(VReg); diff --git a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp index 6c74098aff10..f87b547e3f57 100644 --- a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp +++ b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp @@ -67,3 +67,12 @@ WebAssemblyRegisterInfo::getFrameRegister(const MachineFunction &MF) const { const WebAssemblyFrameLowering *TFI = getFrameLowering(MF); return Regs[TFI->hasFP(MF)][TT.isArch64Bit()]; } + +const TargetRegisterClass * +WebAssemblyRegisterInfo::getPointerRegClass(const MachineFunction &MF, + unsigned Kind) const { + assert(Kind == 0 && "Only one kind of pointer on WebAssembly"); + if (MF.getSubtarget().hasAddr64()) + return &WebAssembly::I64RegClass; + return &WebAssembly::I32RegClass; +} diff --git a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.h b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.h index bd1485317160..ad1d71eebf22 100644 --- a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.h +++ b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.h @@ -41,6 +41,10 @@ class WebAssemblyRegisterInfo final : public WebAssemblyGenRegisterInfo { // Debug information queries. unsigned getFrameRegister(const MachineFunction &MF) const override; + + const TargetRegisterClass * + getPointerRegClass(const MachineFunction &MF, + unsigned Kind = 0) const override; }; } // end namespace llvm diff --git a/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp b/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp index 3a7f50e3b142..650143a94095 100644 --- a/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp +++ b/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp @@ -10,9 +10,12 @@ /// \file /// \brief This file implements an optimization pass using store result values. /// -/// WebAssembly's store instructions return the stored value, specifically to -/// enable the optimization of reducing get_local/set_local traffic, which is -/// what we're doing here. +/// WebAssembly's store instructions return the stored value. This is to enable +/// an optimization wherein uses of the stored value can be replaced by uses of +/// the store's result value, making the stored value register more likely to +/// be single-use, thus more likely to be useful to register stackifying, and +/// potentially also exposing the store to register stackifying. These both can +/// reduce get_local/set_local traffic. 
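/// For example (illustrative, in MachineInstr terms): after
///   %res = STORE_I32 0, %addr, %val
/// a later use of %val that the store dominates can be rewritten to use %res,
/// leaving %val single-use and therefore a better candidate for register
/// stackification.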
/// //===----------------------------------------------------------------------===// @@ -68,6 +71,9 @@ bool WebAssemblyStoreResults::runOnMachineFunction(MachineFunction &MF) { const MachineRegisterInfo &MRI = MF.getRegInfo(); MachineDominatorTree &MDT = getAnalysis(); + bool Changed = false; + + assert(MRI.isSSA() && "StoreResults depends on SSA form"); for (auto &MBB : MF) { DEBUG(dbgs() << "Basic Block: " << MBB.getName() << '\n'); @@ -85,22 +91,31 @@ bool WebAssemblyStoreResults::runOnMachineFunction(MachineFunction &MF) { case WebAssembly::STORE_I32: case WebAssembly::STORE_I64: unsigned ToReg = MI.getOperand(0).getReg(); - unsigned FromReg = MI.getOperand(2).getReg(); + unsigned FromReg = MI.getOperand(3).getReg(); for (auto I = MRI.use_begin(FromReg), E = MRI.use_end(); I != E;) { MachineOperand &O = *I++; MachineInstr *Where = O.getParent(); - if (Where->getOpcode() == TargetOpcode::PHI) - Where = Where->getOperand(&O - &Where->getOperand(0) + 1) - .getMBB() - ->getFirstTerminator(); - if (&MI == Where || !MDT.dominates(&MI, Where)) - continue; - DEBUG(dbgs() << "Setting operand " << O << " in " << *Where << - " from " << MI <<"\n"); + if (Where->getOpcode() == TargetOpcode::PHI) { + // PHIs use their operands on their incoming CFG edges rather than + // in their parent blocks. Get the basic block paired with this use + // of FromReg and check that MI's block dominates it. + MachineBasicBlock *Pred = + Where->getOperand(&O - &Where->getOperand(0) + 1).getMBB(); + if (!MDT.dominates(&MBB, Pred)) + continue; + } else { + // For a non-PHI, check that MI dominates the instruction in the + // normal way. + if (&MI == Where || !MDT.dominates(&MI, Where)) + continue; + } + Changed = true; + DEBUG(dbgs() << "Setting operand " << O << " in " << *Where + << " from " << MI << "\n"); O.setReg(ToReg); } } } - return true; + return Changed; } diff --git a/lib/Target/WebAssembly/WebAssemblySubtarget.cpp b/lib/Target/WebAssembly/WebAssemblySubtarget.cpp index 3d9e7aacbfbf..cb2d5a63a19f 100644 --- a/lib/Target/WebAssembly/WebAssemblySubtarget.cpp +++ b/lib/Target/WebAssembly/WebAssemblySubtarget.cpp @@ -46,3 +46,4 @@ WebAssemblySubtarget::WebAssemblySubtarget(const Triple &TT, TLInfo(TM, *this) {} bool WebAssemblySubtarget::enableMachineScheduler() const { return true; } +bool WebAssemblySubtarget::useAA() const { return true; } diff --git a/lib/Target/WebAssembly/WebAssemblySubtarget.h b/lib/Target/WebAssembly/WebAssemblySubtarget.h index 9b17300e497d..f530a290fa0e 100644 --- a/lib/Target/WebAssembly/WebAssemblySubtarget.h +++ b/lib/Target/WebAssembly/WebAssemblySubtarget.h @@ -69,7 +69,7 @@ class WebAssemblySubtarget final : public WebAssemblyGenSubtargetInfo { } const Triple &getTargetTriple() const { return TargetTriple; } bool enableMachineScheduler() const override; - bool useAA() const override { return true; } + bool useAA() const override; // Predicates used by WebAssemblyInstrInfo.td. 
bool hasAddr64() const { return TargetTriple.isArch64Bit(); } diff --git a/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp index 29fb89c6e18f..a333fb3055e9 100644 --- a/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp +++ b/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp @@ -146,14 +146,24 @@ void WebAssemblyPassConfig::addIRPasses() { } bool WebAssemblyPassConfig::addInstSelector() { + (void)TargetPassConfig::addInstSelector(); addPass( createWebAssemblyISelDag(getWebAssemblyTargetMachine(), getOptLevel())); + // Run the argument-move pass immediately after the ScheduleDAG scheduler + // so that we can fix up the ARGUMENT instructions before anything else + // sees them in the wrong place. + addPass(createWebAssemblyArgumentMove()); return false; } -bool WebAssemblyPassConfig::addILPOpts() { return true; } +bool WebAssemblyPassConfig::addILPOpts() { + (void)TargetPassConfig::addILPOpts(); + return true; +} void WebAssemblyPassConfig::addPreRegAlloc() { + TargetPassConfig::addPreRegAlloc(); + // Prepare store instructions for register stackifying. addPass(createWebAssemblyStoreResults()); @@ -171,18 +181,21 @@ void WebAssemblyPassConfig::addPostRegAlloc() { // Fails with: should be run after register allocation. disablePass(&MachineCopyPropagationID); - // TODO: Until we get ReverseBranchCondition support, MachineBlockPlacement - // can create ugly-looking control flow. - disablePass(&MachineBlockPlacementID); - // Run the register coloring pass to reduce the total number of registers. addPass(createWebAssemblyRegColoring()); + + TargetPassConfig::addPostRegAlloc(); } void WebAssemblyPassConfig::addPreEmitPass() { + TargetPassConfig::addPreEmitPass(); + // Put the CFG in structured form; insert BLOCK and LOOP markers. addPass(createWebAssemblyCFGStackify()); + // Lower br_unless into br_if. + addPass(createWebAssemblyLowerBrUnless()); + // Create a mapping from LLVM CodeGen virtual registers to wasm registers. addPass(createWebAssemblyRegNumbering()); diff --git a/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp index ea7044d58834..356631711921 100644 --- a/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp +++ b/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp @@ -25,8 +25,3 @@ WebAssemblyTTIImpl::getPopcntSupport(unsigned TyWidth) const { assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); return TargetTransformInfo::PSK_FastHardware; } - -bool WebAssemblyTTIImpl::haveFastSqrt(Type *Ty) const { - assert(Ty->isFPOrFPVectorTy() && "Ty must be floating point"); - return true; -} diff --git a/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h b/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h index 84f9f0e3e55e..26dc388cc922 100644 --- a/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h +++ b/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h @@ -55,7 +55,6 @@ class WebAssemblyTTIImpl final : public BasicTTIImplBase { // TODO: Implement more Scalar TTI for WebAssembly TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) const; - bool haveFastSqrt(Type *Ty) const; /// @} diff --git a/lib/Target/WebAssembly/known_gcc_test_failures.txt b/lib/Target/WebAssembly/known_gcc_test_failures.txt new file mode 100644 index 000000000000..cb6f28dd6fde --- /dev/null +++ b/lib/Target/WebAssembly/known_gcc_test_failures.txt @@ -0,0 +1,417 @@ +# Tests which are known to fail from the GCC torture test suite. 
+ +# Core dump. +920908-1.c +pr38151.c +va-arg-22.c + +# ValueTypes.h:222: llvm::MVT llvm::EVT::getSimpleVT() const: Assertion `isSimple() && "Expected a SimpleValueType!"' failed. +loop-12.c + + + +# TargetRegisterInfo.h:315: static unsigned int llvm::TargetRegisterInfo::virtReg2Index(unsigned int): Assertion `isVirtualRegister(Reg) && "Not a virtual register"' failed. +20021120-3.c +20070201-1.c +920501-9.c +930513-1.c +941014-2.c +980605-1.c +fprintf-1.c +gofast.c +pr41239.c +printf-1.c +struct-ret-1.c +va-arg-11.c +va-arg-21.c +va-arg-24.c +va-arg-trap-1.c + +# LiveInterval.cpp:1092: void llvm::LiveRange::verify() const: Assertion `I->end <= std::next(I)->start' failed. +20020406-1.c + + +# WebAssemblyCFGStackify.cpp:211: void SortBlocks(llvm::MachineFunction&, const llvm::MachineLoopInfo&): Assertion `L->contains( MLI.getLoopFor(&*prev(MachineFunction::iterator(&MBB)))) && "Loop isn't contiguous"' failed. +20000815-1.c +20010129-1.c +20120427-1.c +20120427-2.c +930628-1.c +980707-1.c +990524-1.c +arith-rand-ll.c +arith-rand.c +builtin-bitops-1.c +memcpy-2.c +memset-1.c +memset-3.c +pending-4.c +pr20601-1.c +pr34415.c +pr49073.c +pr53465.c +strcpy-1.c +switch-1.c + +# WebAssemblyCFGStackify.cpp:374: virtual bool {anonymous}::WebAssemblyCFGStackify::runOnMachineFunction(llvm::MachineFunction&): Assertion `Stack.empty()' failed. +20020506-1.c +20020510-1.c +20071120-1.c +compare-1.c +pr15296.c +pr17133.c +pr19005.c + +# WebAssemblyISelLowering.cpp:316: virtual llvm::SDValue llvm::WebAssemblyTargetLowering::LowerCall(llvm::TargetLowering::CallLoweringInfo&, llvm::SmallVectorImpl&) const: Assertion `!Out.Flags.isByVal() && "byval is not valid for return values"' failed. +20030914-2.c +20040703-1.c +20081117-1.c +920625-1.c +931004-11.c +931004-13.c +980223.c +bitfld-5.c +complex-7.c +pr38969.c +pr51323.c +pr52129.c +pr57130.c + +# Cannot select FrameIndex. 
+20000412-2.c +20000519-1.c +20000706-4.c +20000706-5.c +20000801-1.c +20000801-2.c +20000801-4.c +20001228-1.c +20010116-1.c +20010518-2.c +20010915-1.c +20011126-2.c +20020413-1.c +20020529-1.c +20021024-1.c +20021219-1.c +20030218-1.c +20030221-1.c +20030222-1.c +20030313-1.c +20030828-1.c +20030914-1.c +20030916-1.c +20031012-1.c +20031201-1.c +20040218-1.c +20040302-1.c +20040625-1.c +20040823-1.c +20041019-1.c +20041113-1.c +20041124-1.c +20041126-1.c +20041214-1.c +20050121-1.c +20050203-1.c +20050502-1.c +20050502-2.c +20050826-2.c +20060420-1.c +20070212-1.c +20070517-1.c +20071029-1.c +20071030-1.c +20071108-1.c +20071202-1.c +20071213-1.c +20071219-1.c +20080502-1.c +20080506-2.c +20080519-1.c +20080522-1.c +20080604-1.c +20081103-1.c +20090113-1.c +20090113-2.c +20090113-3.c +20090207-1.c +20090623-1.c +20100708-1.c +20101013-1.c +20111208-1.c +20111212-1.c +20120105-1.c +20120808-1.c +20120919-1.c +20121108-1.c +20131127-1.c +20140425-1.c +920411-1.c +920501-6.c +920501-8.c +920726-1.c +930518-1.c +930622-2.c +930930-2.c +931004-10.c +931004-12.c +931004-14.c +931004-2.c +931004-4.c +931004-6.c +931004-8.c +950710-1.c +960215-1.c +960327-1.c +960513-1.c +980205.c +980608-1.c +980709-1.c +980716-1.c +990127-1.c +990513-1.c +990531-1.c +991216-2.c +991228-1.c +alloca-1.c +builtin-prefetch-2.c +cbrt.c +complex-5.c +complex-6.c +conversion.c +enum-3.c +fprintf-chk-1.c +frame-address.c +loop-15.c +loop-ivopts-2.c +mayalias-3.c +memcpy-1.c +multi-ix.c +pr15262.c +pr20466-1.c +pr20527-1.c +pr27073.c +pr27285.c +pr28778.c +pr28982b.c +pr29006.c +pr30778.c +pr31448-2.c +pr31448.c +pr33142.c +pr33870-1.c +pr33870.c +pr33992.c +pr34176.c +pr35472.c +pr36339.c +pr36343.c +pr37573.c +pr38051.c +pr38212.c +pr38236.c +pr39100.c +pr39120.c +pr39339.c +pr40022.c +pr40657.c +pr42614.c +pr42691.c +pr43236.c +pr43784.c +pr43835.c +pr43987.c +pr44202-1.c +pr44575.c +pr44852.c +pr44942.c +pr45070.c +pr46309.c +pr47538.c +pr47925.c +pr49218.c +pr49279.c +pr49390.c +pr49419.c +pr51466.c +pr51877.c +pr51933.c +pr52760.c +pr52979-1.c +pr52979-2.c +pr53645-2.c +pr53645.c +pr54471.c +pr54985.c +pr56205.c +pr56799.c +pr56866.c +pr56982.c +pr57124.c +pr57131.c +pr57876.c +pr58277-1.c +pr58365.c +pr59229.c +pr59358.c +pr59643.c +pr60960.c +printf-chk-1.c +pta-field-1.c +pta-field-2.c +regstack-1.c +stdarg-1.c +stdarg-2.c +stdarg-3.c +stdarg-4.c +strct-stdarg-1.c +strct-varg-1.c +string-opt-17.c +string-opt-18.c +string-opt-5.c +struct-aliasing-1.c +va-arg-1.c +va-arg-10.c +va-arg-12.c +va-arg-13.c +va-arg-14.c +va-arg-15.c +va-arg-16.c +va-arg-17.c +va-arg-18.c +va-arg-19.c +va-arg-2.c +va-arg-20.c +va-arg-23.c +va-arg-26.c +va-arg-4.c +va-arg-5.c +va-arg-6.c +va-arg-7.c +va-arg-8.c +va-arg-9.c +va-arg-pack-1.c +vfprintf-1.c +vfprintf-chk-1.c +vprintf-1.c +vprintf-chk-1.c + +# Cannot select callseq_end. +20040811-1.c +pr43220.c +vla-dealloc-1.c + +# Cannot select brind. +20071210-1.c +920501-4.c +920501-5.c + +# Cannot select BlockAddress. +comp-goto-1.c +980526-1.c +990208-1.c + +# WebAssembly hasn't implemented byval arguments. 
+20000412-3.c +20000419-1.c +20000706-1.c +20000706-2.c +20000707-1.c +20000717-1.c +20000717-5.c +20000808-1.c +20010605-2.c +20011113-1.c +20020215-1.c +20020810-1.c +20021118-1.c +20040707-1.c +20040709-1.c +20040709-2.c +20041201-1.c +20050713-1.c +20070614-1.c +920908-2.c +921112-1.c +921117-1.c +921123-2.c +921204-1.c +930126-1.c +930208-1.c +931004-5.c +931004-9.c +931031-1.c +950607-2.c +960416-1.c +990525-1.c +991118-1.c +bf64-1.c +complex-1.c +complex-2.c +pr15262-2.c +pr20621-1.c +pr23135.c +pr30185.c +pr42248.c + +# unimplemented operation lowering. +20010122-1.c +20030323-1.c +20030811-1.c +pr17377.c + +# Error: invalid output constraint '=t' in asm. +990413-2.c +990826-0.c + +# Error: __builtin_setjmp / __builtin_longjmp is not supported for the current target. +built-in-setjmp.c +pr60003.c + +# Error in the program / unsupported by Clang. +scal-to-vec1.c +scal-to-vec2.c +scal-to-vec3.c +20000822-1.c +20010209-1.c +20010605-1.c +20030501-1.c +20040520-1.c +20061220-1.c +20090219-1.c +920415-1.c +920428-2.c +920501-7.c +920612-2.c +920721-4.c +921017-1.c +921215-1.c +931002-1.c +comp-goto-2.c +nest-align-1.c +nest-stdar-1.c +nestfunc-1.c +nestfunc-2.c +nestfunc-3.c +nestfunc-5.c +nestfunc-6.c +nestfunc-7.c +pr22061-3.c +pr22061-4.c +pr24135.c +pr51447.c +20020412-1.c +20040308-1.c +20040423-1.c +20041218-2.c +20070919-1.c +align-nest.c +pr41935.c +20050107-1.c +20050119-1.c +20050119-2.c +920302-1.c +920501-3.c +920728-1.c +pr28865.c diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp index 7089c1f7592b..f2efefd35c52 100644 --- a/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -1049,6 +1049,7 @@ static unsigned getIntelMemOperandSize(StringRef OpStr) { .Cases("BYTE", "byte", 8) .Cases("WORD", "word", 16) .Cases("DWORD", "dword", 32) + .Cases("FWORD", "fword", 48) .Cases("QWORD", "qword", 64) .Cases("MMWORD","mmword", 64) .Cases("XWORD", "xword", 80) @@ -1693,12 +1694,14 @@ std::unique_ptr X86AsmParser::ParseIntelOperand() { return ParseIntelOperator(IOK_TYPE); } + bool PtrInOperand = false; unsigned Size = getIntelMemOperandSize(Tok.getString()); if (Size) { Parser.Lex(); // Eat operand size (e.g., byte, word). if (Tok.getString() != "PTR" && Tok.getString() != "ptr") return ErrorOperand(Tok.getLoc(), "Expected 'PTR' or 'ptr' token!"); Parser.Lex(); // Eat ptr. + PtrInOperand = true; } Start = Tok.getLoc(); @@ -1754,9 +1757,16 @@ std::unique_ptr X86AsmParser::ParseIntelOperand() { if (!ParseRegister(RegNo, Start, End)) { // If this is a segment register followed by a ':', then this is the start // of a segment override, otherwise this is a normal register reference. 
- if (getLexer().isNot(AsmToken::Colon)) + // In case it is a normal register and there is ptr in the operand this + // is an error + if (getLexer().isNot(AsmToken::Colon)){ + if (PtrInOperand){ + return ErrorOperand(Start, "expected memory operand after " + "'ptr', found register operand instead"); + } return X86Operand::CreateReg(RegNo, Start, End); - + } + return ParseIntelSegmentOverride(/*SegReg=*/RegNo, Start, Size); } diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt index db486f9243af..b23f5c353013 100644 --- a/lib/Target/X86/CMakeLists.txt +++ b/lib/Target/X86/CMakeLists.txt @@ -34,6 +34,7 @@ set(sources X86VZeroUpper.cpp X86FixupLEAs.cpp X86WinEHState.cpp + X86OptimizeLEAs.cpp ) add_llvm_target(X86CodeGen ${sources}) diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp index 6140c5ac1e78..040143b15587 100644 --- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp +++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp @@ -361,7 +361,7 @@ static int readPrefixes(struct InternalInstruction* insn) { * then it should be disassembled as a xacquire/xrelease not repne/rep. */ if ((byte == 0xf2 || byte == 0xf3) && - ((nextByte == 0xf0) | + ((nextByte == 0xf0) || ((nextByte & 0xfe) == 0x86 || (nextByte & 0xf8) == 0x90))) insn->xAcquireRelease = true; /* diff --git a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index 2eba084fc147..914ccdfb49a1 100644 --- a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -639,13 +639,13 @@ class DarwinX86AsmBackend : public X86AsmBackend { /// \brief Get the compact unwind number for a given register. The number /// corresponds to the enum lists in compact_unwind_encoding.h. int getCompactUnwindRegNum(unsigned Reg) const { - static const uint16_t CU32BitRegs[7] = { + static const MCPhysReg CU32BitRegs[7] = { X86::EBX, X86::ECX, X86::EDX, X86::EDI, X86::ESI, X86::EBP, 0 }; - static const uint16_t CU64BitRegs[] = { + static const MCPhysReg CU64BitRegs[] = { X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0 }; - const uint16_t *CURegs = Is64Bit ? CU64BitRegs : CU32BitRegs; + const MCPhysReg *CURegs = Is64Bit ? CU64BitRegs : CU32BitRegs; for (int Idx = 1; *CURegs; ++CURegs, ++Idx) if (*CURegs == Reg) return Idx; diff --git a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp index 614e43864088..191ebeac7265 100644 --- a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp @@ -149,14 +149,19 @@ void X86MachObjectWriter::RecordX86_64Relocation( // Neither symbol can be modified. if (Target.getSymA()->getKind() != MCSymbolRefExpr::VK_None || - Target.getSymB()->getKind() != MCSymbolRefExpr::VK_None) - report_fatal_error("unsupported relocation of modified symbol", false); + Target.getSymB()->getKind() != MCSymbolRefExpr::VK_None) { + Asm.getContext().reportError(Fixup.getLoc(), + "unsupported relocation of modified symbol"); + return; + } // We don't support PCrel relocations of differences. Darwin 'as' doesn't // implement most of these correctly. 
- if (IsPCRel) - report_fatal_error("unsupported pc-relative relocation of difference", - false); + if (IsPCRel) { + Asm.getContext().reportError( + Fixup.getLoc(), "unsupported pc-relative relocation of difference"); + return; + } // The support for the situation where one or both of the symbols would // require a local relocation is handled just like if the symbols were @@ -168,8 +173,11 @@ void X86MachObjectWriter::RecordX86_64Relocation( // Darwin 'as' doesn't emit correct relocations for this (it ends up with a // single SIGNED relocation); reject it for now. Except the case where both // symbols don't have a base, equal but both NULL. - if (A_Base == B_Base && A_Base) - report_fatal_error("unsupported relocation with identical base", false); + if (A_Base == B_Base && A_Base) { + Asm.getContext().reportError( + Fixup.getLoc(), "unsupported relocation with identical base"); + return; + } // A subtraction expression where either symbol is undefined is a // non-relocatable expression. @@ -245,12 +253,16 @@ void X86MachObjectWriter::RecordX86_64Relocation( FixedValue = Res; return; } else { - report_fatal_error("unsupported relocation of variable '" + - Symbol->getName() + "'", false); + Asm.getContext().reportError(Fixup.getLoc(), + "unsupported relocation of variable '" + + Symbol->getName() + "'"); + return; } } else { - report_fatal_error("unsupported relocation of undefined symbol '" + - Symbol->getName() + "'", false); + Asm.getContext().reportError( + Fixup.getLoc(), "unsupported relocation of undefined symbol '" + + Symbol->getName() + "'"); + return; } MCSymbolRefExpr::VariantKind Modifier = Target.getSymA()->getKind(); @@ -267,8 +279,9 @@ void X86MachObjectWriter::RecordX86_64Relocation( } else if (Modifier == MCSymbolRefExpr::VK_TLVP) { Type = MachO::X86_64_RELOC_TLV; } else if (Modifier != MCSymbolRefExpr::VK_None) { - report_fatal_error("unsupported symbol modifier in relocation", - false); + Asm.getContext().reportError( + Fixup.getLoc(), "unsupported symbol modifier in relocation"); + return; } else { Type = MachO::X86_64_RELOC_SIGNED; @@ -293,9 +306,12 @@ void X86MachObjectWriter::RecordX86_64Relocation( } } } else { - if (Modifier != MCSymbolRefExpr::VK_None) - report_fatal_error("unsupported symbol modifier in branch " - "relocation", false); + if (Modifier != MCSymbolRefExpr::VK_None) { + Asm.getContext().reportError( + Fixup.getLoc(), + "unsupported symbol modifier in branch relocation"); + return; + } Type = MachO::X86_64_RELOC_BRANCH; } @@ -310,16 +326,22 @@ void X86MachObjectWriter::RecordX86_64Relocation( Type = MachO::X86_64_RELOC_GOT; IsPCRel = 1; } else if (Modifier == MCSymbolRefExpr::VK_TLVP) { - report_fatal_error("TLVP symbol modifier should have been rip-rel", - false); - } else if (Modifier != MCSymbolRefExpr::VK_None) - report_fatal_error("unsupported symbol modifier in relocation", false); - else { + Asm.getContext().reportError( + Fixup.getLoc(), "TLVP symbol modifier should have been rip-rel"); + return; + } else if (Modifier != MCSymbolRefExpr::VK_None) { + Asm.getContext().reportError( + Fixup.getLoc(), "unsupported symbol modifier in relocation"); + return; + } else { Type = MachO::X86_64_RELOC_UNSIGNED; unsigned Kind = Fixup.getKind(); - if (Kind == X86::reloc_signed_4byte) - report_fatal_error("32-bit absolute addressing is not supported in " - "64-bit mode", false); + if (Kind == X86::reloc_signed_4byte) { + Asm.getContext().reportError( + Fixup.getLoc(), + "32-bit absolute addressing is not supported in 64-bit mode"); + return; + } } } } @@ 
-351,10 +373,13 @@ bool X86MachObjectWriter::recordScatteredRelocation(MachObjectWriter *Writer, // See . const MCSymbol *A = &Target.getSymA()->getSymbol(); - if (!A->getFragment()) - report_fatal_error("symbol '" + A->getName() + - "' can not be undefined in a subtraction expression", - false); + if (!A->getFragment()) { + Asm.getContext().reportError( + Fixup.getLoc(), + "symbol '" + A->getName() + + "' can not be undefined in a subtraction expression"); + return false; + } uint32_t Value = Writer->getSymbolAddress(*A, Layout); uint64_t SecAddr = Writer->getSectionAddress(A->getFragment()->getParent()); @@ -364,10 +389,13 @@ bool X86MachObjectWriter::recordScatteredRelocation(MachObjectWriter *Writer, if (const MCSymbolRefExpr *B = Target.getSymB()) { const MCSymbol *SB = &B->getSymbol(); - if (!SB->getFragment()) - report_fatal_error("symbol '" + B->getSymbol().getName() + - "' can not be undefined in a subtraction expression", - false); + if (!SB->getFragment()) { + Asm.getContext().reportError( + Fixup.getLoc(), + "symbol '" + B->getSymbol().getName() + + "' can not be undefined in a subtraction expression"); + return false; + } // Select the appropriate difference relocation type. // diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h index cd914ee7f9c8..fbec6626d99d 100644 --- a/lib/Target/X86/X86.h +++ b/lib/Target/X86/X86.h @@ -23,50 +23,47 @@ class FunctionPass; class ImmutablePass; class X86TargetMachine; -/// createX86ISelDag - This pass converts a legalized DAG into a -/// X86-specific DAG, ready for instruction scheduling. -/// +/// This pass converts a legalized DAG into a X86-specific DAG, ready for +/// instruction scheduling. FunctionPass *createX86ISelDag(X86TargetMachine &TM, CodeGenOpt::Level OptLevel); -/// createX86GlobalBaseRegPass - This pass initializes a global base -/// register for PIC on x86-32. +/// This pass initializes a global base register for PIC on x86-32. FunctionPass* createX86GlobalBaseRegPass(); -/// createCleanupLocalDynamicTLSPass() - This pass combines multiple accesses -/// to local-dynamic TLS variables so that the TLS base address for the module -/// is only fetched once per execution path through the function. +/// This pass combines multiple accesses to local-dynamic TLS variables so that +/// the TLS base address for the module is only fetched once per execution path +/// through the function. FunctionPass *createCleanupLocalDynamicTLSPass(); -/// createX86FloatingPointStackifierPass - This function returns a pass which -/// converts floating point register references and pseudo instructions into -/// floating point stack references and physical instructions. -/// +/// This function returns a pass which converts floating-point register +/// references and pseudo instructions into floating-point stack references and +/// physical instructions. FunctionPass *createX86FloatingPointStackifierPass(); -/// createX86IssueVZeroUpperPass - This pass inserts AVX vzeroupper instructions -/// before each call to avoid transition penalty between functions encoded with -/// AVX and SSE. +/// This pass inserts AVX vzeroupper instructions before each call to avoid +/// transition penalty between functions encoded with AVX and SSE. FunctionPass *createX86IssueVZeroUpperPass(); -/// createX86PadShortFunctions - Return a pass that pads short functions -/// with NOOPs. This will prevent a stall when returning on the Atom. +/// Return a pass that pads short functions with NOOPs. +/// This will prevent a stall when returning on the Atom. 
FunctionPass *createX86PadShortFunctions(); -/// createX86FixupLEAs - Return a a pass that selectively replaces -/// certain instructions (like add, sub, inc, dec, some shifts, -/// and some multiplies) by equivalent LEA instructions, in order -/// to eliminate execution delays in some Atom processors. + +/// Return a a pass that selectively replaces certain instructions (like add, +/// sub, inc, dec, some shifts, and some multiplies) by equivalent LEA +/// instructions, in order to eliminate execution delays in some processors. FunctionPass *createX86FixupLEAs(); -/// createX86CallFrameOptimization - Return a pass that optimizes -/// the code-size of x86 call sequences. This is done by replacing -/// esp-relative movs with pushes. +/// Return a pass that removes redundant address recalculations. +FunctionPass *createX86OptimizeLEAs(); + +/// Return a pass that optimizes the code-size of x86 call sequences. This is +/// done by replacing esp-relative movs with pushes. FunctionPass *createX86CallFrameOptimization(); -/// createX86WinEHStatePass - Return an IR pass that inserts EH registration -/// stack objects and explicit EH state updates. This pass must run after EH -/// preparation, which does Windows-specific but architecture-neutral -/// preparation. +/// Return an IR pass that inserts EH registration stack objects and explicit +/// EH state updates. This pass must run after EH preparation, which does +/// Windows-specific but architecture-neutral preparation. FunctionPass *createX86WinEHStatePass(); /// Return a Machine IR pass that expands X86-specific pseudo diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index 7d9f396c1e96..7fc4def15fe5 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -182,6 +182,8 @@ def FeaturePRFCHW : SubtargetFeature<"prfchw", "HasPRFCHW", "true", "Support PRFCHW instructions">; def FeatureRDSEED : SubtargetFeature<"rdseed", "HasRDSEED", "true", "Support RDSEED instruction">; +def FeatureLAHFSAHF : SubtargetFeature<"sahf", "HasLAHFSAHF", "true", + "Support LAHF and SAHF instructions">; def FeatureMPX : SubtargetFeature<"mpx", "HasMPX", "true", "Support MPX instructions">; def FeatureLEAForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true", @@ -273,7 +275,8 @@ def : ProcessorModel<"core2", SandyBridgeModel, [ FeatureSSSE3, FeatureFXSR, FeatureCMPXCHG16B, - FeatureSlowBTMem + FeatureSlowBTMem, + FeatureLAHFSAHF ]>; def : ProcessorModel<"penryn", SandyBridgeModel, [ FeatureSlowUAMem16, @@ -281,7 +284,8 @@ def : ProcessorModel<"penryn", SandyBridgeModel, [ FeatureSSE41, FeatureFXSR, FeatureCMPXCHG16B, - FeatureSlowBTMem + FeatureSlowBTMem, + FeatureLAHFSAHF ]>; // Atom CPUs. @@ -299,7 +303,8 @@ class BonnellProc : ProcessorModel; def : BonnellProc<"bonnell">; def : BonnellProc<"atom">; // Pin the generic name to the baseline. @@ -319,7 +324,8 @@ class SilvermontProc : ProcessorModel; def : SilvermontProc<"silvermont">; def : SilvermontProc<"slm">; // Legacy alias. @@ -331,7 +337,8 @@ class NehalemProc : ProcessorModel; def : NehalemProc<"nehalem">; def : NehalemProc<"corei7">; @@ -346,7 +353,8 @@ class WestmereProc : ProcessorModel; def : WestmereProc<"westmere">; @@ -363,7 +371,8 @@ class SandyBridgeProc : ProcessorModel; def : SandyBridgeProc<"sandybridge">; def : SandyBridgeProc<"corei7-avx">; // Legacy alias. @@ -382,7 +391,8 @@ class IvyBridgeProc : ProcessorModel; def : IvyBridgeProc<"ivybridge">; def : IvyBridgeProc<"core-avx-i">; // Legacy alias. 
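
The FeatureLAHFSAHF additions above give the X86 subtarget an explicit bit for LAHF/SAHF availability: every CPU definition known to support the instructions in 64-bit mode now lists the feature, and later hunks in this patch query it as Subtarget->hasLAHFSAHF() before emitting SAHF-based sequences. A minimal standalone model of that flow, not LLVM code; the feature string "sahf" matches the SubtargetFeature definition above, everything else is illustrative:

  #include <set>
  #include <string>

  // Models how a "+sahf" feature string ends up as a queryable subtarget bit.
  struct X86SubtargetModel {
    std::set<std::string> Features;
    bool hasLAHFSAHF() const { return Features.count("sahf") != 0; }
  };

  // Mirrors the decision this patch gates: FNSTSW+SAHF needs the feature;
  // 64-bit CPUs without it have to use an FCOMI-based compare instead.
  const char *fpCompareSequence(const X86SubtargetModel &ST) {
    return ST.hasLAHFSAHF() ? "fnstsw %ax; sahf" : "fcomi-based sequence";
  }
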
@@ -408,7 +418,8 @@ class HaswellProc : ProcessorModel; def : HaswellProc<"haswell">; def : HaswellProc<"core-avx2">; // Legacy alias. @@ -436,7 +447,8 @@ class BroadwellProc : ProcessorModel; def : BroadwellProc<"broadwell">; @@ -465,7 +477,8 @@ class KnightsLandingProc : ProcessorModel; def : KnightsLandingProc<"knl">; @@ -500,7 +513,8 @@ class SkylakeProc : ProcessorModel; def : SkylakeProc<"skylake">; def : SkylakeProc<"skx">; // Legacy alias. @@ -544,10 +558,10 @@ def : Proc<"athlon64-sse3", [FeatureSlowUAMem16, FeatureSSE3, Feature3DNowA, FeatureSlowSHLD]>; def : Proc<"amdfam10", [FeatureSSE4A, Feature3DNowA, FeatureFXSR, FeatureCMPXCHG16B, FeatureLZCNT, FeaturePOPCNT, - FeatureSlowBTMem, FeatureSlowSHLD]>; + FeatureSlowBTMem, FeatureSlowSHLD, FeatureLAHFSAHF]>; def : Proc<"barcelona", [FeatureSSE4A, Feature3DNowA, FeatureFXSR, FeatureCMPXCHG16B, FeatureLZCNT, FeaturePOPCNT, - FeatureSlowBTMem, FeatureSlowSHLD]>; + FeatureSlowBTMem, FeatureSlowSHLD, FeatureLAHFSAHF]>; // Bobcat def : Proc<"btver1", [ @@ -560,7 +574,8 @@ def : Proc<"btver1", [ FeatureLZCNT, FeaturePOPCNT, FeatureXSAVE, - FeatureSlowSHLD + FeatureSlowSHLD, + FeatureLAHFSAHF ]>; // Jaguar @@ -580,7 +595,8 @@ def : ProcessorModel<"btver2", BtVer2Model, [ FeaturePOPCNT, FeatureXSAVE, FeatureXSAVEOPT, - FeatureSlowSHLD + FeatureSlowSHLD, + FeatureLAHFSAHF ]>; // Bulldozer @@ -598,7 +614,8 @@ def : Proc<"bdver1", [ FeatureLZCNT, FeaturePOPCNT, FeatureXSAVE, - FeatureSlowSHLD + FeatureSlowSHLD, + FeatureLAHFSAHF ]>; // Piledriver def : Proc<"bdver2", [ @@ -619,7 +636,8 @@ def : Proc<"bdver2", [ FeatureBMI, FeatureTBM, FeatureFMA, - FeatureSlowSHLD + FeatureSlowSHLD, + FeatureLAHFSAHF ]>; // Steamroller @@ -643,7 +661,8 @@ def : Proc<"bdver3", [ FeatureFMA, FeatureXSAVEOPT, FeatureSlowSHLD, - FeatureFSGSBase + FeatureFSGSBase, + FeatureLAHFSAHF ]>; // Excavator @@ -666,7 +685,8 @@ def : Proc<"bdver4", [ FeatureTBM, FeatureFMA, FeatureXSAVEOPT, - FeatureFSGSBase + FeatureFSGSBase, + FeatureLAHFSAHF ]>; def : Proc<"geode", [FeatureSlowUAMem16, Feature3DNowA]>; diff --git a/lib/Target/X86/X86CallFrameOptimization.cpp b/lib/Target/X86/X86CallFrameOptimization.cpp index 23990b01ba18..fc6ee1752f1f 100644 --- a/lib/Target/X86/X86CallFrameOptimization.cpp +++ b/lib/Target/X86/X86CallFrameOptimization.cpp @@ -500,7 +500,8 @@ bool X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF, // For debugging, when using SP-based CFA, we need to adjust the CFA // offset after each push. - if (!TFL->hasFP(MF) && MF.getMMI().usePreciseUnwindInfo()) + // TODO: This is needed only if we require precise CFA. + if (!TFL->hasFP(MF)) TFL->BuildCFI(MBB, std::next(Push), DL, MCCFIInstruction::createAdjustCfaOffset(nullptr, 4)); diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp index 682f75c7f51c..7841a84326b4 100644 --- a/lib/Target/X86/X86FrameLowering.cpp +++ b/lib/Target/X86/X86FrameLowering.cpp @@ -1003,7 +1003,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, else if (IsFunclet) Establisher = Uses64BitFramePtr ? X86::RDX : X86::EDX; - if (IsWin64Prologue && IsFunclet & !IsClrFunclet) { + if (IsWin64Prologue && IsFunclet && !IsClrFunclet) { // Immediately spill establisher into the home slot. // The runtime cares about this. 
// MOV64mr %rdx, 16(%rsp) @@ -2524,10 +2524,10 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, // (Pushes of argument for frame setup, callee pops for frame destroy) Amount -= InternalAmt; - // If this is a callee-pop calling convention, and we're emitting precise - // SP-based CFI, emit a CFA adjust for the amount the callee popped. - if (isDestroy && InternalAmt && DwarfCFI && !hasFP(MF) && - MMI.usePreciseUnwindInfo()) + // TODO: This is needed only if we require precise CFA. + // If this is a callee-pop calling convention, emit a CFA adjust for + // the amount the callee popped. + if (isDestroy && InternalAmt && DwarfCFI && !hasFP(MF)) BuildCFI(MBB, I, DL, MCCFIInstruction::createAdjustCfaOffset(nullptr, -InternalAmt)); @@ -2548,11 +2548,14 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, // offset to be correct at each call site, while for debugging we want // it to be more precise. int CFAOffset = Amount; - if (!MMI.usePreciseUnwindInfo()) - CFAOffset += InternalAmt; - CFAOffset = isDestroy ? -CFAOffset : CFAOffset; - BuildCFI(MBB, I, DL, - MCCFIInstruction::createAdjustCfaOffset(nullptr, CFAOffset)); + // TODO: When not using precise CFA, we also need to adjust for the + // InternalAmt here. + + if (CFAOffset) { + CFAOffset = isDestroy ? -CFAOffset : CFAOffset; + BuildCFI(MBB, I, DL, + MCCFIInstruction::createAdjustCfaOffset(nullptr, CFAOffset)); + } } return; @@ -2592,6 +2595,12 @@ bool X86FrameLowering::canUseAsEpilogue(const MachineBasicBlock &MBB) const { return !flagsNeedToBePreservedBeforeTheTerminators(MBB); } +bool X86FrameLowering::enableShrinkWrapping(const MachineFunction &MF) const { + // If we may need to emit frameless compact unwind information, give + // up as this is currently broken: PR25614. + return MF.getFunction()->hasFnAttribute(Attribute::NoUnwind) || hasFP(MF); +} + MachineBasicBlock::iterator X86FrameLowering::restoreWin32EHStackPointers( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc DL, bool RestoreSP) const { diff --git a/lib/Target/X86/X86FrameLowering.h b/lib/Target/X86/X86FrameLowering.h index 68dc8edfd430..3ab41b4a5789 100644 --- a/lib/Target/X86/X86FrameLowering.h +++ b/lib/Target/X86/X86FrameLowering.h @@ -134,6 +134,9 @@ class X86FrameLowering : public TargetFrameLowering { /// \p MBB will be correctly handled by the target. bool canUseAsEpilogue(const MachineBasicBlock &MBB) const override; + /// Returns true if the target will correctly handle shrink wrapping. + bool enableShrinkWrapping(const MachineFunction &MF) const override; + /// convertArgMovsToPushes - This method tries to convert a call sequence /// that uses sub and mov instructions to put the argument onto the stack /// into a series of pushes. 
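
The enableShrinkWrapping() override declared above is deliberately conservative: shrink wrapping is only permitted when the function cannot unwind or keeps a frame pointer, because frameless compact unwind information is currently emitted incorrectly for shrink-wrapped prologues (PR25614). A standalone sketch of the predicate and how it behaves for a few cases, plain C++ rather than the LLVM hook itself:

  #include <cassert>

  // Same decision as X86FrameLowering::enableShrinkWrapping, expressed on two
  // booleans instead of a MachineFunction.
  static bool mayShrinkWrap(bool IsNoUnwind, bool HasFramePointer) {
    return IsNoUnwind || HasFramePointer;
  }

  int main() {
    assert(mayShrinkWrap(true, false));    // nounwind: no unwind info needed
    assert(mayShrinkWrap(false, true));    // FP-based frames avoid the broken case
    assert(!mayShrinkWrap(false, false));  // would need frameless compact unwind
    return 0;
  }
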
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 17573733b3ec..f16c5015445a 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -1337,6 +1337,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FDIV, MVT::v16f32, Legal); setOperationAction(ISD::FSQRT, MVT::v16f32, Legal); setOperationAction(ISD::FNEG, MVT::v16f32, Custom); + setOperationAction(ISD::FABS, MVT::v16f32, Custom); setOperationAction(ISD::FADD, MVT::v8f64, Legal); setOperationAction(ISD::FSUB, MVT::v8f64, Legal); @@ -1344,6 +1345,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FDIV, MVT::v8f64, Legal); setOperationAction(ISD::FSQRT, MVT::v8f64, Legal); setOperationAction(ISD::FNEG, MVT::v8f64, Custom); + setOperationAction(ISD::FABS, MVT::v8f64, Custom); setOperationAction(ISD::FMA, MVT::v8f64, Legal); setOperationAction(ISD::FMA, MVT::v16f32, Legal); @@ -1381,6 +1383,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal); setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal); setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal); + } else { + setOperationAction(ISD::MLOAD, MVT::v8i32, Custom); + setOperationAction(ISD::MLOAD, MVT::v8f32, Custom); + setOperationAction(ISD::MSTORE, MVT::v8i32, Custom); + setOperationAction(ISD::MSTORE, MVT::v8f32, Custom); } setOperationAction(ISD::TRUNCATE, MVT::i1, Custom); setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom); @@ -1456,6 +1463,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i1, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i1, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v8i1, Custom); @@ -1707,8 +1715,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); - if (!Subtarget->is64Bit()) + if (!Subtarget->is64Bit()) { setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom); + } // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't // handle type legalization for these operations here. @@ -8020,6 +8030,7 @@ static SDValue lowerVectorShuffleAsTruncBroadcast(SDLoc DL, MVT VT, SDValue V0, /// For convenience, this code also bundles all of the subtarget feature set /// filtering. While a little annoying to re-dispatch on type here, there isn't /// a convenient way to factor it out. +/// FIXME: This is very similar to LowerVectorBroadcast - can we merge them? static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V, ArrayRef Mask, const X86Subtarget *Subtarget, @@ -8092,6 +8103,20 @@ static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V, // Only AVX2 has register broadcasts. if (!Subtarget->hasAVX2() && !isShuffleFoldableLoad(V)) return SDValue(); + } else if (MayFoldLoad(V) && !cast(V)->isVolatile()) { + // If we are broadcasting a load that is only used by the shuffle + // then we can reduce the vector load to the broadcasted scalar load. 
+ LoadSDNode *Ld = cast(V); + SDValue BaseAddr = Ld->getOperand(1); + EVT AddrVT = BaseAddr.getValueType(); + EVT SVT = VT.getScalarType(); + unsigned Offset = BroadcastIdx * SVT.getStoreSize(); + SDValue NewAddr = DAG.getNode( + ISD::ADD, DL, AddrVT, BaseAddr, + DAG.getConstant(Offset, DL, AddrVT)); + V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr, + DAG.getMachineFunction().getMachineMemOperand( + Ld->getMemOperand(), Offset, SVT.getStoreSize())); } else if (BroadcastIdx != 0 || !Subtarget->hasAVX2()) { // We can't broadcast from a vector register without AVX2, and we can only // broadcast from the zero-element of a vector register. @@ -13927,6 +13952,9 @@ SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp, SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW, DAG.getConstant(8, dl, MVT::i8)); SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl); + + // Some 64-bit targets lack SAHF support, but they do support FCOMI. + assert(Subtarget->hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?"); return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl); } @@ -15619,54 +15647,40 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SplitStack; SDLoc dl(Op); + // Get the inputs. + SDNode *Node = Op.getNode(); + SDValue Chain = Op.getOperand(0); + SDValue Size = Op.getOperand(1); + unsigned Align = cast(Op.getOperand(2))->getZExtValue(); + EVT VT = Node->getValueType(0); + + // Chain the dynamic stack allocation so that it doesn't modify the stack + // pointer when other instructions are using the stack. + Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, dl, true), dl); + + bool Is64Bit = Subtarget->is64Bit(); + MVT SPTy = getPointerTy(DAG.getDataLayout()); + + SDValue Result; if (!Lower) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - SDNode* Node = Op.getNode(); - unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore(); assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and" - " not tell us which reg is the stack pointer!"); + " not tell us which reg is the stack pointer!"); EVT VT = Node->getValueType(0); - SDValue Tmp1 = SDValue(Node, 0); - SDValue Tmp2 = SDValue(Node, 1); SDValue Tmp3 = Node->getOperand(2); - SDValue Chain = Tmp1.getOperand(0); - - // Chain the dynamic stack allocation so that it doesn't modify the stack - // pointer when other instructions are using the stack. - Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, dl, true), - SDLoc(Node)); - SDValue Size = Tmp2.getOperand(1); SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT); Chain = SP.getValue(1); unsigned Align = cast(Tmp3)->getZExtValue(); const TargetFrameLowering &TFI = *Subtarget->getFrameLowering(); unsigned StackAlign = TFI.getStackAlignment(); - Tmp1 = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value + Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value if (Align > StackAlign) - Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1, - DAG.getConstant(-(uint64_t)Align, dl, VT)); - Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain - - Tmp2 = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true), - DAG.getIntPtrConstant(0, dl, true), SDValue(), - SDLoc(Node)); - - SDValue Ops[2] = { Tmp1, Tmp2 }; - return DAG.getMergeValues(Ops, dl); - } - - // Get the inputs. 
- SDValue Chain = Op.getOperand(0); - SDValue Size = Op.getOperand(1); - unsigned Align = cast(Op.getOperand(2))->getZExtValue(); - EVT VT = Op.getNode()->getValueType(0); - - bool Is64Bit = Subtarget->is64Bit(); - MVT SPTy = getPointerTy(DAG.getDataLayout()); - - if (SplitStack) { + Result = DAG.getNode(ISD::AND, dl, VT, Result, + DAG.getConstant(-(uint64_t)Align, dl, VT)); + Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain + } else if (SplitStack) { MachineRegisterInfo &MRI = MF.getRegInfo(); if (Is64Bit) { @@ -15684,10 +15698,8 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy); unsigned Vreg = MRI.createVirtualRegister(AddrRegClass); Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size); - SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain, + Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain, DAG.getRegister(Vreg, SPTy)); - SDValue Ops1[2] = { Value, Chain }; - return DAG.getMergeValues(Ops1, dl); } else { SDValue Flag; const unsigned Reg = (Subtarget->isTarget64BitLP64() ? X86::RAX : X86::EAX); @@ -15709,9 +15721,14 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP); } - SDValue Ops1[2] = { SP, Chain }; - return DAG.getMergeValues(Ops1, dl); + Result = SP; } + + Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true), + DAG.getIntPtrConstant(0, dl, true), SDValue(), dl); + + SDValue Ops[2] = {Result, Chain}; + return DAG.getMergeValues(Ops, dl); } SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { @@ -15988,58 +16005,90 @@ static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT, return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt); } +/// \brief Return Mask with the necessary casting or extending +/// for \p Mask according to \p MaskVT when lowering masking intrinsics +static SDValue getMaskNode(SDValue Mask, MVT MaskVT, + const X86Subtarget *Subtarget, + SelectionDAG &DAG, SDLoc dl) { + + if (MaskVT.bitsGT(Mask.getSimpleValueType())) { + // Mask should be extended + Mask = DAG.getNode(ISD::ANY_EXTEND, dl, + MVT::getIntegerVT(MaskVT.getSizeInBits()), Mask); + } + + if (Mask.getSimpleValueType() == MVT::i64 && Subtarget->is32Bit()) { + if (MaskVT == MVT::v64i1) { + assert(Subtarget->hasBWI() && "Expected AVX512BW target!"); + // In case 32bit mode, bitcast i64 is illegal, extend/split it. + SDValue Lo, Hi; + Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask, + DAG.getConstant(0, dl, MVT::i32)); + Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask, + DAG.getConstant(1, dl, MVT::i32)); + + Lo = DAG.getBitcast(MVT::v32i1, Lo); + Hi = DAG.getBitcast(MVT::v32i1, Hi); + + return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi); + } else { + // MaskVT require < 64bit. Truncate mask (should succeed in any case), + // and bitcast. + MVT TruncVT = MVT::getIntegerVT(MaskVT.getSizeInBits()); + return DAG.getBitcast(MaskVT, + DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Mask)); + } + + } else { + MVT BitcastVT = MVT::getVectorVT(MVT::i1, + Mask.getSimpleValueType().getSizeInBits()); + // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements + // are extracted by EXTRACT_SUBVECTOR. 
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, + DAG.getBitcast(BitcastVT, Mask), + DAG.getIntPtrConstant(0, dl)); + } +} + /// \brief Return (and \p Op, \p Mask) for compare instructions or /// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the /// necessary casting or extending for \p Mask when lowering masking intrinsics static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, - SDValue PreservedSrc, - const X86Subtarget *Subtarget, - SelectionDAG &DAG) { - MVT VT = Op.getSimpleValueType(); - MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); - SDValue VMask; - unsigned OpcodeSelect = ISD::VSELECT; - SDLoc dl(Op); + SDValue PreservedSrc, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + MVT VT = Op.getSimpleValueType(); + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); + unsigned OpcodeSelect = ISD::VSELECT; + SDLoc dl(Op); - if (isAllOnesConstant(Mask)) - return Op; + if (isAllOnesConstant(Mask)) + return Op; - if (MaskVT.bitsGT(Mask.getSimpleValueType())) { - MVT newMaskVT = MVT::getIntegerVT(MaskVT.getSizeInBits()); - VMask = DAG.getBitcast(MaskVT, - DAG.getNode(ISD::ANY_EXTEND, dl, newMaskVT, Mask)); - } else { - MVT BitcastVT = MVT::getVectorVT(MVT::i1, - Mask.getSimpleValueType().getSizeInBits()); - // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements - // are extracted by EXTRACT_SUBVECTOR. - VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, - DAG.getBitcast(BitcastVT, Mask), - DAG.getIntPtrConstant(0, dl)); - } + SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); - switch (Op.getOpcode()) { - default: break; - case X86ISD::PCMPEQM: - case X86ISD::PCMPGTM: - case X86ISD::CMPM: - case X86ISD::CMPMU: - return DAG.getNode(ISD::AND, dl, VT, Op, VMask); - case X86ISD::VFPCLASS: + switch (Op.getOpcode()) { + default: break; + case X86ISD::PCMPEQM: + case X86ISD::PCMPGTM: + case X86ISD::CMPM: + case X86ISD::CMPMU: + return DAG.getNode(ISD::AND, dl, VT, Op, VMask); + case X86ISD::VFPCLASS: case X86ISD::VFPCLASSS: - return DAG.getNode(ISD::OR, dl, VT, Op, VMask); - case X86ISD::VTRUNC: - case X86ISD::VTRUNCS: - case X86ISD::VTRUNCUS: - // We can't use ISD::VSELECT here because it is not always "Legal" - // for the destination type. For example vpmovqb require only AVX512 - // and vselect that can operate on byte element type require BWI - OpcodeSelect = X86ISD::SELECT; - break; - } - if (PreservedSrc.getOpcode() == ISD::UNDEF) - PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl); - return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc); + return DAG.getNode(ISD::OR, dl, VT, Op, VMask); + case X86ISD::VTRUNC: + case X86ISD::VTRUNCS: + case X86ISD::VTRUNCUS: + // We can't use ISD::VSELECT here because it is not always "Legal" + // for the destination type. For example vpmovqb require only AVX512 + // and vselect that can operate on byte element type require BWI + OpcodeSelect = X86ISD::SELECT; + break; + } + if (PreservedSrc.getOpcode() == ISD::UNDEF) + PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl); + return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc); } /// \brief Creates an SDNode for a predicated scalar operation. 
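
getMaskNode() centralizes the mask legalization that getVectorMaskingNode used to do inline: the integer mask is any-extended when it is narrower than the i1 vector type, and on 32-bit targets an i64 mask for v64i1 cannot simply be bitcast (i64 is illegal there), so it is split with EXTRACT_ELEMENT into two i32 halves, each bitcast to v32i1 and rejoined with CONCAT_VECTORS. A standalone illustration of that split and rejoin on plain integers; the lane numbering is the part worth checking:

  #include <cassert>
  #include <cstdint>

  int main() {
    uint64_t Mask = 0x00000000FFFF0001ULL;            // one bit per vector lane
    uint32_t Lo = static_cast<uint32_t>(Mask);        // EXTRACT_ELEMENT 0: lanes 0..31
    uint32_t Hi = static_cast<uint32_t>(Mask >> 32);  // EXTRACT_ELEMENT 1: lanes 32..63
    // CONCAT_VECTORS(bitcast<v32i1>(Lo), bitcast<v32i1>(Hi)) puts the low half
    // first, so the reassembled v64i1 mask matches the original i64 value.
    uint64_t Rejoined = (static_cast<uint64_t>(Hi) << 32) | Lo;
    assert(Rejoined == Mask);
    return 0;
  }
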
@@ -16574,15 +16623,22 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget SDValue Mask = Op.getOperand(3); MVT VT = Op.getSimpleValueType(); MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); - MVT BitcastVT = MVT::getVectorVT(MVT::i1, - Mask.getSimpleValueType().getSizeInBits()); - SDLoc dl(Op); - SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, - DAG.getBitcast(BitcastVT, Mask), - DAG.getIntPtrConstant(0, dl)); + SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); return DAG.getNode(IntrData->Opc0, dl, VT, VMask, Op.getOperand(1), Op.getOperand(2)); } + case KUNPCK: { + MVT VT = Op.getSimpleValueType(); + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()/2); + + SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl); + SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl); + // Arguments should be swapped. + SDValue Res = DAG.getNode(IntrData->Opc0, dl, + MVT::getVectorVT(MVT::i1, VT.getSizeInBits()), + Src2, Src1); + return DAG.getBitcast(VT, Res); + } default: break; } @@ -19649,6 +19705,47 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget, return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal); } +/// Widen a vector input to a vector of NVT. The +/// input vector must have the same element type as NVT. +static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG, + bool FillWithZeroes = false) { + // Check if InOp already has the right width. + MVT InVT = InOp.getSimpleValueType(); + if (InVT == NVT) + return InOp; + + if (InOp.isUndef()) + return DAG.getUNDEF(NVT); + + assert(InVT.getVectorElementType() == NVT.getVectorElementType() && + "input and widen element type must match"); + + unsigned InNumElts = InVT.getVectorNumElements(); + unsigned WidenNumElts = NVT.getVectorNumElements(); + assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 && + "Unexpected request for vector widening"); + + EVT EltVT = NVT.getVectorElementType(); + + SDLoc dl(InOp); + if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) || + ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) { + SmallVector Ops; + for (unsigned i = 0; i < InNumElts; ++i) + Ops.push_back(InOp.getOperand(i)); + + SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) : + DAG.getUNDEF(EltVT); + for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i) + Ops.push_back(FillVal); + return DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, Ops); + } + SDValue FillVal = FillWithZeroes ? 
DAG.getConstant(0, dl, NVT) : + DAG.getUNDEF(NVT); + return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal, + InOp, DAG.getIntPtrConstant(0, dl)); +} + static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { assert(Subtarget->hasAVX512() && @@ -19678,6 +19775,62 @@ static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget *Subtarget, return Op; } +static SDValue LowerMLOAD(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + + MaskedLoadSDNode *N = cast(Op.getNode()); + MVT VT = Op.getSimpleValueType(); + SDValue Mask = N->getMask(); + SDLoc dl(Op); + + if (Subtarget->hasAVX512() && !Subtarget->hasVLX() && + !VT.is512BitVector() && Mask.getValueType() == MVT::v8i1) { + // This operation is legal for targets with VLX, but without + // VLX the vector should be widened to 512 bit + unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits(); + MVT WideDataVT = MVT::getVectorVT(VT.getScalarType(), NumEltsInWideVec); + MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec); + SDValue Src0 = N->getSrc0(); + Src0 = ExtendToType(Src0, WideDataVT, DAG); + Mask = ExtendToType(Mask, WideMaskVT, DAG, true); + SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(), + N->getBasePtr(), Mask, Src0, + N->getMemoryVT(), N->getMemOperand(), + N->getExtensionType()); + + SDValue Exract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, + NewLoad.getValue(0), + DAG.getIntPtrConstant(0, dl)); + SDValue RetOps[] = {Exract, NewLoad.getValue(1)}; + return DAG.getMergeValues(RetOps, dl); + } + return Op; +} + +static SDValue LowerMSTORE(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + MaskedStoreSDNode *N = cast(Op.getNode()); + SDValue DataToStore = N->getValue(); + MVT VT = DataToStore.getSimpleValueType(); + SDValue Mask = N->getMask(); + SDLoc dl(Op); + + if (Subtarget->hasAVX512() && !Subtarget->hasVLX() && + !VT.is512BitVector() && Mask.getValueType() == MVT::v8i1) { + // This operation is legal for targets with VLX, but without + // VLX the vector should be widened to 512 bit + unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits(); + MVT WideDataVT = MVT::getVectorVT(VT.getScalarType(), NumEltsInWideVec); + MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec); + DataToStore = ExtendToType(DataToStore, WideDataVT, DAG); + Mask = ExtendToType(Mask, WideMaskVT, DAG, true); + return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(), + Mask, N->getMemoryVT(), N->getMemOperand(), + N->isTruncatingStore()); + } + return Op; +} + static SDValue LowerMGATHER(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { assert(Subtarget->hasAVX512() && @@ -19795,6 +19948,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::UMAX: case ISD::UMIN: return LowerMINMAX(Op, DAG); case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG); + case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG); + case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG); case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG); case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG); } @@ -19938,6 +20093,11 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results); } } + case ISD::INTRINSIC_WO_CHAIN: { + if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), Subtarget, DAG)) + Results.push_back(V); + return; + } case ISD::READCYCLECOUNTER: { return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget, 
Results); @@ -26357,6 +26517,31 @@ static SDValue PerformFNEGCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } +static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + EVT VT = N->getValueType(0); + if (VT.is512BitVector() && !Subtarget->hasDQI()) { + // VXORPS, VORPS, VANDPS, VANDNPS are supported only under DQ extention. + // These logic operations may be executed in the integer domain. + SDLoc dl(N); + MVT IntScalar = MVT::getIntegerVT(VT.getScalarSizeInBits()); + MVT IntVT = MVT::getVectorVT(IntScalar, VT.getVectorNumElements()); + + SDValue Op0 = DAG.getNode(ISD::BITCAST, dl, IntVT, N->getOperand(0)); + SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, IntVT, N->getOperand(1)); + unsigned IntOpcode = 0; + switch (N->getOpcode()) { + default: llvm_unreachable("Unexpected FP logic op"); + case X86ISD::FOR: IntOpcode = ISD::OR; break; + case X86ISD::FXOR: IntOpcode = ISD::XOR; break; + case X86ISD::FAND: IntOpcode = ISD::AND; break; + case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break; + } + SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1); + return DAG.getNode(ISD::BITCAST, dl, VT, IntOp); + } + return SDValue(); +} /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes. static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { @@ -26372,19 +26557,7 @@ static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG, if (C->getValueAPF().isPosZero()) return N->getOperand(0); - EVT VT = N->getValueType(0); - if (VT.is512BitVector() && !Subtarget->hasDQI()) { - SDLoc dl(N); - MVT IntScalar = MVT::getIntegerVT(VT.getScalarSizeInBits()); - MVT IntVT = MVT::getVectorVT(IntScalar, VT.getVectorNumElements()); - - SDValue Op0 = DAG.getNode(ISD::BITCAST, dl, IntVT, N->getOperand(0)); - SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, IntVT, N->getOperand(1)); - unsigned IntOpcode = (N->getOpcode() == X86ISD::FOR) ? ISD::OR : ISD::XOR; - SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1); - return DAG.getNode(ISD::BITCAST, dl, VT, IntOp); - } - return SDValue(); + return lowerX86FPLogicOp(N, DAG, Subtarget); } /// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes. @@ -26409,7 +26582,8 @@ static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) { } /// Do target-specific dag combines on X86ISD::FAND nodes. 
-static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) { +static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { // FAND(0.0, x) -> 0.0 if (ConstantFPSDNode *C = dyn_cast(N->getOperand(0))) if (C->getValueAPF().isPosZero()) @@ -26420,11 +26594,12 @@ static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) { if (C->getValueAPF().isPosZero()) return N->getOperand(1); - return SDValue(); + return lowerX86FPLogicOp(N, DAG, Subtarget); } /// Do target-specific dag combines on X86ISD::FANDN nodes -static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG) { +static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { // FANDN(0.0, x) -> x if (ConstantFPSDNode *C = dyn_cast(N->getOperand(0))) if (C->getValueAPF().isPosZero()) @@ -26435,7 +26610,7 @@ static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG) { if (C->getValueAPF().isPosZero()) return N->getOperand(1); - return SDValue(); + return lowerX86FPLogicOp(N, DAG, Subtarget); } static SDValue PerformBTCombine(SDNode *N, @@ -27251,8 +27426,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::FOR: return PerformFORCombine(N, DAG, Subtarget); case X86ISD::FMIN: case X86ISD::FMAX: return PerformFMinFMaxCombine(N, DAG); - case X86ISD::FAND: return PerformFANDCombine(N, DAG); - case X86ISD::FANDN: return PerformFANDNCombine(N, DAG); + case X86ISD::FAND: return PerformFANDCombine(N, DAG, Subtarget); + case X86ISD::FANDN: return PerformFANDNCombine(N, DAG, Subtarget); case X86ISD::BT: return PerformBTCombine(N, DAG, DCI); case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG); case ISD::ANY_EXTEND: diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index d15d0dc96e6f..58206c6acaa6 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -2396,16 +2396,6 @@ defm KUNPCKBW : avx512_mask_unpck<"bw", VK16, v16i1, VK8, HasAVX512>, PD; defm KUNPCKWD : avx512_mask_unpck<"wd", VK32, v32i1, VK16, HasBWI>, PS; defm KUNPCKDQ : avx512_mask_unpck<"dq", VK64, v64i1, VK32, HasBWI>, PS, VEX_W; -multiclass avx512_mask_unpck_int { - let Predicates = [HasAVX512] in - def : Pat<(!cast("int_x86_avx512_"##IntName##"_bw") - (i16 GR16:$src1), (i16 GR16:$src2)), - (COPY_TO_REGCLASS (!cast(InstName##"BWrr") - (v16i1 (COPY_TO_REGCLASS GR16:$src1, VK16)), - (v16i1 (COPY_TO_REGCLASS GR16:$src2, VK16))), GR16)>; -} -defm : avx512_mask_unpck_int<"kunpck", "KUNPCK">; - // Mask bit testing multiclass avx512_mask_testop opc, string OpcodeStr, RegisterClass KRC, SDNode OpNode, Predicate prd> { @@ -2496,6 +2486,9 @@ def : Pat<(v16i1 (insert_subvector undef, (v8i1 VK8:$src), (iPTR 0))), def : Pat<(v8i1 (extract_subvector (v16i1 VK16:$src), (iPTR 8))), (v8i1 (COPY_TO_REGCLASS (KSHIFTRWri VK16:$src, (i8 8)), VK8))>; +def : Pat<(v16i1 (extract_subvector (v32i1 VK32:$src), (iPTR 0))), + (v16i1 (COPY_TO_REGCLASS VK32:$src, VK16))>; + def : Pat<(v32i1 (extract_subvector (v64i1 VK64:$src), (iPTR 0))), (v32i1 (COPY_TO_REGCLASS VK64:$src, VK32))>; @@ -2773,22 +2766,6 @@ def: Pat<(int_x86_avx512_mask_store_pd_512 addr:$ptr, (v8f64 VR512:$src), (VMOVAPDZmrk addr:$ptr, (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), VR512:$src)>; -let Predicates = [HasAVX512, NoVLX] in { -def: Pat<(X86mstore addr:$ptr, VK8WM:$mask, (v8f32 VR256:$src)), - (VMOVUPSZmrk addr:$ptr, - (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), - (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256:$src, sub_ymm))>; - -def: Pat<(v8f32 
(masked_load addr:$ptr, VK8WM:$mask, undef)), - (v8f32 (EXTRACT_SUBREG (v16f32 (VMOVUPSZrmkz - (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), addr:$ptr)), sub_ymm))>; - -def: Pat<(v8f32 (masked_load addr:$ptr, VK8WM:$mask, (v8f32 VR256:$src0))), - (v8f32 (EXTRACT_SUBREG (v16f32 (VMOVUPSZrmk - (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256:$src0, sub_ymm), - (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), addr:$ptr)), sub_ymm))>; -} - defm VMOVDQA32 : avx512_alignedload_vl<0x6F, "vmovdqa32", avx512vl_i32_info, HasAVX512>, avx512_alignedstore_vl<0x7F, "vmovdqa32", avx512vl_i32_info, @@ -2850,17 +2827,6 @@ def : Pat<(v16i32 (vselect VK16WM:$mask, (v16i32 immAllZerosV), (v16i32 VR512:$src))), (VMOVDQU32Zrrkz (KNOTWrr VK16WM:$mask), VR512:$src)>; } -// NoVLX patterns -let Predicates = [HasAVX512, NoVLX] in { -def: Pat<(X86mstore addr:$ptr, VK8WM:$mask, (v8i32 VR256:$src)), - (VMOVDQU32Zmrk addr:$ptr, - (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), - (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256:$src, sub_ymm))>; - -def: Pat<(v8i32 (masked_load addr:$ptr, VK8WM:$mask, undef)), - (v8i32 (EXTRACT_SUBREG (v16i32 (VMOVDQU32Zrmkz - (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), addr:$ptr)), sub_ymm))>; -} // Move Int Doubleword to Packed Double Int // @@ -2973,53 +2939,60 @@ def VMOVQI2PQIZrm : AVX512BI<0x6E, MRMSrcMem, (outs VR128X:$dst), // AVX-512 MOVSS, MOVSD //===----------------------------------------------------------------------===// -multiclass avx512_move_scalar { - let hasSideEffects = 0 in { - def rr : SI<0x10, MRMSrcReg, (outs VR128X:$dst), (ins VR128X:$src1, RC:$src2), - !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR128X:$dst, (vt (OpNode VR128X:$src1, - (scalar_to_vector RC:$src2))))], - IIC_SSE_MOV_S_RR>, EVEX_4V, VEX_LIG; - let Constraints = "$src1 = $dst" in - def rrk : SI<0x10, MRMSrcReg, (outs VR128X:$dst), - (ins VR128X:$src1, VK1WM:$mask, RC:$src2, RC:$src3), - !strconcat(asm, - "\t{$src3, $src2, $dst {${mask}}|$dst {${mask}}, $src2, $src3}"), - [], IIC_SSE_MOV_S_RR>, EVEX_4V, VEX_LIG, EVEX_K; - def rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), - !strconcat(asm, "\t{$src, $dst|$dst, $src}"), - [(set RC:$dst, (mem_pat addr:$src))], IIC_SSE_MOV_S_RM>, - EVEX, VEX_LIG; +multiclass avx512_move_scalar { + defm rr_Int : AVX512_maskable_scalar<0x10, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), + asm, "$src2, $src1","$src1, $src2", + (_.VT (OpNode (_.VT _.RC:$src1), + (_.VT _.RC:$src2))), + IIC_SSE_MOV_S_RR>, EVEX_4V; + let Constraints = "$src1 = $dst" , mayLoad = 1 in + defm rm_Int : AVX512_maskable_3src_scalar<0x10, MRMSrcMem, _, + (outs _.RC:$dst), + (ins _.ScalarMemOp:$src), + asm,"$src","$src", + (_.VT (OpNode (_.VT _.RC:$src1), + (_.VT (scalar_to_vector + (_.ScalarLdFrag addr:$src)))))>, EVEX; + let isCodeGenOnly = 1 in { + def rr : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst), + (ins _.RC:$src1, _.FRC:$src2), + !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _.RC:$dst, (_.VT (OpNode _.RC:$src1, + (scalar_to_vector _.FRC:$src2))))], + _.ExeDomain,IIC_SSE_MOV_S_RR>, EVEX_4V; + let mayLoad = 1 in + def rm : AVX512PI<0x10, MRMSrcMem, (outs _.FRC:$dst), (ins _.ScalarMemOp:$src), + !strconcat(asm, "\t{$src, $dst|$dst, $src}"), + [(set _.FRC:$dst, (_.ScalarLdFrag addr:$src))], + _.ExeDomain, IIC_SSE_MOV_S_RM>, EVEX; + } let mayStore = 1 in { - def mr: SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src), - !strconcat(asm, "\t{$src, $dst|$dst, $src}"), - [(store RC:$src, addr:$dst)], 
IIC_SSE_MOV_S_MR>, - EVEX, VEX_LIG; - def mrk: SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, VK1WM:$mask, RC:$src), - !strconcat(asm, "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}"), - [], IIC_SSE_MOV_S_MR>, - EVEX, VEX_LIG, EVEX_K; + def mr: AVX512PI<0x11, MRMDestMem, (outs), (ins _.ScalarMemOp:$dst, _.FRC:$src), + !strconcat(asm, "\t{$src, $dst|$dst, $src}"), + [(store _.FRC:$src, addr:$dst)], _.ExeDomain, IIC_SSE_MOV_S_MR>, + EVEX; + def mrk: AVX512PI<0x11, MRMDestMem, (outs), + (ins _.ScalarMemOp:$dst, VK1WM:$mask, _.FRC:$src), + !strconcat(asm, "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}"), + [], _.ExeDomain, IIC_SSE_MOV_S_MR>, EVEX, EVEX_K; } // mayStore - } //hasSideEffects = 0 } -let ExeDomain = SSEPackedSingle in -defm VMOVSSZ : avx512_move_scalar<"movss", FR32X, X86Movss, v4f32, f32mem, - loadf32>, XS, EVEX_CD8<32, CD8VT1>; +defm VMOVSSZ : avx512_move_scalar<"vmovss", X86Movss, f32x_info>, + VEX_LIG, XS, EVEX_CD8<32, CD8VT1>; -let ExeDomain = SSEPackedDouble in -defm VMOVSDZ : avx512_move_scalar<"movsd", FR64X, X86Movsd, v2f64, f64mem, - loadf64>, XD, VEX_W, EVEX_CD8<64, CD8VT1>; +defm VMOVSDZ : avx512_move_scalar<"vmovsd", X86Movsd, f64x_info>, + VEX_LIG, XD, VEX_W, EVEX_CD8<64, CD8VT1>; def : Pat<(f32 (X86select VK1WM:$mask, (f32 FR32X:$src1), (f32 FR32X:$src2))), - (COPY_TO_REGCLASS (VMOVSSZrrk (COPY_TO_REGCLASS FR32X:$src2, VR128X), - VK1WM:$mask, (f32 (IMPLICIT_DEF)), FR32X:$src1), FR32X)>; + (COPY_TO_REGCLASS (VMOVSSZrr_Intk (COPY_TO_REGCLASS FR32X:$src2, VR128X), + VK1WM:$mask, (v4f32 (IMPLICIT_DEF)),(COPY_TO_REGCLASS FR32X:$src1, VR128X)), FR32X)>; def : Pat<(f64 (X86select VK1WM:$mask, (f64 FR64X:$src1), (f64 FR64X:$src2))), - (COPY_TO_REGCLASS (VMOVSDZrrk (COPY_TO_REGCLASS FR64X:$src2, VR128X), - VK1WM:$mask, (f64 (IMPLICIT_DEF)), FR64X:$src1), FR64X)>; + (COPY_TO_REGCLASS (VMOVSDZrr_Intk (COPY_TO_REGCLASS FR64X:$src2, VR128X), + VK1WM:$mask, (v2f64 (IMPLICIT_DEF)), (COPY_TO_REGCLASS FR64X:$src1, VR128X)), FR64X)>; def : Pat<(int_x86_avx512_mask_store_ss addr:$dst, VR128X:$src, GR8:$mask), (VMOVSSZmrk addr:$dst, (i1 (COPY_TO_REGCLASS GR8:$mask, VK1WM)), diff --git a/lib/Target/X86/X86InstrFMA.td b/lib/Target/X86/X86InstrFMA.td index 0467a64d7e51..fd800cf077f7 100644 --- a/lib/Target/X86/X86InstrFMA.td +++ b/lib/Target/X86/X86InstrFMA.td @@ -223,9 +223,12 @@ multiclass fma3s_int_forms opc132, bits<8> opc213, bits<8> opc231, multiclass fma3s opc132, bits<8> opc213, bits<8> opc231, string OpStr, Intrinsic IntF32, Intrinsic IntF64, SDNode OpNode> { + let ExeDomain = SSEPackedSingle in defm SS : fma3s_forms, fma3s_int_forms; + + let ExeDomain = SSEPackedDouble in defm SD : fma3s_forms, fma3s_int_forms, @@ -374,36 +377,23 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { } // isCodeGenOnly = 1 } -defm VFMADDSS4 : fma4s<0x6A, "vfmaddss", FR32, f32mem, f32, X86Fmadd, loadf32>, - fma4s_int<0x6A, "vfmaddss", ssmem, sse_load_f32, - int_x86_fma_vfmadd_ss>; -defm VFMADDSD4 : fma4s<0x6B, "vfmaddsd", FR64, f64mem, f64, X86Fmadd, loadf64>, - fma4s_int<0x6B, "vfmaddsd", sdmem, sse_load_f64, - int_x86_fma_vfmadd_sd>; -defm VFMSUBSS4 : fma4s<0x6E, "vfmsubss", FR32, f32mem, f32, X86Fmsub, loadf32>, - fma4s_int<0x6E, "vfmsubss", ssmem, sse_load_f32, - int_x86_fma_vfmsub_ss>; -defm VFMSUBSD4 : fma4s<0x6F, "vfmsubsd", FR64, f64mem, f64, X86Fmsub, loadf64>, - fma4s_int<0x6F, "vfmsubsd", sdmem, sse_load_f64, - int_x86_fma_vfmsub_sd>; -defm VFNMADDSS4 : fma4s<0x7A, "vfnmaddss", FR32, f32mem, f32, - X86Fnmadd, loadf32>, - fma4s_int<0x7A, "vfnmaddss", ssmem, 
sse_load_f32, - int_x86_fma_vfnmadd_ss>; -defm VFNMADDSD4 : fma4s<0x7B, "vfnmaddsd", FR64, f64mem, f64, - X86Fnmadd, loadf64>, - fma4s_int<0x7B, "vfnmaddsd", sdmem, sse_load_f64, - int_x86_fma_vfnmadd_sd>; -defm VFNMSUBSS4 : fma4s<0x7E, "vfnmsubss", FR32, f32mem, f32, - X86Fnmsub, loadf32>, - fma4s_int<0x7E, "vfnmsubss", ssmem, sse_load_f32, - int_x86_fma_vfnmsub_ss>; -defm VFNMSUBSD4 : fma4s<0x7F, "vfnmsubsd", FR64, f64mem, f64, - X86Fnmsub, loadf64>, - fma4s_int<0x7F, "vfnmsubsd", sdmem, sse_load_f64, - int_x86_fma_vfnmsub_sd>; - let ExeDomain = SSEPackedSingle in { + // Scalar Instructions + defm VFMADDSS4 : fma4s<0x6A, "vfmaddss", FR32, f32mem, f32, X86Fmadd, loadf32>, + fma4s_int<0x6A, "vfmaddss", ssmem, sse_load_f32, + int_x86_fma_vfmadd_ss>; + defm VFMSUBSS4 : fma4s<0x6E, "vfmsubss", FR32, f32mem, f32, X86Fmsub, loadf32>, + fma4s_int<0x6E, "vfmsubss", ssmem, sse_load_f32, + int_x86_fma_vfmsub_ss>; + defm VFNMADDSS4 : fma4s<0x7A, "vfnmaddss", FR32, f32mem, f32, + X86Fnmadd, loadf32>, + fma4s_int<0x7A, "vfnmaddss", ssmem, sse_load_f32, + int_x86_fma_vfnmadd_ss>; + defm VFNMSUBSS4 : fma4s<0x7E, "vfnmsubss", FR32, f32mem, f32, + X86Fnmsub, loadf32>, + fma4s_int<0x7E, "vfnmsubss", ssmem, sse_load_f32, + int_x86_fma_vfnmsub_ss>; + // Packed Instructions defm VFMADDPS4 : fma4p<0x68, "vfmaddps", X86Fmadd, v4f32, v8f32, loadv4f32, loadv8f32>; defm VFMSUBPS4 : fma4p<0x6C, "vfmsubps", X86Fmsub, v4f32, v8f32, @@ -419,6 +409,22 @@ let ExeDomain = SSEPackedSingle in { } let ExeDomain = SSEPackedDouble in { + // Scalar Instructions + defm VFMADDSD4 : fma4s<0x6B, "vfmaddsd", FR64, f64mem, f64, X86Fmadd, loadf64>, + fma4s_int<0x6B, "vfmaddsd", sdmem, sse_load_f64, + int_x86_fma_vfmadd_sd>; + defm VFMSUBSD4 : fma4s<0x6F, "vfmsubsd", FR64, f64mem, f64, X86Fmsub, loadf64>, + fma4s_int<0x6F, "vfmsubsd", sdmem, sse_load_f64, + int_x86_fma_vfmsub_sd>; + defm VFNMADDSD4 : fma4s<0x7B, "vfnmaddsd", FR64, f64mem, f64, + X86Fnmadd, loadf64>, + fma4s_int<0x7B, "vfnmaddsd", sdmem, sse_load_f64, + int_x86_fma_vfnmadd_sd>; + defm VFNMSUBSD4 : fma4s<0x7F, "vfnmsubsd", FR64, f64mem, f64, + X86Fnmsub, loadf64>, + fma4s_int<0x7F, "vfnmsubsd", sdmem, sse_load_f64, + int_x86_fma_vfnmsub_sd>; + // Packed Instructions defm VFMADDPD4 : fma4p<0x69, "vfmaddpd", X86Fmadd, v2f64, v4f64, loadv2f64, loadv4f64>; defm VFMSUBPD4 : fma4p<0x6D, "vfmsubpd", X86Fmsub, v2f64, v4f64, diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index 12da3a9319e6..7812e4490e7d 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -1650,6 +1650,12 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::PEXT32rr, X86::PEXT32rm, 0 }, { X86::PEXT64rr, X86::PEXT64rm, 0 }, + // ADX foldable instructions + { X86::ADCX32rr, X86::ADCX32rm, 0 }, + { X86::ADCX64rr, X86::ADCX64rm, 0 }, + { X86::ADOX32rr, X86::ADOX32rm, 0 }, + { X86::ADOX64rr, X86::ADOX64rm, 0 }, + // AVX-512 foldable instructions { X86::VADDPSZrr, X86::VADDPSZrm, 0 }, { X86::VADDPDZrr, X86::VADDPDZrm, 0 }, @@ -4385,7 +4391,33 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB, int Reg = FromEFLAGS ? DestReg : SrcReg; bool is32 = X86::GR32RegClass.contains(Reg); bool is64 = X86::GR64RegClass.contains(Reg); + if ((FromEFLAGS || ToEFLAGS) && (is32 || is64)) { + int Mov = is64 ? X86::MOV64rr : X86::MOV32rr; + int Push = is64 ? X86::PUSH64r : X86::PUSH32r; + int PushF = is64 ? X86::PUSHF64 : X86::PUSHF32; + int Pop = is64 ? X86::POP64r : X86::POP32r; + int PopF = is64 ? X86::POPF64 : X86::POPF32; + int AX = is64 ? 
X86::RAX : X86::EAX; + + if (!Subtarget.hasLAHFSAHF()) { + assert(Subtarget.is64Bit() && + "Not having LAHF/SAHF only happens on 64-bit."); + // Moving EFLAGS to / from another register requires a push and a pop. + // Notice that we have to adjust the stack if we don't want to clobber the + // first frame index. See X86FrameLowering.cpp - clobbersTheStack. + if (FromEFLAGS) { + BuildMI(MBB, MI, DL, get(PushF)); + BuildMI(MBB, MI, DL, get(Pop), DestReg); + } + if (ToEFLAGS) { + BuildMI(MBB, MI, DL, get(Push)) + .addReg(SrcReg, getKillRegState(KillSrc)); + BuildMI(MBB, MI, DL, get(PopF)); + } + return; + } + // The flags need to be saved, but saving EFLAGS with PUSHF/POPF is // inefficient. Instead: // - Save the overflow flag OF into AL using SETO, and restore it using a @@ -4407,14 +4439,20 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB, // Notice that we have to adjust the stack if we don't want to clobber the // first frame index. See X86FrameLowering.cpp - clobbersTheStack. - int Mov = is64 ? X86::MOV64rr : X86::MOV32rr; - int Push = is64 ? X86::PUSH64r : X86::PUSH32r; - int Pop = is64 ? X86::POP64r : X86::POP32r; - int AX = is64 ? X86::RAX : X86::EAX; - bool AXDead = (Reg == AX) || - (MachineBasicBlock::LQR_Dead == - MBB.computeRegisterLiveness(&getRegisterInfo(), AX, MI)); + bool AXDead = (Reg == AX); + // FIXME: The above could figure out that AX is dead in more cases with: + // || (MachineBasicBlock::LQR_Dead == + // MBB.computeRegisterLiveness(&getRegisterInfo(), AX, MI)); + // + // Unfortunately this is slightly broken, see PR24535 and the likely + // related PR25033 PR24991 PR24992 PR25201. These issues seem to + // showcase sub-register / super-register confusion: a previous kill + // of AH but no kill of AL leads computeRegisterLiveness to + // erroneously conclude that AX is dead. + // + // Once fixed, also update cmpxchg-clobber-flags.ll and + // peephole-na-phys-copy-folding.ll. 
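  // For illustration: with LAHF/SAHF available the copy avoids the stack
  // entirely and is lowered roughly as
  //   EFLAGS -> %reg :  seto %al ; lahf ; mov %eax/%rax, %reg
  //   %reg -> EFLAGS :  mov %reg, %eax/%rax ; addb $127, %al ; sahf
  // with AX saved and restored around the sequence whenever it is still live
  // (the AXDead check below). The PUSHF/POPF fallback earlier in this
  // function is only taken on 64-bit subtargets that lack LAHF/SAHF.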
if (!AXDead) BuildMI(MBB, MI, DL, get(Push)).addReg(AX, getKillRegState(true)); diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index 0571b07d2f8b..b412f8fb3ecb 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -770,6 +770,7 @@ def HasVLX : Predicate<"Subtarget->hasVLX()">, AssemblerPredicate<"FeatureVLX", "AVX-512 VL ISA">; def NoVLX : Predicate<"!Subtarget->hasVLX()">; def NoVLX_Or_NoBWI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasBWI()">; +def NoVLX_Or_NoDQI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasDQI()">; def HasPOPCNT : Predicate<"Subtarget->hasPOPCNT()">; def HasAES : Predicate<"Subtarget->hasAES()">; @@ -799,6 +800,7 @@ def HasSHA : Predicate<"Subtarget->hasSHA()">; def HasPRFCHW : Predicate<"Subtarget->hasPRFCHW()">; def HasRDSEED : Predicate<"Subtarget->hasRDSEED()">; def HasPrefetchW : Predicate<"Subtarget->hasPRFCHW()">; +def HasLAHFSAHF : Predicate<"Subtarget->hasLAHFSAHF()">; def FPStackf32 : Predicate<"!Subtarget->hasSSE1()">; def FPStackf64 : Predicate<"!Subtarget->hasSSE2()">; def HasMPX : Predicate<"Subtarget->hasMPX()">; @@ -1502,10 +1504,12 @@ def MOV8rm_NOREX : I<0x8A, MRMSrcMem, let SchedRW = [WriteALU] in { let Defs = [EFLAGS], Uses = [AH] in def SAHF : I<0x9E, RawFrm, (outs), (ins), "sahf", - [(set EFLAGS, (X86sahf AH))], IIC_AHF>; + [(set EFLAGS, (X86sahf AH))], IIC_AHF>, + Requires<[HasLAHFSAHF]>; let Defs = [AH], Uses = [EFLAGS], hasSideEffects = 0 in def LAHF : I<0x9F, RawFrm, (outs), (ins), "lahf", [], - IIC_AHF>; // AH = flags + IIC_AHF>, // AH = flags + Requires<[HasLAHFSAHF]>; } // SchedRW //===----------------------------------------------------------------------===// @@ -2569,8 +2573,8 @@ def : MnemonicAlias<"lret", "lretl", "att">, Requires<[Not16BitMode]>; def : MnemonicAlias<"leavel", "leave", "att">, Requires<[Not64BitMode]>; def : MnemonicAlias<"leaveq", "leave", "att">, Requires<[In64BitMode]>; -def : MnemonicAlias<"loopz", "loope", "att">; -def : MnemonicAlias<"loopnz", "loopne", "att">; +def : MnemonicAlias<"loopz", "loope">; +def : MnemonicAlias<"loopnz", "loopne">; def : MnemonicAlias<"pop", "popw", "att">, Requires<[In16BitMode]>; def : MnemonicAlias<"pop", "popl", "att">, Requires<[In32BitMode]>; @@ -2651,14 +2655,14 @@ def : MnemonicAlias<"fcmova", "fcmovnbe", "att">; def : MnemonicAlias<"fcmovnae", "fcmovb", "att">; def : MnemonicAlias<"fcmovna", "fcmovbe", "att">; def : MnemonicAlias<"fcmovae", "fcmovnb", "att">; -def : MnemonicAlias<"fcomip", "fcompi", "att">; +def : MnemonicAlias<"fcomip", "fcompi">; def : MnemonicAlias<"fildq", "fildll", "att">; def : MnemonicAlias<"fistpq", "fistpll", "att">; def : MnemonicAlias<"fisttpq", "fisttpll", "att">; def : MnemonicAlias<"fldcww", "fldcw", "att">; def : MnemonicAlias<"fnstcww", "fnstcw", "att">; def : MnemonicAlias<"fnstsww", "fnstsw", "att">; -def : MnemonicAlias<"fucomip", "fucompi", "att">; +def : MnemonicAlias<"fucomip", "fucompi">; def : MnemonicAlias<"fwait", "wait">; def : MnemonicAlias<"fxsaveq", "fxsave64", "att">; diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index a93240bd717c..a545335dd5dd 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -2906,7 +2906,7 @@ let isCodeGenOnly = 1 in { // Multiclass for vectors using the X86 logical operation aliases for FP. 
multiclass sse12_fp_packed_vector_logical_alias< bits<8> opc, string OpcodeStr, SDNode OpNode, OpndItins itins> { - let Predicates = [HasAVX, NoVLX] in { + let Predicates = [HasAVX, NoVLX_Or_NoDQI] in { defm V#NAME#PS : sse12_fp_packed, PS, VEX_4V; diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h index bb2f7248b0e9..8f8a100cea04 100644 --- a/lib/Target/X86/X86IntrinsicsInfo.h +++ b/lib/Target/X86/X86IntrinsicsInfo.h @@ -30,7 +30,7 @@ enum IntrinsicType { COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM, TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32, EXPAND_FROM_MEM, BLEND, INSERT_SUBVEC, - TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM + TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, KUNPCK }; struct IntrinsicData { @@ -143,7 +143,7 @@ static const IntrinsicData IntrinsicsWithChain[] = { EXPAND_FROM_MEM, X86ISD::EXPAND, 0), X86_INTRINSIC_DATA(avx512_mask_expand_load_q_512, EXPAND_FROM_MEM, X86ISD::EXPAND, 0), - X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_128, TRUNCATE_TO_MEM_VI8, + X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_128, TRUNCATE_TO_MEM_VI8, X86ISD::VTRUNC, 0), X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_256, TRUNCATE_TO_MEM_VI8, X86ISD::VTRUNC, 0), @@ -341,7 +341,9 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_cvtusi642ss, INTR_TYPE_3OP, X86ISD::UINT_TO_FP_RND, 0), X86_INTRINSIC_DATA(avx512_exp2_pd, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0), X86_INTRINSIC_DATA(avx512_exp2_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0), - + X86_INTRINSIC_DATA(avx512_kunpck_bw, KUNPCK, ISD::CONCAT_VECTORS, 0), + X86_INTRINSIC_DATA(avx512_kunpck_dq, KUNPCK, ISD::CONCAT_VECTORS, 0), + X86_INTRINSIC_DATA(avx512_kunpck_wd, KUNPCK, ISD::CONCAT_VECTORS, 0), X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_128, FMA_OP_MASK3, X86ISD::FMADD, 0), X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_256, FMA_OP_MASK3, X86ISD::FMADD, 0), X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_512, FMA_OP_MASK3, X86ISD::FMADD, @@ -807,6 +809,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::MOVDDUP, 0), X86_INTRINSIC_DATA(avx512_mask_movddup_512, INTR_TYPE_1OP_MASK, X86ISD::MOVDDUP, 0), + X86_INTRINSIC_DATA(avx512_mask_move_sd, INTR_TYPE_SCALAR_MASK, + X86ISD::MOVSD, 0), + X86_INTRINSIC_DATA(avx512_mask_move_ss, INTR_TYPE_SCALAR_MASK, + X86ISD::MOVSS, 0), X86_INTRINSIC_DATA(avx512_mask_movshdup_128, INTR_TYPE_1OP_MASK, X86ISD::MOVSHDUP, 0), X86_INTRINSIC_DATA(avx512_mask_movshdup_256, INTR_TYPE_1OP_MASK, @@ -1823,7 +1829,7 @@ static void verifyIntrinsicTables() { "Intrinsic data tables should have unique entries"); } -// X86 specific compare constants. +// X86 specific compare constants. // They must be kept in synch with avxintrin.h #define _X86_CMP_EQ_OQ 0x00 /* Equal (ordered, non-signaling) */ #define _X86_CMP_LT_OS 0x01 /* Less-than (ordered, signaling) */ diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp index a481e349061e..a50b44950ff1 100644 --- a/lib/Target/X86/X86MCInstLower.cpp +++ b/lib/Target/X86/X86MCInstLower.cpp @@ -1145,8 +1145,10 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { const X86FrameLowering* FrameLowering = MF->getSubtarget().getFrameLowering(); bool hasFP = FrameLowering->hasFP(*MF); - - bool NeedsDwarfCFI = MMI->usePreciseUnwindInfo(); + + // TODO: This is needed only if we require precise CFA. 
+ bool NeedsDwarfCFI = + (MMI->hasDebugInfo() || MF->getFunction()->needsUnwindTableEntry()); int stackGrowth = -RI->getSlotSize(); if (NeedsDwarfCFI && !hasFP) { @@ -1375,7 +1377,19 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { if (isa(COp)) { CS << "u"; } else if (auto *CI = dyn_cast(COp)) { - CS << CI->getZExtValue(); + if (CI->getBitWidth() <= 64) { + CS << CI->getZExtValue(); + } else { + // print multi-word constant as (w0,w1) + auto Val = CI->getValue(); + CS << "("; + for (int i = 0, N = Val.getNumWords(); i < N; ++i) { + if (i > 0) + CS << ","; + CS << Val.getRawData()[i]; + } + CS << ")"; + } } else if (auto *CF = dyn_cast(COp)) { SmallString<32> Str; CF->getValueAPF().toString(Str); diff --git a/lib/Target/X86/X86OptimizeLEAs.cpp b/lib/Target/X86/X86OptimizeLEAs.cpp new file mode 100644 index 000000000000..da83c8ad8248 --- /dev/null +++ b/lib/Target/X86/X86OptimizeLEAs.cpp @@ -0,0 +1,322 @@ +//===-- X86OptimizeLEAs.cpp - optimize usage of LEA instructions ----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the pass that performs some optimizations with LEA +// instructions in order to improve code size. +// Currently, it does one thing: +// 1) Address calculations in load and store instructions are replaced by +// existing LEA def registers where possible. +// +//===----------------------------------------------------------------------===// + +#include "X86.h" +#include "X86InstrInfo.h" +#include "X86Subtarget.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/LiveVariables.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/Function.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "x86-optimize-LEAs" + +STATISTIC(NumSubstLEAs, "Number of LEA instruction substitutions"); + +namespace { +class OptimizeLEAPass : public MachineFunctionPass { +public: + OptimizeLEAPass() : MachineFunctionPass(ID) {} + + const char *getPassName() const override { return "X86 LEA Optimize"; } + + /// \brief Loop over all of the basic blocks, replacing address + /// calculations in load and store instructions, if it's already + /// been calculated by LEA. Also, remove redundant LEAs. + bool runOnMachineFunction(MachineFunction &MF) override; + +private: + /// \brief Returns a distance between two instructions inside one basic block. + /// Negative result means, that instructions occur in reverse order. + int calcInstrDist(const MachineInstr &First, const MachineInstr &Last); + + /// \brief Choose the best \p LEA instruction from the \p List to replace + /// address calculation in \p MI instruction. Return the address displacement + /// and the distance between \p MI and the choosen \p LEA in \p AddrDispShift + /// and \p Dist. + bool chooseBestLEA(const SmallVectorImpl &List, + const MachineInstr &MI, MachineInstr *&LEA, + int64_t &AddrDispShift, int &Dist); + + /// \brief Returns true if two machine operand are identical and they are not + /// physical registers. 
+ bool isIdenticalOp(const MachineOperand &MO1, const MachineOperand &MO2); + + /// \brief Returns true if the instruction is LEA. + bool isLEA(const MachineInstr &MI); + + /// \brief Returns true if two instructions have memory operands that only + /// differ by displacement. The numbers of the first memory operands for both + /// instructions are specified through \p N1 and \p N2. The address + /// displacement is returned through AddrDispShift. + bool isSimilarMemOp(const MachineInstr &MI1, unsigned N1, + const MachineInstr &MI2, unsigned N2, + int64_t &AddrDispShift); + + /// \brief Find all LEA instructions in the basic block. + void findLEAs(const MachineBasicBlock &MBB, + SmallVectorImpl &List); + + /// \brief Removes redundant address calculations. + bool removeRedundantAddrCalc(const SmallVectorImpl &List); + + MachineRegisterInfo *MRI; + const X86InstrInfo *TII; + const X86RegisterInfo *TRI; + + static char ID; +}; +char OptimizeLEAPass::ID = 0; +} + +FunctionPass *llvm::createX86OptimizeLEAs() { return new OptimizeLEAPass(); } + +int OptimizeLEAPass::calcInstrDist(const MachineInstr &First, + const MachineInstr &Last) { + const MachineBasicBlock *MBB = First.getParent(); + + // Both instructions must be in the same basic block. + assert(Last.getParent() == MBB && + "Instructions are in different basic blocks"); + + return std::distance(MBB->begin(), MachineBasicBlock::const_iterator(&Last)) - + std::distance(MBB->begin(), MachineBasicBlock::const_iterator(&First)); +} + +// Find the best LEA instruction in the List to replace address recalculation in +// MI. Such LEA must meet these requirements: +// 1) The address calculated by the LEA differs only by the displacement from +// the address used in MI. +// 2) The register class of the definition of the LEA is compatible with the +// register class of the address base register of MI. +// 3) Displacement of the new memory operand should fit in 1 byte if possible. +// 4) The LEA should be as close to MI as possible, and prior to it if +// possible. +bool OptimizeLEAPass::chooseBestLEA(const SmallVectorImpl &List, + const MachineInstr &MI, MachineInstr *&LEA, + int64_t &AddrDispShift, int &Dist) { + const MachineFunction *MF = MI.getParent()->getParent(); + const MCInstrDesc &Desc = MI.getDesc(); + int MemOpNo = X86II::getMemoryOperandNo(Desc.TSFlags, MI.getOpcode()) + + X86II::getOperandBias(Desc); + + LEA = nullptr; + + // Loop over all LEA instructions. + for (auto DefMI : List) { + int64_t AddrDispShiftTemp = 0; + + // Compare instructions memory operands. + if (!isSimilarMemOp(MI, MemOpNo, *DefMI, 1, AddrDispShiftTemp)) + continue; + + // Make sure address displacement fits 4 bytes. + if (!isInt<32>(AddrDispShiftTemp)) + continue; + + // Check that LEA def register can be used as MI address base. Some + // instructions can use a limited set of registers as address base, for + // example MOV8mr_NOREX. We could constrain the register class of the LEA + // def to suit MI, however since this case is very rare and hard to + // reproduce in a test it's just more reliable to skip the LEA. + if (TII->getRegClass(Desc, MemOpNo + X86::AddrBaseReg, TRI, *MF) != + MRI->getRegClass(DefMI->getOperand(0).getReg())) + continue; + + // Choose the closest LEA instruction from the list, prior to MI if + // possible. Note that we took into account resulting address displacement + // as well. Also note that the list is sorted by the order in which the LEAs + // occur, so the break condition is pretty simple. 
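    // For illustration (hypothetical operands): given
    //   %vreg1 = LEA64r %rdi, 1, %rsi, 16, %noreg    ; &(rdi + rsi + 16)
    //   ...
    //   MOV64mr %rdi, 1, %rsi, 24, %noreg, %vreg2    ; store to rdi + rsi + 24
    // the two memory operands differ only in the displacement, so the LEA is
    // a candidate with AddrDispShiftTemp = 24 - 16 = 8; if it is also the
    // closest such LEA, removeRedundantAddrCalc() later rewrites the store to
    // use %vreg1 as its base with displacement 8.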
+ int DistTemp = calcInstrDist(*DefMI, MI); + assert(DistTemp != 0 && + "The distance between two different instructions cannot be zero"); + if (DistTemp > 0 || LEA == nullptr) { + // Do not update return LEA, if the current one provides a displacement + // which fits in 1 byte, while the new candidate does not. + if (LEA != nullptr && !isInt<8>(AddrDispShiftTemp) && + isInt<8>(AddrDispShift)) + continue; + + LEA = DefMI; + AddrDispShift = AddrDispShiftTemp; + Dist = DistTemp; + } + + // FIXME: Maybe we should not always stop at the first LEA after MI. + if (DistTemp < 0) + break; + } + + return LEA != nullptr; +} + +bool OptimizeLEAPass::isIdenticalOp(const MachineOperand &MO1, + const MachineOperand &MO2) { + return MO1.isIdenticalTo(MO2) && + (!MO1.isReg() || + !TargetRegisterInfo::isPhysicalRegister(MO1.getReg())); +} + +bool OptimizeLEAPass::isLEA(const MachineInstr &MI) { + unsigned Opcode = MI.getOpcode(); + return Opcode == X86::LEA16r || Opcode == X86::LEA32r || + Opcode == X86::LEA64r || Opcode == X86::LEA64_32r; +} + +// Check if MI1 and MI2 have memory operands which represent addresses that +// differ only by displacement. +bool OptimizeLEAPass::isSimilarMemOp(const MachineInstr &MI1, unsigned N1, + const MachineInstr &MI2, unsigned N2, + int64_t &AddrDispShift) { + // Address base, scale, index and segment operands must be identical. + static const int IdenticalOpNums[] = {X86::AddrBaseReg, X86::AddrScaleAmt, + X86::AddrIndexReg, X86::AddrSegmentReg}; + for (auto &N : IdenticalOpNums) + if (!isIdenticalOp(MI1.getOperand(N1 + N), MI2.getOperand(N2 + N))) + return false; + + // Address displacement operands may differ by a constant. + const MachineOperand *Op1 = &MI1.getOperand(N1 + X86::AddrDisp); + const MachineOperand *Op2 = &MI2.getOperand(N2 + X86::AddrDisp); + if (!isIdenticalOp(*Op1, *Op2)) { + if (Op1->isImm() && Op2->isImm()) + AddrDispShift = Op1->getImm() - Op2->getImm(); + else if (Op1->isGlobal() && Op2->isGlobal() && + Op1->getGlobal() == Op2->getGlobal()) + AddrDispShift = Op1->getOffset() - Op2->getOffset(); + else + return false; + } + + return true; +} + +void OptimizeLEAPass::findLEAs(const MachineBasicBlock &MBB, + SmallVectorImpl &List) { + for (auto &MI : MBB) { + if (isLEA(MI)) + List.push_back(const_cast(&MI)); + } +} + +// Try to find load and store instructions which recalculate addresses already +// calculated by some LEA and replace their memory operands with its def +// register. +bool OptimizeLEAPass::removeRedundantAddrCalc( + const SmallVectorImpl &List) { + bool Changed = false; + + assert(List.size() > 0); + MachineBasicBlock *MBB = List[0]->getParent(); + + // Process all instructions in basic block. + for (auto I = MBB->begin(), E = MBB->end(); I != E;) { + MachineInstr &MI = *I++; + unsigned Opcode = MI.getOpcode(); + + // Instruction must be load or store. + if (!MI.mayLoadOrStore()) + continue; + + // Get the number of the first memory operand. + const MCInstrDesc &Desc = MI.getDesc(); + int MemOpNo = X86II::getMemoryOperandNo(Desc.TSFlags, Opcode); + + // If instruction has no memory operand - skip it. + if (MemOpNo < 0) + continue; + + MemOpNo += X86II::getOperandBias(Desc); + + // Get the best LEA instruction to replace address calculation. + MachineInstr *DefMI; + int64_t AddrDispShift; + int Dist; + if (!chooseBestLEA(List, MI, DefMI, AddrDispShift, Dist)) + continue; + + // If LEA occurs before current instruction, we can freely replace + // the instruction. 
If LEA occurs after, we can lift LEA above the + // instruction and this way to be able to replace it. Since LEA and the + // instruction have similar memory operands (thus, the same def + // instructions for these operands), we can always do that, without + // worries of using registers before their defs. + if (Dist < 0) { + DefMI->removeFromParent(); + MBB->insert(MachineBasicBlock::iterator(&MI), DefMI); + } + + // Since we can possibly extend register lifetime, clear kill flags. + MRI->clearKillFlags(DefMI->getOperand(0).getReg()); + + ++NumSubstLEAs; + DEBUG(dbgs() << "OptimizeLEAs: Candidate to replace: "; MI.dump();); + + // Change instruction operands. + MI.getOperand(MemOpNo + X86::AddrBaseReg) + .ChangeToRegister(DefMI->getOperand(0).getReg(), false); + MI.getOperand(MemOpNo + X86::AddrScaleAmt).ChangeToImmediate(1); + MI.getOperand(MemOpNo + X86::AddrIndexReg) + .ChangeToRegister(X86::NoRegister, false); + MI.getOperand(MemOpNo + X86::AddrDisp).ChangeToImmediate(AddrDispShift); + MI.getOperand(MemOpNo + X86::AddrSegmentReg) + .ChangeToRegister(X86::NoRegister, false); + + DEBUG(dbgs() << "OptimizeLEAs: Replaced by: "; MI.dump();); + + Changed = true; + } + + return Changed; +} + +bool OptimizeLEAPass::runOnMachineFunction(MachineFunction &MF) { + bool Changed = false; + + // Perform this optimization only if we care about code size. + if (!MF.getFunction()->optForSize()) + return false; + + MRI = &MF.getRegInfo(); + TII = MF.getSubtarget().getInstrInfo(); + TRI = MF.getSubtarget().getRegisterInfo(); + + // Process all basic blocks. + for (auto &MBB : MF) { + SmallVector LEAs; + + // Find all LEA instructions in basic block. + findLEAs(MBB, LEAs); + + // If current basic block has no LEAs, move on to the next one. + if (LEAs.empty()) + continue; + + // Remove redundant address calculations. 
+ Changed |= removeRedundantAddrCalc(LEAs); + } + + return Changed; +} diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp index a74941c271ce..64f094ea9f57 100644 --- a/lib/Target/X86/X86RegisterInfo.cpp +++ b/lib/Target/X86/X86RegisterInfo.cpp @@ -248,6 +248,10 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { if (HasAVX) return CSR_64_RT_AllRegs_AVX_SaveList; return CSR_64_RT_AllRegs_SaveList; + case CallingConv::CXX_FAST_TLS: + if (Is64Bit) + return CSR_64_TLS_Darwin_SaveList; + break; case CallingConv::Intel_OCL_BI: { if (HasAVX512 && IsWin64) return CSR_Win64_Intel_OCL_BI_AVX512_SaveList; @@ -310,6 +314,10 @@ X86RegisterInfo::getCallPreservedMask(const MachineFunction &MF, if (HasAVX) return CSR_64_RT_AllRegs_AVX_RegMask; return CSR_64_RT_AllRegs_RegMask; + case CallingConv::CXX_FAST_TLS: + if (Is64Bit) + return CSR_64_TLS_Darwin_RegMask; + break; case CallingConv::Intel_OCL_BI: { if (HasAVX512 && IsWin64) return CSR_Win64_Intel_OCL_BI_AVX512_RegMask; diff --git a/lib/Target/X86/X86SelectionDAGInfo.cpp b/lib/Target/X86/X86SelectionDAGInfo.cpp index ce79fcf9ad81..b1a01614b4a1 100644 --- a/lib/Target/X86/X86SelectionDAGInfo.cpp +++ b/lib/Target/X86/X86SelectionDAGInfo.cpp @@ -44,13 +44,10 @@ bool X86SelectionDAGInfo::isBaseRegConflictPossible( return false; } -SDValue -X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, - SDValue Chain, - SDValue Dst, SDValue Src, - SDValue Size, unsigned Align, - bool isVolatile, - MachinePointerInfo DstPtrInfo) const { +SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset( + SelectionDAG &DAG, SDLoc dl, SDValue Chain, SDValue Dst, SDValue Src, + SDValue Size, unsigned Align, bool isVolatile, + MachinePointerInfo DstPtrInfo) const { ConstantSDNode *ConstantSize = dyn_cast(Size); const X86Subtarget &Subtarget = DAG.getMachineFunction().getSubtarget(); @@ -74,10 +71,10 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, // Check to see if there is a specialized entry-point for memory zeroing. ConstantSDNode *V = dyn_cast(Src); - if (const char *bzeroEntry = V && + if (const char *bzeroEntry = V && V->isNullValue() ? Subtarget.getBZeroEntry() : nullptr) { - EVT IntPtr = - DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + EVT IntPtr = TLI.getPointerTy(DAG.getDataLayout()); Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext()); TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; @@ -94,7 +91,7 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, 0) .setDiscardResult(); - std::pair CallResult = DAG.getTargetLoweringInfo().LowerCallTo(CLI); + std::pair CallResult = TLI.LowerCallTo(CLI); return CallResult.second; } @@ -144,8 +141,8 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, BytesLeft = SizeVal % UBytes; } - Chain = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, dl, AVT), - InFlag); + Chain = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, dl, AVT), + InFlag); InFlag = Chain.getValue(1); } else { AVT = MVT::i8; @@ -172,9 +169,8 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, SDValue Left = DAG.getNode(ISD::AND, dl, CVT, Count, DAG.getConstant((AVT == MVT::i64) ? 7 : 3, dl, CVT)); - Chain = DAG.getCopyToReg(Chain, dl, (CVT == MVT::i64) ? X86::RCX : - X86::ECX, - Left, InFlag); + Chain = DAG.getCopyToReg(Chain, dl, (CVT == MVT::i64) ? 
X86::RCX : X86::ECX, + Left, InFlag); InFlag = Chain.getValue(1); Tys = DAG.getVTList(MVT::Other, MVT::Glue); SDValue Ops[] = { Chain, DAG.getValueType(MVT::i8), InFlag }; @@ -249,17 +245,14 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy( unsigned BytesLeft = SizeVal % UBytes; SDValue InFlag; - Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RCX : - X86::ECX, - Count, InFlag); + Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RCX : X86::ECX, + Count, InFlag); InFlag = Chain.getValue(1); - Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RDI : - X86::EDI, - Dst, InFlag); + Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RDI : X86::EDI, + Dst, InFlag); InFlag = Chain.getValue(1); - Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RSI : - X86::ESI, - Src, InFlag); + Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RSI : X86::ESI, + Src, InFlag); InFlag = Chain.getValue(1); SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp index 44a46b7e07a2..f90a0b0d04f1 100644 --- a/lib/Target/X86/X86Subtarget.cpp +++ b/lib/Target/X86/X86Subtarget.cpp @@ -189,6 +189,15 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { FullFS = "+64bit,+sse2"; } + // LAHF/SAHF are always supported in non-64-bit mode. + if (!In64BitMode) { + if (!FullFS.empty()) + FullFS = "+sahf," + FullFS; + else + FullFS = "+sahf"; + } + + // Parse features string and set the CPU. ParseSubtargetFeatures(CPUName, FullFS); @@ -264,6 +273,7 @@ void X86Subtarget::initializeEnvironment() { HasSHA = false; HasPRFCHW = false; HasRDSEED = false; + HasLAHFSAHF = false; HasMPX = false; IsBTMemSlow = false; IsSHLDSlow = false; diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index fd896c2857f6..b58a630d4630 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -152,6 +152,9 @@ class X86Subtarget final : public X86GenSubtargetInfo { /// Processor has RDSEED instructions. bool HasRDSEED; + /// Processor has LAHF/SAHF instructions. + bool HasLAHFSAHF; + /// True if BT (bit test) of memory instructions are slow. 
bool IsBTMemSlow; @@ -374,6 +377,7 @@ class X86Subtarget final : public X86GenSubtargetInfo { bool hasSHA() const { return HasSHA; } bool hasPRFCHW() const { return HasPRFCHW; } bool hasRDSEED() const { return HasRDSEED; } + bool hasLAHFSAHF() const { return HasLAHFSAHF; } bool isBTMemSlow() const { return IsBTMemSlow; } bool isSHLDSlow() const { return IsSHLDSlow; } bool isUnalignedMem16Slow() const { return IsUAMem16Slow; } diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp index 2e869eb7c3cd..0e7e4c0c84a9 100644 --- a/lib/Target/X86/X86TargetMachine.cpp +++ b/lib/Target/X86/X86TargetMachine.cpp @@ -254,6 +254,9 @@ bool X86PassConfig::addPreISel() { } void X86PassConfig::addPreRegAlloc() { + if (getOptLevel() != CodeGenOpt::None) + addPass(createX86OptimizeLEAs()); + addPass(createX86CallFrameOptimization()); } diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp index cf7a826ea85d..d33d57584128 100644 --- a/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/lib/Target/X86/X86TargetTransformInfo.cpp @@ -528,6 +528,9 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); + // FIXME: Need a better design of the cost table to handle non-simple types of + // potential massive combinations (elem_num x src_type x dst_type). + static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = { { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 }, @@ -705,7 +708,38 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 4*4 }, }; - static const TypeConversionCostTblEntry SSE2ConvTbl[] = { + static const TypeConversionCostTblEntry SSE41ConversionTbl[] = { + { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 4 }, + { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 4 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, + { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 }, + { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 4 }, + { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 4 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 2 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 2 }, + { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 1 }, + { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, + { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, + { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 }, + { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i8, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i8, 2 }, + + { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 6 }, + { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 3 }, + { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 }, + { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 30 }, + { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 }, + { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1 }, + { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, + { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1 }, + { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 }, + }; + + static const TypeConversionCostTblEntry SSE2ConversionTbl[] = { // These are somewhat magic numbers justified by looking at the output of // Intel's IACA, running some kernels and making sure when we take // legalization into account the throughput will be overestimated. 
@@ -726,13 +760,42 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 15 }, { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 }, { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 }, + + { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 6 }, + { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 8 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 3 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 4 }, + { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 2 }, + { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 9 }, + { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 12 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 6 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 6 }, + { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 }, + { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 3 }, + { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 3 }, + { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 }, + { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 2 }, + { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i8, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i8, 6 }, + + { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 14 }, + { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 7 }, + { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 3 }, + { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 31 }, + { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 4 }, + { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 3 }, + { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, + { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 }, + { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 4 }, }; std::pair LTSrc = TLI->getTypeLegalizationCost(DL, Src); std::pair LTDest = TLI->getTypeLegalizationCost(DL, Dst); if (ST->hasSSE2() && !ST->hasAVX()) { - if (const auto *Entry = ConvertCostTableLookup(SSE2ConvTbl, ISD, + if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD, LTDest.second, LTSrc.second)) return LTSrc.first * Entry->Cost; } @@ -770,6 +833,20 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { return Entry->Cost; } + if (ST->hasSSE41()) { + if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD, + DstTy.getSimpleVT(), + SrcTy.getSimpleVT())) + return Entry->Cost; + } + + if (ST->hasSSE2()) { + if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD, + DstTy.getSimpleVT(), + SrcTy.getSimpleVT())) + return Entry->Cost; + } + return BaseT::getCastInstrCost(Opcode, Dst, Src); } diff --git a/lib/Transforms/IPO/FunctionImport.cpp b/lib/Transforms/IPO/FunctionImport.cpp index c2359a8a172e..4d137e9fe588 100644 --- a/lib/Transforms/IPO/FunctionImport.cpp +++ b/lib/Transforms/IPO/FunctionImport.cpp @@ -24,6 +24,9 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/SourceMgr.h" + +#include + using namespace llvm; #define DEBUG_TYPE "function-import" @@ -50,53 +53,118 @@ static std::unique_ptr loadFile(const std::string &FileName, return Result; } +namespace { +/// Helper to load on demand a Module from file and cache it for subsequent +/// queries. It can be used with the FunctionImporter. +class ModuleLazyLoaderCache { + /// Cache of lazily loaded module for import. + StringMap> ModuleMap; + + /// Retrieve a Module from the cache or lazily load it on demand. + std::function(StringRef FileName)> createLazyModule; + +public: + /// Create the loader, Module will be initialized in \p Context. 
+ ModuleLazyLoaderCache(std::function< + std::unique_ptr(StringRef FileName)> createLazyModule) + : createLazyModule(createLazyModule) {} + + /// Retrieve a Module from the cache or lazily load it on demand. + Module &operator()(StringRef FileName); +}; + // Get a Module for \p FileName from the cache, or load it lazily. -Module &ModuleLazyLoaderCache::operator()(StringRef FileName) { - auto &Module = ModuleMap[FileName]; +Module &ModuleLazyLoaderCache::operator()(StringRef Identifier) { + auto &Module = ModuleMap[Identifier]; if (!Module) - Module = loadFile(FileName, Context); + Module = createLazyModule(Identifier); return *Module; } +} // anonymous namespace /// Walk through the instructions in \p F looking for external /// calls not already in the \p CalledFunctions set. If any are /// found they are added to the \p Worklist for importing. -static void findExternalCalls(const Function &F, StringSet<> &CalledFunctions, +static void findExternalCalls(const Module &DestModule, Function &F, + const FunctionInfoIndex &Index, + StringSet<> &CalledFunctions, SmallVector &Worklist) { + // We need to suffix internal function calls imported from other modules, + // prepare the suffix ahead of time. + std::string Suffix; + if (F.getParent() != &DestModule) + Suffix = + (Twine(".llvm.") + + Twine(Index.getModuleId(F.getParent()->getModuleIdentifier()))).str(); + for (auto &BB : F) { for (auto &I : BB) { if (isa(I)) { auto CalledFunction = cast(I).getCalledFunction(); // Insert any new external calls that have not already been // added to set/worklist. - if (CalledFunction && CalledFunction->hasName() && - CalledFunction->isDeclaration() && - !CalledFunctions.count(CalledFunction->getName())) { - CalledFunctions.insert(CalledFunction->getName()); - Worklist.push_back(CalledFunction->getName()); + if (!CalledFunction || !CalledFunction->hasName()) + continue; + // Ignore intrinsics early + if (CalledFunction->isIntrinsic()) { + assert(CalledFunction->getIntrinsicID() != 0); + continue; + } + auto ImportedName = CalledFunction->getName(); + auto Renamed = (ImportedName + Suffix).str(); + // Rename internal functions + if (CalledFunction->hasInternalLinkage()) { + ImportedName = Renamed; + } + auto It = CalledFunctions.insert(ImportedName); + if (!It.second) { + // This is a call to a function we already considered, skip. + continue; + } + // Ignore functions already present in the destination module + auto *SrcGV = DestModule.getNamedValue(ImportedName); + if (SrcGV) { + assert(isa(SrcGV) && "Name collision during import"); + if (!cast(SrcGV)->isDeclaration()) { + DEBUG(dbgs() << DestModule.getModuleIdentifier() << ": Ignoring " + << ImportedName << " already in DestinationModule\n"); + continue; + } } + + Worklist.push_back(It.first->getKey()); + DEBUG(dbgs() << DestModule.getModuleIdentifier() + << ": Adding callee for : " << ImportedName << " : " + << F.getName() << "\n"); } } } } // Helper function: given a worklist and an index, will process all the worklist -// and import them based on the summary information -static unsigned ProcessImportWorklist( +// and decide what to import based on the summary information. +// +// Nothing is actually imported, functions are materialized in their source +// module and analyzed there. +// +// \p ModuleToFunctionsToImportMap is filled with the set of Function to import +// per Module. 
+static void GetImportList( Module &DestModule, SmallVector &Worklist, - StringSet<> &CalledFunctions, Linker &TheLinker, - const FunctionInfoIndex &Index, - std::function &LazyModuleLoader) { - unsigned ImportCount = 0; + StringSet<> &CalledFunctions, + std::map>> & + ModuleToFunctionsToImportMap, + const FunctionInfoIndex &Index, ModuleLazyLoaderCache &ModuleLoaderCache) { while (!Worklist.empty()) { auto CalledFunctionName = Worklist.pop_back_val(); - DEBUG(dbgs() << "Process import for " << CalledFunctionName << "\n"); + DEBUG(dbgs() << DestModule.getModuleIdentifier() << ": Process import for " + << CalledFunctionName << "\n"); // Try to get a summary for this function call. auto InfoList = Index.findFunctionInfoList(CalledFunctionName); if (InfoList == Index.end()) { - DEBUG(dbgs() << "No summary for " << CalledFunctionName - << " Ignoring.\n"); + DEBUG(dbgs() << DestModule.getModuleIdentifier() << ": No summary for " + << CalledFunctionName << " Ignoring.\n"); continue; } assert(!InfoList->second.empty() && "No summary, error at import?"); @@ -108,80 +176,75 @@ static unsigned ProcessImportWorklist( auto *Summary = Info->functionSummary(); if (!Summary) { // FIXME: in case we are lazyloading summaries, we can do it now. - DEBUG(dbgs() << "Missing summary for " << CalledFunctionName + DEBUG(dbgs() << DestModule.getModuleIdentifier() + << ": Missing summary for " << CalledFunctionName << ", error at import?\n"); llvm_unreachable("Missing summary"); } if (Summary->instCount() > ImportInstrLimit) { - DEBUG(dbgs() << "Skip import of " << CalledFunctionName << " with " - << Summary->instCount() << " instructions (limit " - << ImportInstrLimit << ")\n"); + DEBUG(dbgs() << DestModule.getModuleIdentifier() << ": Skip import of " + << CalledFunctionName << " with " << Summary->instCount() + << " instructions (limit " << ImportInstrLimit << ")\n"); continue; } // Get the module path from the summary. - auto FileName = Summary->modulePath(); - DEBUG(dbgs() << "Importing " << CalledFunctionName << " from " << FileName - << "\n"); + auto ModuleIdentifier = Summary->modulePath(); + DEBUG(dbgs() << DestModule.getModuleIdentifier() << ": Importing " + << CalledFunctionName << " from " << ModuleIdentifier << "\n"); - // Get the module for the import (potentially from the cache). - auto &Module = LazyModuleLoader(FileName); - assert(&Module.getContext() == &DestModule.getContext()); + auto &SrcModule = ModuleLoaderCache(ModuleIdentifier); // The function that we will import! - GlobalValue *SGV = Module.getNamedValue(CalledFunctionName); - StringRef ImportFunctionName = CalledFunctionName; + GlobalValue *SGV = SrcModule.getNamedValue(CalledFunctionName); + if (!SGV) { - // Might be local in source Module, promoted/renamed in DestModule. + // The destination module is referencing function using their renamed name + // when importing a function that was originally local in the source + // module. The source module we have might not have been renamed so we try + // to remove the suffix added during the renaming to recover the original + // name in the source module. 
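      // For illustration (hypothetical names): an internal function "foo"
      // defined in a source module whose module id is 42 is referenced from
      // DestModule under the promoted name "foo.llvm.42" (the same suffix
      // built in findExternalCalls above); splitting on ".llvm." recovers
      // "foo", the name the not-yet-renamed source module still uses.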
std::pair Split = CalledFunctionName.split(".llvm."); - SGV = Module.getNamedValue(Split.first); -#ifndef NDEBUG - // Assert that Split.second is module id - uint64_t ModuleId; - assert(!Split.second.getAsInteger(10, ModuleId)); - assert(ModuleId == Index.getModuleId(FileName)); -#endif + SGV = SrcModule.getNamedValue(Split.first); + assert(SGV && "Can't find function to import in source module"); + } + if (!SGV) { + report_fatal_error(Twine("Can't load function '") + CalledFunctionName + + "' in Module '" + SrcModule.getModuleIdentifier() + + "', error in the summary?\n"); } + Function *F = dyn_cast(SGV); if (!F && isa(SGV)) { auto *SGA = dyn_cast(SGV); F = dyn_cast(SGA->getBaseObject()); - ImportFunctionName = F->getName(); - } - if (!F) { - errs() << "Can't load function '" << CalledFunctionName << "' in Module '" - << FileName << "', error in the summary?\n"; - llvm_unreachable("Can't load function in Module"); + CalledFunctionName = F->getName(); } + assert(F && "Imported Function is ... not a Function"); // We cannot import weak_any functions/aliases without possibly affecting // the order they are seen and selected by the linker, changing program // semantics. if (SGV->hasWeakAnyLinkage()) { - DEBUG(dbgs() << "Ignoring import request for weak-any " + DEBUG(dbgs() << DestModule.getModuleIdentifier() + << ": Ignoring import request for weak-any " << (isa(SGV) ? "function " : "alias ") - << CalledFunctionName << " from " << FileName << "\n"); + << CalledFunctionName << " from " + << SrcModule.getModuleIdentifier() << "\n"); continue; } - // Link in the specified function. - DenseSet FunctionsToImport; - FunctionsToImport.insert(F); - if (TheLinker.linkInModule(Module, Linker::Flags::None, &Index, - &FunctionsToImport)) - report_fatal_error("Function Import: link error"); + // Add the function to the import list + auto &Entry = ModuleToFunctionsToImportMap[SrcModule.getModuleIdentifier()]; + Entry.first = &SrcModule; + Entry.second.insert(F); - // Process the newly imported function and add callees to the worklist. - GlobalValue *NewGV = DestModule.getNamedValue(ImportFunctionName); - assert(NewGV); - Function *NewF = dyn_cast(NewGV); - assert(NewF); - findExternalCalls(*NewF, CalledFunctions, Worklist); - ++ImportCount; + // Process the newly imported functions and add callees to the worklist. + F->materialize(); + findExternalCalls(DestModule, *F, Index, CalledFunctions, Worklist); } - return ImportCount; } // Automatically import functions in Module \p DestModule based on the summaries @@ -190,7 +253,7 @@ static unsigned ProcessImportWorklist( // The current implementation imports every called functions that exists in the // summaries index. 
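// In outline, the import for one destination module now proceeds in three
// steps: findExternalCalls() seeds a worklist with external callees referenced
// from DestModule; GetImportList() resolves the worklist against the summary
// index, materializing candidate callees in their (lazily loaded) source
// modules and grouping them per source module; finally linkInModule() is
// invoked once per source module with that module's set of functions.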
bool FunctionImporter::importFunctions(Module &DestModule) { - DEBUG(errs() << "Starting import for Module " + DEBUG(dbgs() << "Starting import for Module " << DestModule.getModuleIdentifier() << "\n"); unsigned ImportedCount = 0; @@ -200,7 +263,7 @@ bool FunctionImporter::importFunctions(Module &DestModule) { for (auto &F : DestModule) { if (F.isDeclaration() || F.hasFnAttribute(Attribute::OptimizeNone)) continue; - findExternalCalls(F, CalledFunctions, Worklist); + findExternalCalls(DestModule, F, Index, CalledFunctions, Worklist); } if (Worklist.empty()) return false; @@ -210,10 +273,33 @@ bool FunctionImporter::importFunctions(Module &DestModule) { // Linker that will be used for importing function Linker TheLinker(DestModule, DiagnosticHandler); - ImportedCount += ProcessImportWorklist(DestModule, Worklist, CalledFunctions, - TheLinker, Index, getLazyModule); + // Map of Module -> List of Function to import from the Module + std::map>> + ModuleToFunctionsToImportMap; + + // Analyze the summaries and get the list of functions to import by + // populating ModuleToFunctionsToImportMap + ModuleLazyLoaderCache ModuleLoaderCache(ModuleLoader); + GetImportList(DestModule, Worklist, CalledFunctions, + ModuleToFunctionsToImportMap, Index, ModuleLoaderCache); + assert(Worklist.empty() && "Worklist hasn't been flushed in GetImportList"); + + // Do the actual import of functions now, one Module at a time + for (auto &FunctionsToImportPerModule : ModuleToFunctionsToImportMap) { + // Get the module for the import + auto &FunctionsToImport = FunctionsToImportPerModule.second.second; + auto *SrcModule = FunctionsToImportPerModule.second.first; + assert(&DestModule.getContext() == &SrcModule->getContext() && + "Context mismatch"); + + // Link in the specified functions. + if (TheLinker.linkInModule(*SrcModule, Linker::Flags::None, &Index, + &FunctionsToImport)) + report_fatal_error("Function Import: link error"); - DEBUG(errs() << "Imported " << ImportedCount << " functions for Module " + ImportedCount += FunctionsToImport.size(); + } + DEBUG(dbgs() << "Imported " << ImportedCount << " functions for Module " << DestModule.getModuleIdentifier() << "\n"); return ImportedCount; } @@ -256,30 +342,45 @@ getFunctionIndexForFile(StringRef Path, std::string &Error, /// Pass that performs cross-module function import provided a summary file. class FunctionImportPass : public ModulePass { + /// Optional function summary index to use for importing, otherwise + /// the summary-file option must be specified. 
+ const FunctionInfoIndex *Index; public: /// Pass identification, replacement for typeid static char ID; - explicit FunctionImportPass() : ModulePass(ID) {} + /// Specify pass name for debug output + const char *getPassName() const override { + return "Function Importing"; + } + + explicit FunctionImportPass(const FunctionInfoIndex *Index = nullptr) + : ModulePass(ID), Index(Index) {} bool runOnModule(Module &M) override { - if (SummaryFile.empty()) { - report_fatal_error("error: -function-import requires -summary-file\n"); - } - std::string Error; - std::unique_ptr Index = - getFunctionIndexForFile(SummaryFile, Error, diagnosticHandler); - if (!Index) { - errs() << "Error loading file '" << SummaryFile << "': " << Error << "\n"; - return false; + if (SummaryFile.empty() && !Index) + report_fatal_error("error: -function-import requires -summary-file or " + "file from frontend\n"); + std::unique_ptr IndexPtr; + if (!SummaryFile.empty()) { + if (Index) + report_fatal_error("error: -summary-file and index from frontend\n"); + std::string Error; + IndexPtr = getFunctionIndexForFile(SummaryFile, Error, diagnosticHandler); + if (!IndexPtr) { + errs() << "Error loading file '" << SummaryFile << "': " << Error + << "\n"; + return false; + } + Index = IndexPtr.get(); } // Perform the import now. - ModuleLazyLoaderCache Loader(M.getContext()); - FunctionImporter Importer(*Index, diagnosticHandler, - [&](StringRef Name) - -> Module &{ return Loader(Name); }); + auto ModuleLoader = [&M](StringRef Identifier) { + return loadFile(Identifier, M.getContext()); + }; + FunctionImporter Importer(*Index, diagnosticHandler, ModuleLoader); return Importer.importFunctions(M); return false; @@ -293,5 +394,7 @@ INITIALIZE_PASS_END(FunctionImportPass, "function-import", "Summary Based Function Import", false, false) namespace llvm { -Pass *createFunctionImportPass() { return new FunctionImportPass(); } +Pass *createFunctionImportPass(const FunctionInfoIndex *Index = nullptr) { + return new FunctionImportPass(Index); +} } diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp index ec6f21e8c64f..b8d1b7e78e35 100644 --- a/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -18,6 +18,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/Passes.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/FunctionInfo.h" #include "llvm/IR/Verifier.h" #include "llvm/IR/LegacyPassManager.h" #include "llvm/Support/CommandLine.h" @@ -108,6 +109,7 @@ PassManagerBuilder::PassManagerBuilder() { SizeLevel = 0; LibraryInfo = nullptr; Inliner = nullptr; + FunctionIndex = nullptr; DisableUnitAtATime = false; DisableUnrollLoops = false; BBVectorize = RunBBVectorization; @@ -476,6 +478,9 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) { // Provide AliasAnalysis services for optimizations. addInitialAliasAnalysisPasses(PM); + if (FunctionIndex) + PM.add(createFunctionImportPass(FunctionIndex)); + // Propagate constants at call sites into the functions they call. This // opens opportunities for globalopt (and inlining) by substituting function // pointers passed as arguments to direct uses of functions. 
diff --git a/lib/Transforms/IPO/PruneEH.cpp b/lib/Transforms/IPO/PruneEH.cpp index 714e1d6e42d2..cd2411ba554c 100644 --- a/lib/Transforms/IPO/PruneEH.cpp +++ b/lib/Transforms/IPO/PruneEH.cpp @@ -191,9 +191,13 @@ bool PruneEH::SimplifyFunction(Function *F) { for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) { if (InvokeInst *II = dyn_cast(BB->getTerminator())) if (II->doesNotThrow() && canSimplifyInvokeNoUnwind(F)) { - SmallVector Args(II->op_begin(), II->op_end() - 3); + SmallVector Args(II->arg_begin(), II->arg_end()); + SmallVector OpBundles; + II->getOperandBundlesAsDefs(OpBundles); + // Insert a call instruction before the invoke. - CallInst *Call = CallInst::Create(II->getCalledValue(), Args, "", II); + CallInst *Call = CallInst::Create(II->getCalledValue(), Args, OpBundles, + "", II); Call->takeName(II); Call->setCallingConv(II->getCallingConv()); Call->setAttributes(II->getAttributes()); diff --git a/lib/Transforms/InstCombine/InstCombineCasts.cpp b/lib/Transforms/InstCombine/InstCombineCasts.cpp index 4afe1bb243ff..dcd86db036b4 100644 --- a/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -1715,6 +1715,42 @@ static Value *optimizeIntegerToVectorInsertions(BitCastInst &CI, return Result; } +/// Given a bitcasted source operand fed into an extract element instruction and +/// then bitcasted again to a scalar type, eliminate at least one bitcast by +/// changing the vector type of the extractelement instruction. +/// Example: +/// bitcast (extractelement (bitcast <2 x float> %X to <2 x i32>), 1) to float +/// ---> +/// extractelement <2 x float> %X, i32 1 +static Instruction *foldBitCastExtElt(BitCastInst &BitCast, InstCombiner &IC, + const DataLayout &DL) { + Type *DestType = BitCast.getType(); + if (DestType->isVectorTy()) + return nullptr; + + // TODO: Create and use a pattern matcher for ExtractElementInst. + auto *ExtElt = dyn_cast(BitCast.getOperand(0)); + if (!ExtElt || !ExtElt->hasOneUse()) + return nullptr; + + Value *InnerBitCast = nullptr; + if (!match(ExtElt->getOperand(0), m_BitCast(m_Value(InnerBitCast)))) + return nullptr; + + // If the source is not a vector or its element type doesn't match the result + // type, bitcast it to a vector type that we can extract from. 
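  // For illustration (hypothetical IR): given
  //   %bc  = bitcast <1 x i64> %X to <2 x i32>
  //   %ext = extractelement <2 x i32> %bc, i32 1
  //   %res = bitcast i32 %ext to float
  // the inner source %X has type <1 x i64>, whose i64 elements do not match
  // the float result, so %X is first bitcast to <2 x float> (call it %bc2;
  // NumElts = 64 / 32 = 2) and the chain folds to:
  //   extractelement <2 x float> %bc2, i32 1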
+ Type *SourceType = InnerBitCast->getType(); + if (SourceType->getScalarType() != DestType) { + unsigned VecWidth = SourceType->getPrimitiveSizeInBits(); + unsigned DestWidth = DestType->getPrimitiveSizeInBits(); + unsigned NumElts = VecWidth / DestWidth; + SourceType = VectorType::get(DestType, NumElts); + InnerBitCast = IC.Builder->CreateBitCast(InnerBitCast, SourceType, "bc"); + } + + return ExtractElementInst::Create(InnerBitCast, ExtElt->getOperand(1)); +} + static Instruction *foldVecTruncToExtElt(Value *VecInput, Type *DestTy, unsigned ShiftAmt, InstCombiner &IC, const DataLayout &DL) { @@ -1886,6 +1922,9 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { } } + if (Instruction *I = foldBitCastExtElt(CI, *this, DL)) + return I; + if (SrcTy->isPointerTy()) return commonPointerCastTransforms(CI); return commonCastTransforms(CI); diff --git a/lib/Transforms/InstCombine/InstCombineSelect.cpp b/lib/Transforms/InstCombine/InstCombineSelect.cpp index 2baa131bc99c..776704d1efa9 100644 --- a/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -1070,7 +1070,7 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { SelectPatternResult SPR = matchSelectPattern(&SI, LHS, RHS, &CastOp); auto SPF = SPR.Flavor; - if (SPF) { + if (SelectPatternResult::isMinOrMax(SPF)) { // Canonicalize so that type casts are outside select patterns. if (LHS->getType()->getPrimitiveSizeInBits() != SI.getType()->getPrimitiveSizeInBits()) { @@ -1091,11 +1091,15 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { SI.getType()); return ReplaceInstUsesWith(SI, NewSI); } + } + if (SPF) { // MAX(MAX(a, b), a) -> MAX(a, b) // MIN(MIN(a, b), a) -> MIN(a, b) // MAX(MIN(a, b), a) -> a // MIN(MAX(a, b), a) -> a + // ABS(ABS(a)) -> ABS(a) + // NABS(NABS(a)) -> NABS(a) if (SelectPatternFlavor SPF2 = matchSelectPattern(LHS, LHS2, RHS2).Flavor) if (Instruction *R = FoldSPFofSPF(cast(LHS),SPF2,LHS2,RHS2, SI, SPF, RHS)) diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp index 8eb82e39b8a6..a9df5e5898ae 100644 --- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -280,6 +280,11 @@ class GlobalsMetadata { GlobalsMetadata() : inited_(false) {} + void reset() { + inited_ = false; + Entries.clear(); + } + void init(Module &M) { assert(!inited_); inited_ = true; @@ -450,6 +455,7 @@ struct AddressSanitizer : public FunctionPass { bool maybeInsertAsanInitAtFunctionEntry(Function &F); void markEscapedLocalAllocas(Function &F); bool doInitialization(Module &M) override; + bool doFinalization(Module &M) override; static char ID; // Pass identification, replacement for typeid DominatorTree &getDominatorTree() const { return *DT; } @@ -624,9 +630,24 @@ struct FunctionStackPoisoner : public InstVisitor { void unpoisonDynamicAllocasBeforeInst(Instruction *InstBefore, Value *SavedStack) { IRBuilder<> IRB(InstBefore); + Value *DynamicAreaPtr = IRB.CreatePtrToInt(SavedStack, IntptrTy); + // When we insert _asan_allocas_unpoison before @llvm.stackrestore, we + // need to adjust extracted SP to compute the address of the most recent + // alloca. We have a special @llvm.get.dynamic.area.offset intrinsic for + // this purpose. 
+ if (!isa(InstBefore)) { + Function *DynamicAreaOffsetFunc = Intrinsic::getDeclaration( + InstBefore->getModule(), Intrinsic::get_dynamic_area_offset, + {IntptrTy}); + + Value *DynamicAreaOffset = IRB.CreateCall(DynamicAreaOffsetFunc, {}); + + DynamicAreaPtr = IRB.CreateAdd(IRB.CreatePtrToInt(SavedStack, IntptrTy), + DynamicAreaOffset); + } + IRB.CreateCall(AsanAllocasUnpoisonFunc, - {IRB.CreateLoad(DynamicAllocaLayout), - IRB.CreatePtrToInt(SavedStack, IntptrTy)}); + {IRB.CreateLoad(DynamicAllocaLayout), DynamicAreaPtr}); } // Unpoison dynamic allocas redzones. @@ -1506,6 +1527,11 @@ bool AddressSanitizer::doInitialization(Module &M) { return true; } +bool AddressSanitizer::doFinalization(Module &M) { + GlobalsMD.reset(); + return false; +} + bool AddressSanitizer::maybeInsertAsanInitAtFunctionEntry(Function &F) { // For each NSObject descendant having a +load method, this method is invoked // by the ObjC runtime before any of the static constructors is called. diff --git a/lib/Transforms/Instrumentation/CFGMST.h b/lib/Transforms/Instrumentation/CFGMST.h new file mode 100644 index 000000000000..c47fdbf68996 --- /dev/null +++ b/lib/Transforms/Instrumentation/CFGMST.h @@ -0,0 +1,217 @@ +//===-- CFGMST.h - Minimum Spanning Tree for CFG ----------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements a Union-find algorithm to compute Minimum Spanning Tree +// for a given CFG. +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/Analysis/BranchProbabilityInfo.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/Support/BranchProbability.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include +#include +#include + +namespace llvm { + +#define DEBUG_TYPE "cfgmst" + +/// \brief An union-find based Minimum Spanning Tree for CFG +/// +/// Implements a Union-find algorithm to compute Minimum Spanning Tree +/// for a given CFG. +template class CFGMST { +public: + Function &F; + + // Store all the edges in CFG. It may contain some stale edges + // when Removed is set. + std::vector> AllEdges; + + // This map records the auxiliary information for each BB. + DenseMap> BBInfos; + + // Find the root group of the G and compress the path from G to the root. + BBInfo *findAndCompressGroup(BBInfo *G) { + if (G->Group != G) + G->Group = findAndCompressGroup(static_cast(G->Group)); + return static_cast(G->Group); + } + + // Union BB1 and BB2 into the same group and return true. + // Returns false if BB1 and BB2 are already in the same group. + bool unionGroups(const BasicBlock *BB1, const BasicBlock *BB2) { + BBInfo *BB1G = findAndCompressGroup(&getBBInfo(BB1)); + BBInfo *BB2G = findAndCompressGroup(&getBBInfo(BB2)); + + if (BB1G == BB2G) + return false; + + // Make the smaller rank tree a direct child or the root of high rank tree. + if (BB1G->Rank < BB2G->Rank) + BB1G->Group = BB2G; + else { + BB2G->Group = BB1G; + // If the ranks are the same, increment root of one tree by one. + if (BB1G->Rank == BB2G->Rank) + BB1G->Rank++; + } + return true; + } + + // Give BB, return the auxiliary information. 
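  // Worked example of the grouping maintained by unionGroups and
  // findAndCompressGroup above (illustration only): with blocks A, B, C starting
  // in singleton groups of rank 0, unionGroups(A, B) keeps A as the
  // representative and bumps its rank to 1; a later unionGroups(C, A) then
  // attaches C directly under A, and findAndCompressGroup(C) keeps that chain
  // flat.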
+ BBInfo &getBBInfo(const BasicBlock *BB) const { + auto It = BBInfos.find(BB); + assert(It->second.get() != nullptr); + return *It->second.get(); + } + + // Traverse the CFG using a stack. Find all the edges and assign the weight. + // Edges with large weight will be put into MST first so they are less likely + // to be instrumented. + void buildEdges() { + DEBUG(dbgs() << "Build Edge on " << F.getName() << "\n"); + + const BasicBlock *BB = &(F.getEntryBlock()); + uint64_t EntryWeight = (BFI != nullptr ? BFI->getEntryFreq() : 2); + // Add a fake edge to the entry. + addEdge(nullptr, BB, EntryWeight); + + // Special handling for single BB functions. + if (succ_empty(BB)) { + addEdge(BB, nullptr, EntryWeight); + return; + } + + static const uint32_t CriticalEdgeMultiplier = 1000; + + for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { + TerminatorInst *TI = BB->getTerminator(); + uint64_t BBWeight = + (BFI != nullptr ? BFI->getBlockFreq(&*BB).getFrequency() : 2); + uint64_t Weight = 2; + if (int successors = TI->getNumSuccessors()) { + for (int i = 0; i != successors; ++i) { + BasicBlock *TargetBB = TI->getSuccessor(i); + bool Critical = isCriticalEdge(TI, i); + uint64_t scaleFactor = BBWeight; + if (Critical) { + if (scaleFactor < UINT64_MAX / CriticalEdgeMultiplier) + scaleFactor *= CriticalEdgeMultiplier; + else + scaleFactor = UINT64_MAX; + } + if (BPI != nullptr) + Weight = BPI->getEdgeProbability(&*BB, TargetBB).scale(scaleFactor); + addEdge(&*BB, TargetBB, Weight).IsCritical = Critical; + DEBUG(dbgs() << " Edge: from " << BB->getName() << " to " + << TargetBB->getName() << " w=" << Weight << "\n"); + } + } else { + addEdge(&*BB, nullptr, BBWeight); + DEBUG(dbgs() << " Edge: from " << BB->getName() << " to exit" + << " w = " << BBWeight << "\n"); + } + } + } + + // Sort CFG edges based on its weight. + void sortEdgesByWeight() { + std::stable_sort(AllEdges.begin(), AllEdges.end(), + [](const std::unique_ptr &Edge1, + const std::unique_ptr &Edge2) { + return Edge1->Weight > Edge2->Weight; + }); + } + + // Traverse all the edges and compute the Minimum Weight Spanning Tree + // using union-find algorithm. + void computeMinimumSpanningTree() { + // First, put all the critical edge with landing-pad as the Dest to MST. + // This works around the insufficient support of critical edges split + // when destination BB is a landing pad. + for (auto &Ei : AllEdges) { + if (Ei->Removed) + continue; + if (Ei->IsCritical) { + if (Ei->DestBB && Ei->DestBB->isLandingPad()) { + if (unionGroups(Ei->SrcBB, Ei->DestBB)) + Ei->InMST = true; + } + } + } + + for (auto &Ei : AllEdges) { + if (Ei->Removed) + continue; + if (unionGroups(Ei->SrcBB, Ei->DestBB)) + Ei->InMST = true; + } + } + + // Dump the Debug information about the instrumentation. + void dumpEdges(raw_ostream &OS, const Twine &Message) const { + if (!Message.str().empty()) + OS << Message << "\n"; + OS << " Number of Basic Blocks: " << BBInfos.size() << "\n"; + for (auto &BI : BBInfos) { + const BasicBlock *BB = BI.first; + OS << " BB: " << (BB == nullptr ? "FakeNode" : BB->getName()) << " " + << BI.second->infoString() << "\n"; + } + + OS << " Number of Edges: " << AllEdges.size() + << " (*: Instrument, C: CriticalEdge, -: Removed)\n"; + uint32_t Count = 0; + for (auto &EI : AllEdges) + OS << " Edge " << Count++ << ": " << getBBInfo(EI->SrcBB).Index << "-->" + << getBBInfo(EI->DestBB).Index << EI->infoString() << "\n"; + } + + // Add an edge to AllEdges with weight W. 
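  // Note on the weights passed to addEdge below (illustration only): buildEdges
  // above derives them from block frequency and branch probability. For example,
  // a block with frequency 50 and two equally likely successors yields edge
  // weights of 25; if one of those edges is critical it is scaled by
  // CriticalEdgeMultiplier to 25000, so it sorts into the MST first and then
  // needs neither a counter nor a split.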
+ Edge &addEdge(const BasicBlock *Src, const BasicBlock *Dest, uint64_t W) { + uint32_t Index = BBInfos.size(); + auto Iter = BBInfos.end(); + bool Inserted; + std::tie(Iter, Inserted) = BBInfos.insert(std::make_pair(Src, nullptr)); + if (Inserted) { + // Newly inserted, update the real info. + Iter->second = std::move(llvm::make_unique(Index)); + Index++; + } + std::tie(Iter, Inserted) = BBInfos.insert(std::make_pair(Dest, nullptr)); + if (Inserted) + // Newly inserted, update the real info. + Iter->second = std::move(llvm::make_unique(Index)); + AllEdges.emplace_back(new Edge(Src, Dest, W)); + return *AllEdges.back(); + } + + BranchProbabilityInfo *BPI; + BlockFrequencyInfo *BFI; + +public: + CFGMST(Function &Func, BranchProbabilityInfo *BPI_ = nullptr, + BlockFrequencyInfo *BFI_ = nullptr) + : F(Func), BPI(BPI_), BFI(BFI_) { + buildEdges(); + sortEdgesByWeight(); + computeMinimumSpanningTree(); + } +}; + +#undef DEBUG_TYPE // "cfgmst" +} // end namespace llvm diff --git a/lib/Transforms/Instrumentation/CMakeLists.txt b/lib/Transforms/Instrumentation/CMakeLists.txt index 9b81f4bb1619..cae1e5af7ac7 100644 --- a/lib/Transforms/Instrumentation/CMakeLists.txt +++ b/lib/Transforms/Instrumentation/CMakeLists.txt @@ -6,6 +6,7 @@ add_llvm_library(LLVMInstrumentation MemorySanitizer.cpp Instrumentation.cpp InstrProfiling.cpp + PGOInstrumentation.cpp SafeStack.cpp SanitizerCoverage.cpp ThreadSanitizer.cpp diff --git a/lib/Transforms/Instrumentation/Instrumentation.cpp b/lib/Transforms/Instrumentation/Instrumentation.cpp index c504b5a8199a..a05a5fa09f9a 100644 --- a/lib/Transforms/Instrumentation/Instrumentation.cpp +++ b/lib/Transforms/Instrumentation/Instrumentation.cpp @@ -60,6 +60,8 @@ void llvm::initializeInstrumentation(PassRegistry &Registry) { initializeAddressSanitizerModulePass(Registry); initializeBoundsCheckingPass(Registry); initializeGCOVProfilerPass(Registry); + initializePGOInstrumentationGenPass(Registry); + initializePGOInstrumentationUsePass(Registry); initializeInstrProfilingPass(Registry); initializeMemorySanitizerPass(Registry); initializeThreadSanitizerPass(Registry); diff --git a/lib/Transforms/Instrumentation/LLVMBuild.txt b/lib/Transforms/Instrumentation/LLVMBuild.txt index 14c174332ee4..bcefe795c193 100644 --- a/lib/Transforms/Instrumentation/LLVMBuild.txt +++ b/lib/Transforms/Instrumentation/LLVMBuild.txt @@ -19,4 +19,4 @@ type = Library name = Instrumentation parent = Transforms -required_libraries = Analysis Core MC Support TransformUtils +required_libraries = Analysis Core MC Support TransformUtils ProfileData diff --git a/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/lib/Transforms/Instrumentation/PGOInstrumentation.cpp new file mode 100644 index 000000000000..4b59b93b325f --- /dev/null +++ b/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -0,0 +1,718 @@ +//===-- PGOInstrumentation.cpp - MST-based PGO Instrumentation ------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements PGO instrumentation using a minimum spanning tree based +// on the following paper: +// [1] Donald E. Knuth, Francis R. Stevenson. Optimal measurement of points +// for program frequency counts. 
BIT Numerical Mathematics 1973, Volume 13, +// Issue 3, pp 313-322 +// The idea of the algorithm based on the fact that for each node (except for +// the entry and exit), the sum of incoming edge counts equals the sum of +// outgoing edge counts. The count of edge on spanning tree can be derived from +// those edges not on the spanning tree. Knuth proves this method instruments +// the minimum number of edges. +// +// The minimal spanning tree here is actually a maximum weight tree -- on-tree +// edges have higher frequencies (more likely to execute). The idea is to +// instrument those less frequently executed edges to reduce the runtime +// overhead of instrumented binaries. +// +// This file contains two passes: +// (1) Pass PGOInstrumentationGen which instruments the IR to generate edge +// count profile, and +// (2) Pass PGOInstrumentationUse which reads the edge count profile and +// annotates the branch weights. +// To get the precise counter information, These two passes need to invoke at +// the same compilation point (so they see the same IR). For pass +// PGOInstrumentationGen, the real work is done in instrumentOneFunc(). For +// pass PGOInstrumentationUse, the real work in done in class PGOUseFunc and +// the profile is opened in module level and passed to each PGOUseFunc instance. +// The shared code for PGOInstrumentationGen and PGOInstrumentationUse is put +// in class FuncPGOInstrumentation. +// +// Class PGOEdge represents a CFG edge and some auxiliary information. Class +// BBInfo contains auxiliary information for each BB. These two classes are used +// in pass PGOInstrumentationGen. Class PGOUseEdge and UseBBInfo are the derived +// class of PGOEdge and BBInfo, respectively. They contains extra data structure +// used in populating profile counters. +// The MST implementation is in Class CFGMST (CFGMST.h). +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Instrumentation.h" +#include "CFGMST.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/Analysis/BranchProbabilityInfo.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/ProfileData/InstrProfReader.h" +#include "llvm/Support/BranchProbability.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/JamCRC.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include +#include +#include + +using namespace llvm; + +#define DEBUG_TYPE "pgo-instrumentation" + +STATISTIC(NumOfPGOInstrument, "Number of edges instrumented."); +STATISTIC(NumOfPGOEdge, "Number of edges."); +STATISTIC(NumOfPGOBB, "Number of basic-blocks."); +STATISTIC(NumOfPGOSplit, "Number of critical edge splits."); +STATISTIC(NumOfPGOFunc, "Number of functions having valid profile counts."); +STATISTIC(NumOfPGOMismatch, "Number of functions having mismatch profile."); +STATISTIC(NumOfPGOMissing, "Number of functions without profile."); + +// Command line option to specify the file to read profile from. This is +// mainly used for testing. +static cl::opt + PGOTestProfileFile("pgo-test-profile-file", cl::init(""), cl::Hidden, + cl::value_desc("filename"), + cl::desc("Specify the path of profile data file. 
This is" + "mainly for test purpose.")); + +namespace { +class PGOInstrumentationGen : public ModulePass { +public: + static char ID; + + PGOInstrumentationGen() : ModulePass(ID) { + initializePGOInstrumentationGenPass(*PassRegistry::getPassRegistry()); + } + + const char *getPassName() const override { + return "PGOInstrumentationGenPass"; + } + +private: + bool runOnModule(Module &M) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + } +}; + +class PGOInstrumentationUse : public ModulePass { +public: + static char ID; + + // Provide the profile filename as the parameter. + PGOInstrumentationUse(std::string Filename = "") + : ModulePass(ID), ProfileFileName(Filename) { + if (!PGOTestProfileFile.empty()) + ProfileFileName = PGOTestProfileFile; + initializePGOInstrumentationUsePass(*PassRegistry::getPassRegistry()); + } + + const char *getPassName() const override { + return "PGOInstrumentationUsePass"; + } + +private: + std::string ProfileFileName; + std::unique_ptr PGOReader; + bool runOnModule(Module &M) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + } +}; +} // end anonymous namespace + +char PGOInstrumentationGen::ID = 0; +INITIALIZE_PASS_BEGIN(PGOInstrumentationGen, "pgo-instr-gen", + "PGO instrumentation.", false, false) +INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass) +INITIALIZE_PASS_END(PGOInstrumentationGen, "pgo-instr-gen", + "PGO instrumentation.", false, false) + +ModulePass *llvm::createPGOInstrumentationGenPass() { + return new PGOInstrumentationGen(); +} + +char PGOInstrumentationUse::ID = 0; +INITIALIZE_PASS_BEGIN(PGOInstrumentationUse, "pgo-instr-use", + "Read PGO instrumentation profile.", false, false) +INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass) +INITIALIZE_PASS_END(PGOInstrumentationUse, "pgo-instr-use", + "Read PGO instrumentation profile.", false, false) + +ModulePass *llvm::createPGOInstrumentationUsePass(StringRef Filename) { + return new PGOInstrumentationUse(Filename.str()); +} + +namespace { +/// \brief An MST based instrumentation for PGO +/// +/// Implements a Minimum Spanning Tree (MST) based instrumentation for PGO +/// in the function level. +struct PGOEdge { + // This class implements the CFG edges. Note the CFG can be a multi-graph. + // So there might be multiple edges with same SrcBB and DestBB. + const BasicBlock *SrcBB; + const BasicBlock *DestBB; + uint64_t Weight; + bool InMST; + bool Removed; + bool IsCritical; + PGOEdge(const BasicBlock *Src, const BasicBlock *Dest, unsigned W = 1) + : SrcBB(Src), DestBB(Dest), Weight(W), InMST(false), Removed(false), + IsCritical(false) {} + // Return the information string of an edge. + const std::string infoString() const { + return (Twine(Removed ? "-" : " ") + (InMST ? " " : "*") + + (IsCritical ? "c" : " ") + " W=" + Twine(Weight)).str(); + } +}; + +// This class stores the auxiliary information for each BB. +struct BBInfo { + BBInfo *Group; + uint32_t Index; + uint32_t Rank; + + BBInfo(unsigned IX) : Group(this), Index(IX), Rank(0) {} + + // Return the information string of this object. + const std::string infoString() const { + return (Twine("Index=") + Twine(Index)).str(); + } +}; + +// This class implements the CFG edges. Note the CFG can be a multi-graph. 
+template class FuncPGOInstrumentation { +private: + Function &F; + void computeCFGHash(); + +public: + std::string FuncName; + GlobalVariable *FuncNameVar; + // CFG hash value for this function. + uint64_t FunctionHash; + + // The Minimum Spanning Tree of function CFG. + CFGMST MST; + + // Give an edge, find the BB that will be instrumented. + // Return nullptr if there is no BB to be instrumented. + BasicBlock *getInstrBB(Edge *E); + + // Return the auxiliary BB information. + BBInfo &getBBInfo(const BasicBlock *BB) const { return MST.getBBInfo(BB); } + + // Dump edges and BB information. + void dumpInfo(std::string Str = "") const { + MST.dumpEdges(dbgs(), Twine("Dump Function ") + FuncName + " Hash: " + + Twine(FunctionHash) + "\t" + Str); + } + + FuncPGOInstrumentation(Function &Func, bool CreateGlobalVar = false, + BranchProbabilityInfo *BPI = nullptr, + BlockFrequencyInfo *BFI = nullptr) + : F(Func), FunctionHash(0), MST(F, BPI, BFI) { + FuncName = getPGOFuncName(F); + computeCFGHash(); + DEBUG(dumpInfo("after CFGMST")); + + NumOfPGOBB += MST.BBInfos.size(); + for (auto &E : MST.AllEdges) { + if (E->Removed) + continue; + NumOfPGOEdge++; + if (!E->InMST) + NumOfPGOInstrument++; + } + + if (CreateGlobalVar) + FuncNameVar = createPGOFuncNameVar(F, FuncName); + }; +}; + +// Compute Hash value for the CFG: the lower 32 bits are CRC32 of the index +// value of each BB in the CFG. The higher 32 bits record the number of edges. +template +void FuncPGOInstrumentation::computeCFGHash() { + std::vector Indexes; + JamCRC JC; + for (auto &BB : F) { + const TerminatorInst *TI = BB.getTerminator(); + for (unsigned I = 0, E = TI->getNumSuccessors(); I != E; ++I) { + BasicBlock *Succ = TI->getSuccessor(I); + uint32_t Index = getBBInfo(Succ).Index; + for (int J = 0; J < 4; J++) + Indexes.push_back((char)(Index >> (J * 8))); + } + } + JC.update(Indexes); + FunctionHash = (uint64_t)MST.AllEdges.size() << 32 | JC.getCRC(); +} + +// Given a CFG E to be instrumented, find which BB to place the instrumented +// code. The function will split the critical edge if necessary. +template +BasicBlock *FuncPGOInstrumentation::getInstrBB(Edge *E) { + if (E->InMST || E->Removed) + return nullptr; + + BasicBlock *SrcBB = const_cast(E->SrcBB); + BasicBlock *DestBB = const_cast(E->DestBB); + // For a fake edge, instrument the real BB. + if (SrcBB == nullptr) + return DestBB; + if (DestBB == nullptr) + return SrcBB; + + // Instrument the SrcBB if it has a single successor, + // otherwise, the DestBB if this is not a critical edge. + TerminatorInst *TI = SrcBB->getTerminator(); + if (TI->getNumSuccessors() <= 1) + return SrcBB; + if (!E->IsCritical) + return DestBB; + + // For a critical edge, we have to split. Instrument the newly + // created BB. + NumOfPGOSplit++; + DEBUG(dbgs() << "Split critical edge: " << getBBInfo(SrcBB).Index << " --> " + << getBBInfo(DestBB).Index << "\n"); + unsigned SuccNum = GetSuccessorNumber(SrcBB, DestBB); + BasicBlock *InstrBB = SplitCriticalEdge(TI, SuccNum); + assert(InstrBB && "Critical edge is not split"); + + E->Removed = true; + return InstrBB; +} + +// Visit all edge and instrument the edges not in MST. +// Critical edges will be split. 
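// Worked example of why only non-MST edges need counters (illustration only):
// in a simple if-then-else, once the entry count and the counter on the edge
// into the 'then' block are known, the count of the edge into the 'else' block
// follows by flow conservation as their difference, so it needs no counter of
// its own.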
+static void instrumentOneFunc(Function &F, Module *M, + BranchProbabilityInfo *BPI, + BlockFrequencyInfo *BFI) { + unsigned NumCounters = 0; + FuncPGOInstrumentation FuncInfo(F, true, BPI, BFI); + for (auto &E : FuncInfo.MST.AllEdges) { + if (!E->InMST && !E->Removed) + NumCounters++; + } + + uint32_t I = 0; + for (auto &E : FuncInfo.MST.AllEdges) { + BasicBlock *InstrBB = FuncInfo.getInstrBB(E.get()); + if (!InstrBB) + continue; + + IRBuilder<> Builder(InstrBB, InstrBB->getFirstInsertionPt()); + assert(Builder.GetInsertPoint() != InstrBB->end() && + "Cannot get the Instrumentation point"); + Type *I8PtrTy = Type::getInt8PtrTy(M->getContext()); + Builder.CreateCall( + Intrinsic::getDeclaration(M, Intrinsic::instrprof_increment), + {llvm::ConstantExpr::getBitCast(FuncInfo.FuncNameVar, I8PtrTy), + Builder.getInt64(FuncInfo.FunctionHash), Builder.getInt32(NumCounters), + Builder.getInt32(I++)}); + } +} + +// This class represents a CFG edge in profile use compilation. +struct PGOUseEdge : public PGOEdge { + bool CountValid; + uint64_t CountValue; + PGOUseEdge(const BasicBlock *Src, const BasicBlock *Dest, unsigned W = 1) + : PGOEdge(Src, Dest, W), CountValid(false), CountValue(0) {} + + // Set edge count value + void setEdgeCount(uint64_t Value) { + CountValue = Value; + CountValid = true; + } + + // Return the information string for this object. + const std::string infoString() const { + if (!CountValid) + return PGOEdge::infoString(); + return (Twine(PGOEdge::infoString()) + " Count=" + Twine(CountValue)).str(); + } +}; + +typedef SmallVector DirectEdges; + +// This class stores the auxiliary information for each BB. +struct UseBBInfo : public BBInfo { + uint64_t CountValue; + bool CountValid; + int32_t UnknownCountInEdge; + int32_t UnknownCountOutEdge; + DirectEdges InEdges; + DirectEdges OutEdges; + UseBBInfo(unsigned IX) + : BBInfo(IX), CountValue(0), CountValid(false), UnknownCountInEdge(0), + UnknownCountOutEdge(0) {} + UseBBInfo(unsigned IX, uint64_t C) + : BBInfo(IX), CountValue(C), CountValid(true), UnknownCountInEdge(0), + UnknownCountOutEdge(0) {} + + // Set the profile count value for this BB. + void setBBInfoCount(uint64_t Value) { + CountValue = Value; + CountValid = true; + } + + // Return the information string of this object. + const std::string infoString() const { + if (!CountValid) + return BBInfo::infoString(); + return (Twine(BBInfo::infoString()) + " Count=" + Twine(CountValue)).str(); + } +}; + +// Sum up the count values for all the edges. +static uint64_t sumEdgeCount(const ArrayRef Edges) { + uint64_t Total = 0; + for (auto &E : Edges) { + if (E->Removed) + continue; + Total += E->CountValue; + } + return Total; +} + +class PGOUseFunc { +private: + Function &F; + Module *M; + // This member stores the shared information with class PGOGenFunc. + FuncPGOInstrumentation FuncInfo; + + // Return the auxiliary BB information. + UseBBInfo &getBBInfo(const BasicBlock *BB) const { + return FuncInfo.getBBInfo(BB); + } + + // The maximum count value in the profile. This is only used in PGO use + // compilation. + uint64_t ProgramMaxCount; + + // Find the Instrumented BB and set the value. + void setInstrumentedCounts(const std::vector &CountFromProfile); + + // Set the edge counter value for the unknown edge -- there should be only + // one unknown edge. + void setEdgeCount(DirectEdges &Edges, uint64_t Value); + + // Return FuncName string; + const std::string getFuncName() const { return FuncInfo.FuncName; } + + // Set the hot/cold inline hints based on the count values. 
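  // Worked example (illustration only): with the 1/100 and 2/10000 thresholds
  // below and ProgramMaxCount = 1,000,000, a function is marked InlineHint when
  // its entry count is at least 10,000, and Cold when even its hottest block
  // executes at most 200 times.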
+ // FIXME: This function should be removed once the functionality in + // the inliner is implemented. + void applyFunctionAttributes(uint64_t EntryCount, uint64_t MaxCount) { + if (ProgramMaxCount == 0) + return; + // Threshold of the hot functions. + const BranchProbability HotFunctionThreshold(1, 100); + // Threshold of the cold functions. + const BranchProbability ColdFunctionThreshold(2, 10000); + if (EntryCount >= HotFunctionThreshold.scale(ProgramMaxCount)) + F.addFnAttr(llvm::Attribute::InlineHint); + else if (MaxCount <= ColdFunctionThreshold.scale(ProgramMaxCount)) + F.addFnAttr(llvm::Attribute::Cold); + } + +public: + PGOUseFunc(Function &Func, Module *Modu, BranchProbabilityInfo *BPI = nullptr, + BlockFrequencyInfo *BFI = nullptr) + : F(Func), M(Modu), FuncInfo(Func, false, BPI, BFI) {} + + // Read counts for the instrumented BB from profile. + bool readCounters(IndexedInstrProfReader *PGOReader); + + // Populate the counts for all BBs. + void populateCounters(); + + // Set the branch weights based on the count values. + void setBranchWeights(); +}; + +// Visit all the edges and assign the count value for the instrumented +// edges and the BB. +void PGOUseFunc::setInstrumentedCounts( + const std::vector &CountFromProfile) { + + // Use a worklist as we will update the vector during the iteration. + std::vector WorkList; + for (auto &E : FuncInfo.MST.AllEdges) + WorkList.push_back(E.get()); + + uint32_t I = 0; + for (auto &E : WorkList) { + BasicBlock *InstrBB = FuncInfo.getInstrBB(E); + if (!InstrBB) + continue; + uint64_t CountValue = CountFromProfile[I++]; + if (!E->Removed) { + getBBInfo(InstrBB).setBBInfoCount(CountValue); + E->setEdgeCount(CountValue); + continue; + } + + // Need to add two new edges. + BasicBlock *SrcBB = const_cast(E->SrcBB); + BasicBlock *DestBB = const_cast(E->DestBB); + // Add new edge of SrcBB->InstrBB. + PGOUseEdge &NewEdge = FuncInfo.MST.addEdge(SrcBB, InstrBB, 0); + NewEdge.setEdgeCount(CountValue); + // Add new edge of InstrBB->DestBB. + PGOUseEdge &NewEdge1 = FuncInfo.MST.addEdge(InstrBB, DestBB, 0); + NewEdge1.setEdgeCount(CountValue); + NewEdge1.InMST = true; + getBBInfo(InstrBB).setBBInfoCount(CountValue); + } +} + +// Set the count value for the unknown edge. There should be one and only one +// unknown edge in Edges vector. +void PGOUseFunc::setEdgeCount(DirectEdges &Edges, uint64_t Value) { + for (auto &E : Edges) { + if (E->CountValid) + continue; + E->setEdgeCount(Value); + + getBBInfo(E->SrcBB).UnknownCountOutEdge--; + getBBInfo(E->DestBB).UnknownCountInEdge--; + return; + } + llvm_unreachable("Cannot find the unknown count edge"); +} + +// Read the profile from ProfileFileName and assign the value to the +// instrumented BB and the edges. This function also updates ProgramMaxCount. +// Return true if the profile are successfully read, and false on errors. 
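// For orientation (no new functionality implied): the intended call sequence is
// the one used by setPGOCountOnFunc later in this file --
//   if (Func.readCounters(PGOReader)) {
//     Func.populateCounters();
//     Func.setBranchWeights();
//   }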
+bool PGOUseFunc::readCounters(IndexedInstrProfReader *PGOReader) { + auto &Ctx = M->getContext(); + ErrorOr Result = + PGOReader->getInstrProfRecord(FuncInfo.FuncName, FuncInfo.FunctionHash); + if (std::error_code EC = Result.getError()) { + if (EC == instrprof_error::unknown_function) + NumOfPGOMissing++; + else if (EC == instrprof_error::hash_mismatch || + EC == llvm::instrprof_error::malformed) + NumOfPGOMismatch++; + + std::string Msg = EC.message() + std::string(" ") + F.getName().str(); + Ctx.diagnose( + DiagnosticInfoPGOProfile(M->getName().data(), Msg, DS_Warning)); + return false; + } + std::vector &CountFromProfile = Result.get().Counts; + + NumOfPGOFunc++; + DEBUG(dbgs() << CountFromProfile.size() << " counts\n"); + uint64_t ValueSum = 0; + for (unsigned I = 0, S = CountFromProfile.size(); I < S; I++) { + DEBUG(dbgs() << " " << I << ": " << CountFromProfile[I] << "\n"); + ValueSum += CountFromProfile[I]; + } + + DEBUG(dbgs() << "SUM = " << ValueSum << "\n"); + + getBBInfo(nullptr).UnknownCountOutEdge = 2; + getBBInfo(nullptr).UnknownCountInEdge = 2; + + setInstrumentedCounts(CountFromProfile); + ProgramMaxCount = PGOReader->getMaximumFunctionCount(); + return true; +} + +// Populate the counters from instrumented BBs to all BBs. +// In the end of this operation, all BBs should have a valid count value. +void PGOUseFunc::populateCounters() { + // First set up Count variable for all BBs. + for (auto &E : FuncInfo.MST.AllEdges) { + if (E->Removed) + continue; + + const BasicBlock *SrcBB = E->SrcBB; + const BasicBlock *DestBB = E->DestBB; + UseBBInfo &SrcInfo = getBBInfo(SrcBB); + UseBBInfo &DestInfo = getBBInfo(DestBB); + SrcInfo.OutEdges.push_back(E.get()); + DestInfo.InEdges.push_back(E.get()); + SrcInfo.UnknownCountOutEdge++; + DestInfo.UnknownCountInEdge++; + + if (!E->CountValid) + continue; + DestInfo.UnknownCountInEdge--; + SrcInfo.UnknownCountOutEdge--; + } + + bool Changes = true; + unsigned NumPasses = 0; + while (Changes) { + NumPasses++; + Changes = false; + + // For efficient traversal, it's better to start from the end as most + // of the instrumented edges are at the end. + for (auto &BB : reverse(F)) { + UseBBInfo &Count = getBBInfo(&BB); + if (!Count.CountValid) { + if (Count.UnknownCountOutEdge == 0) { + Count.CountValue = sumEdgeCount(Count.OutEdges); + Count.CountValid = true; + Changes = true; + } else if (Count.UnknownCountInEdge == 0) { + Count.CountValue = sumEdgeCount(Count.InEdges); + Count.CountValid = true; + Changes = true; + } + } + if (Count.CountValid) { + if (Count.UnknownCountOutEdge == 1) { + uint64_t Total = Count.CountValue - sumEdgeCount(Count.OutEdges); + setEdgeCount(Count.OutEdges, Total); + Changes = true; + } + if (Count.UnknownCountInEdge == 1) { + uint64_t Total = Count.CountValue - sumEdgeCount(Count.InEdges); + setEdgeCount(Count.InEdges, Total); + Changes = true; + } + } + } + } + + DEBUG(dbgs() << "Populate counts in " << NumPasses << " passes.\n"); + // Assert every BB has a valid counter. + uint64_t FuncEntryCount = getBBInfo(&*F.begin()).CountValue; + uint64_t FuncMaxCount = FuncEntryCount; + for (auto &BB : F) { + assert(getBBInfo(&BB).CountValid && "BB count is not valid"); + uint64_t Count = getBBInfo(&BB).CountValue; + if (Count > FuncMaxCount) + FuncMaxCount = Count; + } + applyFunctionAttributes(FuncEntryCount, FuncMaxCount); + + DEBUG(FuncInfo.dumpInfo("after reading profile.")); +} + +// Assign the scaled count values to the BB with multiple out edges. 
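// On the scaling below (assumed intent of calculateCountScale/scaleBranchCount):
// MD_prof branch weights are 32-bit, so the counts are divided down only when
// the largest out-edge count would not fit in a uint32_t. For example, out-edge
// counts of 900 and 100 are emitted as weights 900 and 100 unchanged.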
+void PGOUseFunc::setBranchWeights() { + // Generate MD_prof metadata for every branch instruction. + DEBUG(dbgs() << "\nSetting branch weights.\n"); + MDBuilder MDB(M->getContext()); + for (auto &BB : F) { + TerminatorInst *TI = BB.getTerminator(); + if (TI->getNumSuccessors() < 2) + continue; + if (!isa(TI) && !isa(TI)) + continue; + if (getBBInfo(&BB).CountValue == 0) + continue; + + // We have a non-zero Branch BB. + const UseBBInfo &BBCountInfo = getBBInfo(&BB); + unsigned Size = BBCountInfo.OutEdges.size(); + SmallVector EdgeCounts(Size, 0); + uint64_t MaxCount = 0; + for (unsigned s = 0; s < Size; s++) { + const PGOUseEdge *E = BBCountInfo.OutEdges[s]; + const BasicBlock *SrcBB = E->SrcBB; + const BasicBlock *DestBB = E->DestBB; + if (DestBB == 0) + continue; + unsigned SuccNum = GetSuccessorNumber(SrcBB, DestBB); + uint64_t EdgeCount = E->CountValue; + if (EdgeCount > MaxCount) + MaxCount = EdgeCount; + EdgeCounts[SuccNum] = EdgeCount; + } + assert(MaxCount > 0 && "Bad max count"); + uint64_t Scale = calculateCountScale(MaxCount); + SmallVector Weights; + for (const auto &ECI : EdgeCounts) + Weights.push_back(scaleBranchCount(ECI, Scale)); + + TI->setMetadata(llvm::LLVMContext::MD_prof, + MDB.createBranchWeights(Weights)); + DEBUG(dbgs() << "Weight is: "; + for (const auto &W : Weights) { dbgs() << W << " "; } + dbgs() << "\n";); + } +} +} // end anonymous namespace + +bool PGOInstrumentationGen::runOnModule(Module &M) { + for (auto &F : M) { + if (F.isDeclaration()) + continue; + BranchProbabilityInfo *BPI = + &(getAnalysis(F).getBPI()); + BlockFrequencyInfo *BFI = + &(getAnalysis(F).getBFI()); + instrumentOneFunc(F, &M, BPI, BFI); + } + return true; +} + +static void setPGOCountOnFunc(PGOUseFunc &Func, + IndexedInstrProfReader *PGOReader) { + if (Func.readCounters(PGOReader)) { + Func.populateCounters(); + Func.setBranchWeights(); + } +} + +bool PGOInstrumentationUse::runOnModule(Module &M) { + DEBUG(dbgs() << "Read in profile counters: "); + auto &Ctx = M.getContext(); + // Read the counter array from file. + auto ReaderOrErr = IndexedInstrProfReader::create(ProfileFileName); + if (std::error_code EC = ReaderOrErr.getError()) { + Ctx.diagnose( + DiagnosticInfoPGOProfile(ProfileFileName.data(), EC.message())); + return false; + } + + PGOReader = std::move(ReaderOrErr.get()); + if (!PGOReader) { + Ctx.diagnose(DiagnosticInfoPGOProfile(ProfileFileName.data(), + "Cannot get PGOReader")); + return false; + } + + for (auto &F : M) { + if (F.isDeclaration()) + continue; + BranchProbabilityInfo *BPI = + &(getAnalysis(F).getBPI()); + BlockFrequencyInfo *BFI = + &(getAnalysis(F).getBFI()); + PGOUseFunc Func(F, &M, BPI, BFI); + setPGOCountOnFunc(Func, PGOReader.get()); + } + return true; +} diff --git a/lib/Transforms/Scalar/DeadStoreElimination.cpp b/lib/Transforms/Scalar/DeadStoreElimination.cpp index 36ad0a5f7b91..e6996ab97a8c 100644 --- a/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -7,11 +7,8 @@ // //===----------------------------------------------------------------------===// // -// This file implements a trivial dead store elimination that only considers -// basic-block local redundant stores. -// -// FIXME: This should eventually be extended to be a post-dominator tree -// traversal. Doing so would be pretty trivial. +// This file implements dead store elimination that considers redundant stores +// within a basic-block as well as across basic blocks in a reverse CFG order. 
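// For example (illustration only), the first store below is dead because the
// store in %merge overwrites the same location on the only path out of
// %if.then, with no intervening read:
//   if.then:
//     store i32 1, i32* %p
//     br label %merge
//   merge:
//     store i32 2, i32* %p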
// //===----------------------------------------------------------------------===// @@ -44,6 +41,13 @@ using namespace llvm; STATISTIC(NumRedundantStores, "Number of redundant stores deleted"); STATISTIC(NumFastStores, "Number of stores deleted"); STATISTIC(NumFastOther , "Number of other instrs removed"); +STATISTIC(NumNonLocalStores, "Number of non-local stores deleted"); + +static cl::opt EnableNonLocalDSE("enable-nonlocal-dse", cl::init(false)); + +/// MaxNonLocalAttempts is an arbitrary threshold that provides +/// an early opportunitiy for bail out to control compile time. +static const unsigned MaxNonLocalAttempts = 100; namespace { struct DSE : public FunctionPass { @@ -80,6 +84,7 @@ namespace { bool runOnBasicBlock(BasicBlock &BB); bool MemoryIsNotModifiedBetween(Instruction *FirstI, Instruction *SecondI); bool HandleFree(CallInst *F); + bool handleNonLocalDependency(Instruction *Inst); bool handleEndBlock(BasicBlock &BB); void RemoveAccessedObjects(const MemoryLocation &LoadedLoc, SmallSetVector &DeadStackObjects, @@ -485,6 +490,7 @@ static bool isPossibleSelfRead(Instruction *Inst, bool DSE::runOnBasicBlock(BasicBlock &BB) { const DataLayout &DL = BB.getModule()->getDataLayout(); bool MadeChange = false; + unsigned NumNonLocalAttempts = 0; // Do a top-down walk on the BB. for (BasicBlock::iterator BBI = BB.begin(), BBE = BB.end(); BBI != BBE; ) { @@ -554,99 +560,101 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { MemDepResult InstDep = MD->getDependency(Inst); - // Ignore any store where we can't find a local dependence. - // FIXME: cross-block DSE would be fun. :) - if (!InstDep.isDef() && !InstDep.isClobber()) - continue; - - // Figure out what location is being stored to. - MemoryLocation Loc = getLocForWrite(Inst, *AA); + if (InstDep.isDef() || InstDep.isClobber()) { + // Figure out what location is being stored to. + MemoryLocation Loc = getLocForWrite(Inst, *AA); - // If we didn't get a useful location, fail. - if (!Loc.Ptr) - continue; - - while (InstDep.isDef() || InstDep.isClobber()) { - // Get the memory clobbered by the instruction we depend on. MemDep will - // skip any instructions that 'Loc' clearly doesn't interact with. If we - // end up depending on a may- or must-aliased load, then we can't optimize - // away the store and we bail out. However, if we depend on on something - // that overwrites the memory location we *can* potentially optimize it. - // - // Find out what memory location the dependent instruction stores. - Instruction *DepWrite = InstDep.getInst(); - MemoryLocation DepLoc = getLocForWrite(DepWrite, *AA); - // If we didn't get a useful location, or if it isn't a size, bail out. - if (!DepLoc.Ptr) - break; + // If we didn't get a useful location, fail. + if (!Loc.Ptr) + continue; - // If we find a write that is a) removable (i.e., non-volatile), b) is - // completely obliterated by the store to 'Loc', and c) which we know that - // 'Inst' doesn't load from, then we can remove it. - if (isRemovable(DepWrite) && - !isPossibleSelfRead(Inst, Loc, DepWrite, *TLI, *AA)) { - int64_t InstWriteOffset, DepWriteOffset; - OverwriteResult OR = - isOverwrite(Loc, DepLoc, DL, *TLI, DepWriteOffset, InstWriteOffset); - if (OR == OverwriteComplete) { - DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " - << *DepWrite << "\n KILLER: " << *Inst << '\n'); - - // Delete the store and now-dead instructions that feed it. 
- DeleteDeadInstruction(DepWrite, *MD, *TLI); - ++NumFastStores; - MadeChange = true; - - // DeleteDeadInstruction can delete the current instruction in loop - // cases, reset BBI. - BBI = Inst->getIterator(); - if (BBI != BB.begin()) - --BBI; + while (InstDep.isDef() || InstDep.isClobber()) { + // Get the memory clobbered by the instruction we depend on. MemDep + // will skip any instructions that 'Loc' clearly doesn't interact with. + // If we end up depending on a may- or must-aliased load, then we can't + // optimize away the store and we bail out. However, if we depend on on + // something that overwrites the memory location we *can* potentially + // optimize it. + // + // Find out what memory location the dependent instruction stores. + Instruction *DepWrite = InstDep.getInst(); + MemoryLocation DepLoc = getLocForWrite(DepWrite, *AA); + // If we didn't get a useful location, or if it isn't a size, bail out. + if (!DepLoc.Ptr) break; - } else if (OR == OverwriteEnd && isShortenable(DepWrite)) { - // TODO: base this on the target vector size so that if the earlier - // store was too small to get vector writes anyway then its likely - // a good idea to shorten it - // Power of 2 vector writes are probably always a bad idea to optimize - // as any store/memset/memcpy is likely using vector instructions so - // shortening it to not vector size is likely to be slower - MemIntrinsic* DepIntrinsic = cast(DepWrite); - unsigned DepWriteAlign = DepIntrinsic->getAlignment(); - if (llvm::isPowerOf2_64(InstWriteOffset) || - ((DepWriteAlign != 0) && InstWriteOffset % DepWriteAlign == 0)) { - - DEBUG(dbgs() << "DSE: Remove Dead Store:\n OW END: " - << *DepWrite << "\n KILLER (offset " - << InstWriteOffset << ", " - << DepLoc.Size << ")" - << *Inst << '\n'); - - Value* DepWriteLength = DepIntrinsic->getLength(); - Value* TrimmedLength = ConstantInt::get(DepWriteLength->getType(), - InstWriteOffset - - DepWriteOffset); - DepIntrinsic->setLength(TrimmedLength); + + // If we find a write that is a) removable (i.e., non-volatile), b) is + // completely obliterated by the store to 'Loc', and c) which we know + // that 'Inst' doesn't load from, then we can remove it. + if (isRemovable(DepWrite) && + !isPossibleSelfRead(Inst, Loc, DepWrite, *TLI, *AA)) { + int64_t InstWriteOffset, DepWriteOffset; + OverwriteResult OR = isOverwrite(Loc, DepLoc, DL, *TLI, + DepWriteOffset, InstWriteOffset); + if (OR == OverwriteComplete) { + DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " << *DepWrite + << "\n KILLER: " << *Inst << '\n'); + + // Delete the store and now-dead instructions that feed it. + DeleteDeadInstruction(DepWrite, *MD, *TLI); + ++NumFastStores; MadeChange = true; + + // DeleteDeadInstruction can delete the current instruction in loop + // cases, reset BBI. + BBI = Inst->getIterator(); + if (BBI != BB.begin()) + --BBI; + break; + } else if (OR == OverwriteEnd && isShortenable(DepWrite)) { + // TODO: base this on the target vector size so that if the earlier + // store was too small to get vector writes anyway then its likely a + // good idea to shorten it. + + // Power of 2 vector writes are probably always a bad idea to + // optimize as any store/memset/memcpy is likely using vector + // instructions so shortening it to not vector size is likely to be + // slower. 
+ MemIntrinsic *DepIntrinsic = cast(DepWrite); + unsigned DepWriteAlign = DepIntrinsic->getAlignment(); + if (llvm::isPowerOf2_64(InstWriteOffset) || + ((DepWriteAlign != 0) && + InstWriteOffset % DepWriteAlign == 0)) { + + DEBUG(dbgs() << "DSE: Remove Dead Store:\n OW END: " << *DepWrite + << "\n KILLER (offset " << InstWriteOffset << ", " + << DepLoc.Size << ")" << *Inst << '\n'); + + Value *DepWriteLength = DepIntrinsic->getLength(); + Value *TrimmedLength = ConstantInt::get( + DepWriteLength->getType(), InstWriteOffset - DepWriteOffset); + DepIntrinsic->setLength(TrimmedLength); + MadeChange = true; + } } } - } - // If this is a may-aliased store that is clobbering the store value, we - // can keep searching past it for another must-aliased pointer that stores - // to the same location. For example, in: - // store -> P - // store -> Q - // store -> P - // we can remove the first store to P even though we don't know if P and Q - // alias. - if (DepWrite == &BB.front()) break; - - // Can't look past this instruction if it might read 'Loc'. - if (AA->getModRefInfo(DepWrite, Loc) & MRI_Ref) - break; + // If this is a may-aliased store that is clobbering the store value, we + // can keep searching past it for another must-aliased pointer that + // stores to the same location. For example, in + // store -> P + // store -> Q + // store -> P + // we can remove the first store to P even though we don't know if P and + // Q alias. + if (DepWrite == &BB.front()) + break; + + // Can't look past this instruction if it might read 'Loc'. + if (AA->getModRefInfo(DepWrite, Loc) & MRI_Ref) + break; - InstDep = MD->getPointerDependencyFrom(Loc, false, - DepWrite->getIterator(), &BB); + InstDep = MD->getPointerDependencyFrom(Loc, false, + DepWrite->getIterator(), &BB); + } + } else if (EnableNonLocalDSE && InstDep.isNonLocal()) { // DSE across BB + if (++NumNonLocalAttempts < MaxNonLocalAttempts) + MadeChange |= handleNonLocalDependency(Inst); } } @@ -658,6 +666,147 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { return MadeChange; } +/// A helper for handleNonLocalDependency() function to find all blocks +/// that lead to the input block BB and append them to the output PredBlocks. +/// PredBlocks will include not only predecessors of BB that unconditionally +/// lead to BB but also: +/// - single-block loops that lead to BB, and +/// - if-blocks for which one edge goes to BB and the other edge goes to +/// a block in the input SafeBlocks. +/// PredBlocks will not include blocks unreachable from the entry block, nor +/// blocks that form cycles with BB. +static void findSafePreds(SmallVectorImpl &PredBlocks, + SmallSetVector &SafeBlocks, + BasicBlock *BB, DominatorTree *DT) { + for (auto *Pred : predecessors(BB)) { + if (Pred == BB) + continue; + // The second check below prevents adding blocks that form a cycle with BB + // in order to avoid potential problems due to MemoryDependenceAnalysis, + // isOverwrite, etc. being not loop-aware. 
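    // For illustration of the overall selection: a predecessor that branches
    // only to BB is always added; an if-block whose other successor is already
    // in SafeBlocks is added; one whose other successor is not known to be safe
    // is skipped, since a store there could still be read along that other path.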
+ if (!DT->isReachableFromEntry(Pred) || DT->dominates(BB, Pred)) + continue; + + bool PredIsSafe = true; + for (auto *Succ : successors(Pred)) { + if (Succ == BB || Succ == Pred) // shortcut, BB should be in SafeBlocks + continue; + if (!SafeBlocks.count(Succ)) { + PredIsSafe = false; + break; + } + } + if (PredIsSafe) + PredBlocks.push_back(Pred); + } +} + +static bool underlyingObjectsDoNotAlias(StoreInst *SI, LoadInst *LI, + const DataLayout &DL, + AliasAnalysis &AA) { + Value *AObj = GetUnderlyingObject(SI->getPointerOperand(), DL); + SmallVector Pointers; + GetUnderlyingObjects(LI->getPointerOperand(), Pointers, DL); + + for (auto *BObj : Pointers) { + if (!AA.isNoAlias(AObj, DL.getTypeStoreSize(AObj->getType()), BObj, + DL.getTypeStoreSize(BObj->getType()))) + return false; + } + return true; +} + +/// handleNonLocalDependency - Handle a non-local dependency on +/// the input instruction Inst located in BB in attempt to remove +/// redundant stores outside BB. +bool DSE::handleNonLocalDependency(Instruction *Inst) { + auto *SI = dyn_cast(Inst); + if (!SI) + return false; + // Get the location being stored to. + // If we don't get a useful location, bail out. + MemoryLocation Loc = getLocForWrite(SI, *AA); + if (!Loc.Ptr) + return false; + + bool MadeChange = false; + BasicBlock *BB = Inst->getParent(); + const DataLayout &DL = BB->getModule()->getDataLayout(); + + // Worklist of predecessor blocks of BB + SmallVector Blocks; + // Keep track of all predecessor blocks that are safe to search through + SmallSetVector SafeBlocks; + SafeBlocks.insert(BB); + findSafePreds(Blocks, SafeBlocks, BB, DT); + + while (!Blocks.empty()) { + BasicBlock *PB = Blocks.pop_back_val(); + MemDepResult Dep = + MD->getPointerDependencyFrom(Loc, false, PB->end(), PB, SI); + while (Dep.isDef() || Dep.isClobber()) { + Instruction *Dependency = Dep.getInst(); + + // Filter out false dependency from a load to SI looking through phis. + if (auto *LI = dyn_cast(Dependency)) { + if (underlyingObjectsDoNotAlias(SI, LI, DL, *AA)) { + Dep = MD->getPointerDependencyFrom(Loc, false, + Dependency->getIterator(), PB, SI); + continue; + } + } + + // If we don't get a useful location for the dependent instruction, + // it doesn't write memory, it is not removable, or it might read Loc, + // then bail out. + MemoryLocation DepLoc = getLocForWrite(Dependency, *AA); + if (!DepLoc.Ptr || !hasMemoryWrite(Dependency, *TLI) || + !isRemovable(Dependency) || + (AA->getModRefInfo(Dependency, Loc) & MRI_Ref)) + break; + + // Don't remove a store within single-block loops; + // we need more analysis: e.g. looking for an interferring load + // above the store within the loop, etc. + bool SingleBlockLoop = false; + for (auto I = succ_begin(PB), E = succ_end(PB); I != E; ++I) { + BasicBlock *Succ = *I; + if (Succ == PB) { + SingleBlockLoop = true; + break; + } + } + if (SingleBlockLoop) + break; + + int64_t InstWriteOffset, DepWriteOffset; + OverwriteResult OR = + isOverwrite(Loc, DepLoc, DL, *TLI, DepWriteOffset, InstWriteOffset); + if (OR == OverwriteComplete) { + DEBUG(dbgs() << "DSE: Remove Non-Local Dead Store:\n DEAD: " + << *Dependency << "\n KILLER: " << *SI << '\n'); + + // Delete redundant store and now-dead instructions that feed it. 
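        // Deleting Dependency invalidates its iterator, so remember the
        // instruction after it and restart the memory-dependence query from
        // there on the next loop iteration.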
+ auto Next = std::next(Dependency->getIterator()); + DeleteDeadInstruction(Dependency, *MD, *TLI); + ++NumNonLocalStores; + MadeChange = true; + Dep = MD->getPointerDependencyFrom(Loc, false, Next, PB, SI); + continue; + } + // TODO: attempt shortening of Dependency inst as in the local case + break; + } + + if (Dep.isNonLocal()) { + SafeBlocks.insert(PB); + findSafePreds(Blocks, SafeBlocks, PB, DT); + } + } + + return MadeChange; +} + /// Returns true if the memory which is accessed by the second instruction is not /// modified between the first and the second instruction. /// Precondition: Second instruction must be dominated by the first @@ -706,10 +855,10 @@ bool DSE::MemoryIsNotModifiedBetween(Instruction *FirstI, if (B != FirstBB) { assert(B != &FirstBB->getParent()->getEntryBlock() && "Should not hit the entry block because SI must be dominated by LI"); - for (auto PredI = pred_begin(B), PE = pred_end(B); PredI != PE; ++PredI) { - if (!Visited.insert(*PredI).second) + for (auto *PredI : predecessors(B)) { + if (!Visited.insert(PredI).second) continue; - WorkList.push_back(*PredI); + WorkList.push_back(PredI); } } } diff --git a/lib/Transforms/Scalar/EarlyCSE.cpp b/lib/Transforms/Scalar/EarlyCSE.cpp index de539d53a4f5..6fa194e57092 100644 --- a/lib/Transforms/Scalar/EarlyCSE.cpp +++ b/lib/Transforms/Scalar/EarlyCSE.cpp @@ -281,21 +281,31 @@ class EarlyCSE { /// that dominated values can succeed in their lookup. ScopedHTType AvailableValues; - /// \brief A scoped hash table of the current values of loads. + /// A scoped hash table of the current values of previously encounted memory + /// locations. /// - /// This allows us to get efficient access to dominating loads when we have - /// a fully redundant load. In addition to the most recent load, we keep - /// track of a generation count of the read, which is compared against the - /// current generation count. The current generation count is incremented + /// This allows us to get efficient access to dominating loads or stores when + /// we have a fully redundant load. In addition to the most recent load, we + /// keep track of a generation count of the read, which is compared against + /// the current generation count. The current generation count is incremented /// after every possibly writing memory operation, which ensures that we only - /// CSE loads with other loads that have no intervening store. + /// CSE loads with other loads that have no intervening store. Ordering + /// events (such as fences or atomic instructions) increment the generation + /// count as well; essentially, we model these as writes to all possible + /// locations. Note that atomic and/or volatile loads and stores can be + /// present the table; it is the responsibility of the consumer to inspect + /// the atomicity/volatility if needed. 
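  /// Worked example of the generation scheme (illustration only):
  ///   %v1 = load i32, i32* %p        ; recorded at generation G
  ///   call void @clobber()           ; may write memory, generation becomes G+1
  ///   %v2 = load i32, i32* %p        ; found at generation G != G+1, not CSE'd
  /// Without the intervening call, %v2 would simply be replaced by %v1.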
struct LoadValue { Value *Data; unsigned Generation; int MatchingId; - LoadValue() : Data(nullptr), Generation(0), MatchingId(-1) {} - LoadValue(Value *Data, unsigned Generation, unsigned MatchingId) - : Data(Data), Generation(Generation), MatchingId(MatchingId) {} + bool IsAtomic; + LoadValue() + : Data(nullptr), Generation(0), MatchingId(-1), IsAtomic(false) {} + LoadValue(Value *Data, unsigned Generation, unsigned MatchingId, + bool IsAtomic) + : Data(Data), Generation(Generation), MatchingId(MatchingId), + IsAtomic(IsAtomic) {} }; typedef RecyclingAllocator> @@ -388,57 +398,100 @@ class EarlyCSE { class ParseMemoryInst { public: ParseMemoryInst(Instruction *Inst, const TargetTransformInfo &TTI) - : Load(false), Store(false), Vol(false), MayReadFromMemory(false), - MayWriteToMemory(false), MatchingId(-1), Ptr(nullptr) { - MayReadFromMemory = Inst->mayReadFromMemory(); - MayWriteToMemory = Inst->mayWriteToMemory(); - if (IntrinsicInst *II = dyn_cast(Inst)) { - MemIntrinsicInfo Info; - if (!TTI.getTgtMemIntrinsic(II, Info)) - return; - if (Info.NumMemRefs == 1) { - Store = Info.WriteMem; - Load = Info.ReadMem; - MatchingId = Info.MatchingId; - MayReadFromMemory = Info.ReadMem; - MayWriteToMemory = Info.WriteMem; - Vol = Info.Vol; - Ptr = Info.PtrVal; - } - } else if (LoadInst *LI = dyn_cast(Inst)) { - Load = true; - Vol = !LI->isSimple(); - Ptr = LI->getPointerOperand(); + : IsTargetMemInst(false), Inst(Inst) { + if (IntrinsicInst *II = dyn_cast(Inst)) + if (TTI.getTgtMemIntrinsic(II, Info) && Info.NumMemRefs == 1) + IsTargetMemInst = true; + } + bool isLoad() const { + if (IsTargetMemInst) return Info.ReadMem; + return isa(Inst); + } + bool isStore() const { + if (IsTargetMemInst) return Info.WriteMem; + return isa(Inst); + } + bool isSimple() const { + if (IsTargetMemInst) return Info.IsSimple; + if (LoadInst *LI = dyn_cast(Inst)) { + return LI->isSimple(); + } else if (StoreInst *SI = dyn_cast(Inst)) { + return SI->isSimple(); + } + return Inst->isAtomic(); + } + bool isAtomic() const { + if (IsTargetMemInst) { + assert(Info.IsSimple && "need to refine IsSimple in TTI"); + return false; + } + return Inst->isAtomic(); + } + bool isUnordered() const { + if (IsTargetMemInst) { + assert(Info.IsSimple && "need to refine IsSimple in TTI"); + return true; + } + if (LoadInst *LI = dyn_cast(Inst)) { + return LI->isUnordered(); + } else if (StoreInst *SI = dyn_cast(Inst)) { + return SI->isUnordered(); + } + // Conservative answer + return !Inst->isAtomic(); + } + + bool isVolatile() const { + if (IsTargetMemInst) { + assert(Info.IsSimple && "need to refine IsSimple in TTI"); + return false; + } + if (LoadInst *LI = dyn_cast(Inst)) { + return LI->isVolatile(); } else if (StoreInst *SI = dyn_cast(Inst)) { - Store = true; - Vol = !SI->isSimple(); - Ptr = SI->getPointerOperand(); + return SI->isVolatile(); } + // Conservative answer + return true; } - bool isLoad() const { return Load; } - bool isStore() const { return Store; } - bool isVolatile() const { return Vol; } + + bool isMatchingMemLoc(const ParseMemoryInst &Inst) const { - return Ptr == Inst.Ptr && MatchingId == Inst.MatchingId; + return (getPointerOperand() == Inst.getPointerOperand() && + getMatchingId() == Inst.getMatchingId()); } - bool isValid() const { return Ptr != nullptr; } - int getMatchingId() const { return MatchingId; } - Value *getPtr() const { return Ptr; } - bool mayReadFromMemory() const { return MayReadFromMemory; } - bool mayWriteToMemory() const { return MayWriteToMemory; } + bool isValid() const { return 
getPointerOperand() != nullptr; } - private: - bool Load; - bool Store; - bool Vol; - bool MayReadFromMemory; - bool MayWriteToMemory; // For regular (non-intrinsic) loads/stores, this is set to -1. For // intrinsic loads/stores, the id is retrieved from the corresponding // field in the MemIntrinsicInfo structure. That field contains // non-negative values only. - int MatchingId; - Value *Ptr; + int getMatchingId() const { + if (IsTargetMemInst) return Info.MatchingId; + return -1; + } + Value *getPointerOperand() const { + if (IsTargetMemInst) return Info.PtrVal; + if (LoadInst *LI = dyn_cast(Inst)) { + return LI->getPointerOperand(); + } else if (StoreInst *SI = dyn_cast(Inst)) { + return SI->getPointerOperand(); + } + return nullptr; + } + bool mayReadFromMemory() const { + if (IsTargetMemInst) return Info.ReadMem; + return Inst->mayReadFromMemory(); + } + bool mayWriteToMemory() const { + if (IsTargetMemInst) return Info.WriteMem; + return Inst->mayWriteToMemory(); + } + + private: + bool IsTargetMemInst; + MemIntrinsicInfo Info; + Instruction *Inst; }; bool processNode(DomTreeNode *Node); @@ -554,20 +607,22 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { ParseMemoryInst MemInst(Inst, TTI); // If this is a non-volatile load, process it. if (MemInst.isValid() && MemInst.isLoad()) { - // Ignore volatile loads. - if (MemInst.isVolatile()) { + // (conservatively) we can't peak past the ordering implied by this + // operation, but we can add this load to our set of available values + if (MemInst.isVolatile() || !MemInst.isUnordered()) { LastStore = nullptr; - // Don't CSE across synchronization boundaries. - if (Inst->mayWriteToMemory()) - ++CurrentGeneration; - continue; + ++CurrentGeneration; } // If we have an available version of this load, and if it is the right // generation, replace this instruction. - LoadValue InVal = AvailableLoads.lookup(MemInst.getPtr()); + LoadValue InVal = AvailableLoads.lookup(MemInst.getPointerOperand()); if (InVal.Data != nullptr && InVal.Generation == CurrentGeneration && - InVal.MatchingId == MemInst.getMatchingId()) { + InVal.MatchingId == MemInst.getMatchingId() && + // We don't yet handle removing loads with ordering of any kind. + !MemInst.isVolatile() && MemInst.isUnordered() && + // We can't replace an atomic load with one which isn't also atomic. + InVal.IsAtomic >= MemInst.isAtomic()) { Value *Op = getOrCreateResult(InVal.Data, Inst->getType()); if (Op != nullptr) { DEBUG(dbgs() << "EarlyCSE CSE LOAD: " << *Inst @@ -583,8 +638,9 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { // Otherwise, remember that we have this instruction. AvailableLoads.insert( - MemInst.getPtr(), - LoadValue(Inst, CurrentGeneration, MemInst.getMatchingId())); + MemInst.getPointerOperand(), + LoadValue(Inst, CurrentGeneration, MemInst.getMatchingId(), + MemInst.isAtomic())); LastStore = nullptr; continue; } @@ -639,9 +695,12 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { if (MemInst.isValid() && MemInst.isStore()) { // We do a trivial form of DSE if there are two stores to the same - // location with no intervening loads. Delete the earlier store. + // location with no intervening loads. Delete the earlier store. Note + // that we can delete an earlier simple store even if the following one + // is ordered/volatile/atomic store. 
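      // For example (illustration only), the first store below is deleted in
      // favor of the second, even though the second one is atomic:
      //   store i32 0, i32* %p
      //   store atomic i32 1, i32* %p seq_cst, align 4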
if (LastStore) { ParseMemoryInst LastStoreMemInst(LastStore, TTI); + assert(LastStoreMemInst.isSimple() && "Violated invariant"); if (LastStoreMemInst.isMatchingMemLoc(MemInst)) { DEBUG(dbgs() << "EarlyCSE DEAD STORE: " << *LastStore << " due to: " << *Inst << '\n'); @@ -659,12 +718,18 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { // to non-volatile loads, so we don't have to check for volatility of // the store. AvailableLoads.insert( - MemInst.getPtr(), - LoadValue(Inst, CurrentGeneration, MemInst.getMatchingId())); - - // Remember that this was the last store we saw for DSE. - if (!MemInst.isVolatile()) + MemInst.getPointerOperand(), + LoadValue(Inst, CurrentGeneration, MemInst.getMatchingId(), + MemInst.isAtomic())); + + // Remember that this was the last normal store we saw for DSE. + // Note that we can't delete an earlier atomic or volatile store in + // favor of a later one which isn't. We could in principle remove an + // earlier unordered store if the later one is also unordered. + if (MemInst.isSimple()) LastStore = Inst; + else + LastStore = nullptr; } } } diff --git a/lib/Transforms/Scalar/Float2Int.cpp b/lib/Transforms/Scalar/Float2Int.cpp index a9a4a24f8e93..7f5d78656b50 100644 --- a/lib/Transforms/Scalar/Float2Int.cpp +++ b/lib/Transforms/Scalar/Float2Int.cpp @@ -131,6 +131,8 @@ static Instruction::BinaryOps mapBinOpcode(unsigned Opcode) { // integer domain. void Float2Int::findRoots(Function &F, SmallPtrSet &Roots) { for (auto &I : instructions(F)) { + if (isa(I.getType())) + continue; switch (I.getOpcode()) { default: break; case Instruction::FPToUI: diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp index 4ea92df9924f..c3db22dba56f 100644 --- a/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -50,6 +50,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/SimplifyIndVar.h" using namespace llvm; @@ -215,7 +216,7 @@ bool IndVarSimplify::isValidRewrite(Value *FromVal, Value *ToVal) { /// loop. For PHI nodes, there may be multiple uses, so compute the nearest /// common dominator for the incoming blocks. 
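/// For example (illustration only), if the user is
///   %phi = phi i32 [ %def, %latch ], [ 0, %preheader ]
/// the rewritten value for %def must be materialized at the end of %latch, not
/// next to the phi itself; when several incoming blocks use %def, their nearest
/// common dominator is used instead.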
static Instruction *getInsertPointForUses(Instruction *User, Value *Def, - DominatorTree *DT) { + DominatorTree *DT, LoopInfo *LI) { PHINode *PHI = dyn_cast(User); if (!PHI) return User; @@ -234,10 +235,21 @@ static Instruction *getInsertPointForUses(Instruction *User, Value *Def, InsertPt = InsertBB->getTerminator(); } assert(InsertPt && "Missing phi operand"); - assert((!isa(Def) || - DT->dominates(cast(Def), InsertPt)) && - "def does not dominate all uses"); - return InsertPt; + + auto *DefI = dyn_cast(Def); + if (!DefI) + return InsertPt; + + assert(DT->dominates(DefI, InsertPt) && "def does not dominate all uses"); + + auto *L = LI->getLoopFor(DefI->getParent()); + assert(!L || L->contains(LI->getLoopFor(InsertPt->getParent()))); + + for (auto *DTN = (*DT)[InsertPt->getParent()]; DTN; DTN = DTN->getIDom()) + if (LI->getLoopFor(DTN->getBlock()) == L) + return DTN->getBlock()->getTerminator(); + + llvm_unreachable("DefI dominates InsertPt!"); } //===----------------------------------------------------------------------===// @@ -528,8 +540,8 @@ Value *IndVarSimplify::expandSCEVIfNeeded(SCEVExpander &Rewriter, const SCEV *S, /// able to brute-force evaluate arbitrary instructions as long as they have /// constant operands at the beginning of the loop. void IndVarSimplify::rewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) { - // Verify the input to the pass in already in LCSSA form. - assert(L->isLCSSAForm(*DT)); + // Check a pre-condition. + assert(L->isRecursivelyLCSSAForm(*DT) && "Indvars did not preserve LCSSA!"); SmallVector ExitBlocks; L->getUniqueExitBlocks(ExitBlocks); @@ -747,14 +759,9 @@ bool IndVarSimplify::canLoopBeDeleted( ++BI; } - for (Loop::block_iterator LI = L->block_begin(), LE = L->block_end(); - LI != LE; ++LI) { - for (BasicBlock::iterator BI = (*LI)->begin(), BE = (*LI)->end(); BI != BE; - ++BI) { - if (BI->mayHaveSideEffects()) - return false; - } - } + for (auto *BB : L->blocks()) + if (any_of(*BB, [](Instruction &I) { return I.mayHaveSideEffects(); })) + return false; return true; } @@ -1167,10 +1174,11 @@ const SCEVAddRecExpr *WidenIV::getWideRecurrence(Instruction *NarrowUse) { /// This IV user cannot be widen. Replace this use of the original narrow IV /// with a truncation of the new wide IV to isolate and eliminate the narrow IV. -static void truncateIVUse(NarrowIVDefUse DU, DominatorTree *DT) { +static void truncateIVUse(NarrowIVDefUse DU, DominatorTree *DT, LoopInfo *LI) { DEBUG(dbgs() << "INDVARS: Truncate IV " << *DU.WideDef << " for user " << *DU.NarrowUse << "\n"); - IRBuilder<> Builder(getInsertPointForUses(DU.NarrowUse, DU.NarrowDef, DT)); + IRBuilder<> Builder( + getInsertPointForUses(DU.NarrowUse, DU.NarrowDef, DT, LI)); Value *Trunc = Builder.CreateTrunc(DU.WideDef, DU.NarrowDef->getType()); DU.NarrowUse->replaceUsesOfWith(DU.NarrowDef, Trunc); } @@ -1207,7 +1215,8 @@ bool WidenIV::widenLoopCompare(NarrowIVDefUse DU) { assert (CastWidth <= IVWidth && "Unexpected width while widening compare."); // Widen the compare instruction. - IRBuilder<> Builder(getInsertPointForUses(DU.NarrowUse, DU.NarrowDef, DT)); + IRBuilder<> Builder( + getInsertPointForUses(DU.NarrowUse, DU.NarrowDef, DT, LI)); DU.NarrowUse->replaceUsesOfWith(DU.NarrowDef, DU.WideDef); // Widen the other operand of the compare, if necessary. @@ -1229,7 +1238,7 @@ Instruction *WidenIV::widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) { // After SimplifyCFG most loop exit targets have a single predecessor. // Otherwise fall back to a truncate within the loop. 
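A minimal standalone sketch of the dominator-tree walk that the getInsertPointForUses change earlier in this file introduces, assuming simplified node and loop-info interfaces (getIDom, getBlock, getLoopFor) rather than LLVM's real ones: starting from the naive insertion block, climb immediate dominators until the block's innermost loop is the loop containing the definition, so the materialized use does not land in a deeper loop than its def.

template <typename NodeT, typename BlockT, typename LoopInfoT>
BlockT *walkUpToDefLoop(NodeT *InsertNode, BlockT *DefBlock, LoopInfoT &LI) {
  auto *DefLoop = LI.getLoopFor(DefBlock);
  for (NodeT *N = InsertNode; N; N = N->getIDom())
    if (LI.getLoopFor(N->getBlock()) == DefLoop)
      return N->getBlock();           // insert at this block's terminator
  return nullptr;                     // unreachable when Def dominates the use
}

In the patch the walk cannot fall off the end because DefI is asserted to dominate InsertPt, which is why the function finishes with llvm_unreachable.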
if (UsePhi->getNumOperands() != 1) - truncateIVUse(DU, DT); + truncateIVUse(DU, DT, LI); else { PHINode *WidePhi = PHINode::Create(DU.WideDef->getType(), 1, UsePhi->getName() + ".wide", @@ -1297,7 +1306,7 @@ Instruction *WidenIV::widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) { // This user does not evaluate to a recurence after widening, so don't // follow it. Instead insert a Trunc to kill off the original use, // eventually isolating the original narrow IV so it can be removed. - truncateIVUse(DU, DT); + truncateIVUse(DU, DT, LI); return nullptr; } // Assume block terminators cannot evaluate to a recurrence. We can't to @@ -1661,10 +1670,10 @@ static bool hasConcreteDefImpl(Value *V, SmallPtrSetImpl &Visited, return false; // Optimistically handle other instructions. - for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI) { - if (!Visited.insert(*OI).second) + for (Value *Op : I->operands()) { + if (!Visited.insert(Op).second) continue; - if (!hasConcreteDefImpl(*OI, Visited, Depth+1)) + if (!hasConcreteDefImpl(Op, Visited, Depth+1)) return false; } return true; @@ -2165,9 +2174,9 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { // Clean up dead instructions. Changed |= DeleteDeadPHIs(L->getHeader(), TLI); + // Check a post-condition. - assert(L->isLCSSAForm(*DT) && - "Indvars did not leave the loop in lcssa form!"); + assert(L->isRecursivelyLCSSAForm(*DT) && "Indvars did not preserve LCSSA!"); // Verify that LFTR, and any other change have not interfered with SCEV's // ability to compute trip count. diff --git a/lib/Transforms/Scalar/LoopDistribute.cpp b/lib/Transforms/Scalar/LoopDistribute.cpp index 67ebd2532b16..fce063ab40a0 100644 --- a/lib/Transforms/Scalar/LoopDistribute.cpp +++ b/lib/Transforms/Scalar/LoopDistribute.cpp @@ -761,7 +761,7 @@ class LoopDistribute : public FunctionPass { } // Don't distribute the loop if we need too many SCEV run-time checks. - const SCEVUnionPredicate &Pred = LAI.Preds; + const SCEVUnionPredicate &Pred = LAI.PSE.getUnionPredicate(); if (Pred.getComplexity() > DistributeSCEVCheckThreshold) { DEBUG(dbgs() << "Too many SCEV run-time checks needed.\n"); return false; @@ -790,7 +790,7 @@ class LoopDistribute : public FunctionPass { DEBUG(LAI.getRuntimePointerChecking()->printChecks(dbgs(), Checks)); LoopVersioning LVer(LAI, L, LI, DT, SE, false); LVer.setAliasChecks(std::move(Checks)); - LVer.setSCEVChecks(LAI.Preds); + LVer.setSCEVChecks(LAI.PSE.getUnionPredicate()); LVer.versionLoop(DefsUsedOutside); } diff --git a/lib/Transforms/Scalar/LoopInterchange.cpp b/lib/Transforms/Scalar/LoopInterchange.cpp index 8233bdcb6804..6026a907ff81 100644 --- a/lib/Transforms/Scalar/LoopInterchange.cpp +++ b/lib/Transforms/Scalar/LoopInterchange.cpp @@ -176,7 +176,7 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, } } - // We don't have a DepMatrix to check legality return false + // We don't have a DepMatrix to check legality return false. if (DepMatrix.size() == 0) return false; return true; @@ -371,7 +371,7 @@ class LoopInterchangeProfitability { LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE) : OuterLoop(Outer), InnerLoop(Inner), SE(SE) {} - /// Check if the loop interchange is profitable + /// Check if the loop interchange is profitable. 
bool isProfitable(unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix); @@ -385,7 +385,7 @@ class LoopInterchangeProfitability { ScalarEvolution *SE; }; -/// LoopInterchangeTransform interchanges the loop +/// LoopInterchangeTransform interchanges the loop. class LoopInterchangeTransform { public: LoopInterchangeTransform(Loop *Outer, Loop *Inner, ScalarEvolution *SE, @@ -424,7 +424,7 @@ class LoopInterchangeTransform { bool InnerLoopHasReduction; }; -// Main LoopInterchange Pass +// Main LoopInterchange Pass. struct LoopInterchange : public FunctionPass { static char ID; ScalarEvolution *SE; diff --git a/lib/Transforms/Scalar/LoopLoadElimination.cpp b/lib/Transforms/Scalar/LoopLoadElimination.cpp index 7c7bf64ba79c..c5bce7ae29da 100644 --- a/lib/Transforms/Scalar/LoopLoadElimination.cpp +++ b/lib/Transforms/Scalar/LoopLoadElimination.cpp @@ -61,7 +61,7 @@ struct StoreToLoadForwardingCandidate { /// \brief Return true if the dependence from the store to the load has a /// distance of one. E.g. A[i+1] = A[i] - bool isDependenceDistanceOfOne(ScalarEvolution *SE) const { + bool isDependenceDistanceOfOne(PredicatedScalarEvolution &PSE) const { Value *LoadPtr = Load->getPointerOperand(); Value *StorePtr = Store->getPointerOperand(); Type *LoadPtrType = LoadPtr->getType(); @@ -75,13 +75,13 @@ struct StoreToLoadForwardingCandidate { auto &DL = Load->getParent()->getModule()->getDataLayout(); unsigned TypeByteSize = DL.getTypeAllocSize(const_cast(LoadType)); - auto *LoadPtrSCEV = cast(SE->getSCEV(LoadPtr)); - auto *StorePtrSCEV = cast(SE->getSCEV(StorePtr)); + auto *LoadPtrSCEV = cast(PSE.getSCEV(LoadPtr)); + auto *StorePtrSCEV = cast(PSE.getSCEV(StorePtr)); // We don't need to check non-wrapping here because forward/backward // dependence wouldn't be valid if these weren't monotonic accesses. - auto *Dist = - cast(SE->getMinusSCEV(StorePtrSCEV, LoadPtrSCEV)); + auto *Dist = cast( + PSE.getSE()->getMinusSCEV(StorePtrSCEV, LoadPtrSCEV)); const APInt &Val = Dist->getValue()->getValue(); return Val.abs() == TypeByteSize; } @@ -114,8 +114,8 @@ bool doesStoreDominatesAllLatches(BasicBlock *StoreBlock, Loop *L, class LoadEliminationForLoop { public: LoadEliminationForLoop(Loop *L, LoopInfo *LI, const LoopAccessInfo &LAI, - DominatorTree *DT, ScalarEvolution *SE) - : L(L), LI(LI), LAI(LAI), DT(DT), SE(SE) {} + DominatorTree *DT) + : L(L), LI(LI), LAI(LAI), DT(DT), PSE(LAI.PSE) {} /// \brief Look through the loop-carried and loop-independent dependences in /// this loop and find store->load dependences. @@ -223,8 +223,8 @@ class LoadEliminationForLoop { // block so deciding which one forwards is easy. The later one forwards // as long as they both have a dependence distance of one to the load. if (Cand.Store->getParent() == OtherCand->Store->getParent() && - Cand.isDependenceDistanceOfOne(SE) && - OtherCand->isDependenceDistanceOfOne(SE)) { + Cand.isDependenceDistanceOfOne(PSE) && + OtherCand->isDependenceDistanceOfOne(PSE)) { // They are in the same block, the later one will forward to the load. 
if (getInstrIndex(OtherCand->Store) < getInstrIndex(Cand.Store)) OtherCand = &Cand; @@ -372,7 +372,7 @@ class LoadEliminationForLoop { // store %y, %gep_i_plus_1 Value *Ptr = Cand.Load->getPointerOperand(); - auto *PtrSCEV = cast(SE->getSCEV(Ptr)); + auto *PtrSCEV = cast(PSE.getSCEV(Ptr)); auto *PH = L->getLoopPreheader(); Value *InitialPtr = SEE.expandCodeFor(PtrSCEV->getStart(), Ptr->getType(), PH->getTerminator()); @@ -436,7 +436,7 @@ class LoadEliminationForLoop { // Check whether the SCEV difference is the same as the induction step, // thus we load the value in the next iteration. - if (!Cand.isDependenceDistanceOfOne(SE)) + if (!Cand.isDependenceDistanceOfOne(PSE)) continue; ++NumForwarding; @@ -459,23 +459,24 @@ class LoadEliminationForLoop { return false; } - if (LAI.Preds.getComplexity() > LoadElimSCEVCheckThreshold) { + if (LAI.PSE.getUnionPredicate().getComplexity() > + LoadElimSCEVCheckThreshold) { DEBUG(dbgs() << "Too many SCEV run-time checks needed.\n"); return false; } // Point of no-return, start the transformation. First, version the loop if // necessary. - if (!Checks.empty() || !LAI.Preds.isAlwaysTrue()) { - LoopVersioning LV(LAI, L, LI, DT, SE, false); + if (!Checks.empty() || !LAI.PSE.getUnionPredicate().isAlwaysTrue()) { + LoopVersioning LV(LAI, L, LI, DT, PSE.getSE(), false); LV.setAliasChecks(std::move(Checks)); - LV.setSCEVChecks(LAI.Preds); + LV.setSCEVChecks(LAI.PSE.getUnionPredicate()); LV.versionLoop(); } // Next, propagate the value stored by the store to the users of the load. // Also for the first iteration, generate the initial value of the load. - SCEVExpander SEE(*SE, L->getHeader()->getModule()->getDataLayout(), + SCEVExpander SEE(*PSE.getSE(), L->getHeader()->getModule()->getDataLayout(), "storeforward"); for (const auto &Cand : Candidates) propagateStoredValueToLoadUsers(Cand, SEE); @@ -495,7 +496,7 @@ class LoadEliminationForLoop { LoopInfo *LI; const LoopAccessInfo &LAI; DominatorTree *DT; - ScalarEvolution *SE; + PredicatedScalarEvolution PSE; }; /// \brief The pass. Most of the work is delegated to the per-loop @@ -510,7 +511,6 @@ class LoopLoadElimination : public FunctionPass { auto *LI = &getAnalysis().getLoopInfo(); auto *LAA = &getAnalysis(); auto *DT = &getAnalysis().getDomTree(); - auto *SE = &getAnalysis().getSE(); // Build up a worklist of inner-loops to vectorize. This is necessary as the // act of distributing a loop creates new loops and can invalidate iterators @@ -528,7 +528,7 @@ class LoopLoadElimination : public FunctionPass { for (Loop *L : Worklist) { const LoopAccessInfo &LAI = LAA->getInfo(L, ValueToValueMap()); // The actual work is performed by LoadEliminationForLoop. - LoadEliminationForLoop LEL(L, LI, LAI, DT, SE); + LoadEliminationForLoop LEL(L, LI, LAI, DT); Changed |= LEL.processLoop(); } diff --git a/lib/Transforms/Utils/CloneFunction.cpp b/lib/Transforms/Utils/CloneFunction.cpp index 465fc34f6f1d..854a3b855f54 100644 --- a/lib/Transforms/Utils/CloneFunction.cpp +++ b/lib/Transforms/Utils/CloneFunction.cpp @@ -142,7 +142,7 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc, if (BB.hasAddressTaken()) { Constant *OldBBAddr = BlockAddress::get(const_cast(OldFunc), const_cast(&BB)); - VMap[OldBBAddr] = BlockAddress::get(NewFunc, CBB); + VMap[OldBBAddr] = BlockAddress::get(NewFunc, CBB); } // Note return instructions for the caller. 
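For context on what isDependenceDistanceOfOne is guarding in the hunks above, this is the shape of loop the store-to-load forwarding targets; the snippet is illustrative C++ and is not taken from the patch or its tests.

// In iteration i the store writes A[i+1]; one iteration later the load reads
// the same element as A[i], so the stored value can be kept in a register
// across the back-edge instead of being reloaded from memory.
void storeToLoadForwarding(int *A, const int *B, int *C, int N) {
  for (int i = 0; i < N; ++i) {
    A[i + 1] = B[i];   // store, one element ahead
    C[i] = A[i] * 2;   // load of the value stored in the previous iteration
  }
}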
diff --git a/lib/Transforms/Utils/CloneModule.cpp b/lib/Transforms/Utils/CloneModule.cpp index acb88c2395fe..ab083353ece6 100644 --- a/lib/Transforms/Utils/CloneModule.cpp +++ b/lib/Transforms/Utils/CloneModule.cpp @@ -20,27 +20,28 @@ #include "llvm-c/Core.h" using namespace llvm; -/// CloneModule - Return an exact copy of the specified module. This is not as -/// easy as it might seem because we have to worry about making copies of global -/// variables and functions, and making their (initializers and references, -/// respectively) refer to the right globals. +/// This is not as easy as it might seem because we have to worry about making +/// copies of global variables and functions, and making their (initializers and +/// references, respectively) refer to the right globals. /// -Module *llvm::CloneModule(const Module *M) { +std::unique_ptr llvm::CloneModule(const Module *M) { // Create the value map that maps things from the old module over to the new // module. ValueToValueMapTy VMap; return CloneModule(M, VMap); } -Module *llvm::CloneModule(const Module *M, ValueToValueMapTy &VMap) { +std::unique_ptr llvm::CloneModule(const Module *M, + ValueToValueMapTy &VMap) { return CloneModule(M, VMap, [](const GlobalValue *GV) { return true; }); } -Module *llvm::CloneModule( +std::unique_ptr llvm::CloneModule( const Module *M, ValueToValueMapTy &VMap, std::function ShouldCloneDefinition) { // First off, we need to create the new module. - Module *New = new Module(M->getModuleIdentifier(), M->getContext()); + std::unique_ptr New = + llvm::make_unique(M->getModuleIdentifier(), M->getContext()); New->setDataLayout(M->getDataLayout()); New->setTargetTriple(M->getTargetTriple()); New->setModuleInlineAsm(M->getModuleInlineAsm()); @@ -65,8 +66,8 @@ Module *llvm::CloneModule( // Loop over the functions in the module, making external functions as before for (Module::const_iterator I = M->begin(), E = M->end(); I != E; ++I) { Function *NF = - Function::Create(cast(I->getType()->getElementType()), - I->getLinkage(), I->getName(), New); + Function::Create(cast(I->getType()->getElementType()), + I->getLinkage(), I->getName(), New.get()); NF->copyAttributesFrom(&*I); VMap[&*I] = NF; } @@ -82,7 +83,8 @@ Module *llvm::CloneModule( GlobalValue *GV; if (I->getValueType()->isFunctionTy()) GV = Function::Create(cast(I->getValueType()), - GlobalValue::ExternalLinkage, I->getName(), New); + GlobalValue::ExternalLinkage, I->getName(), + New.get()); else GV = new GlobalVariable( *New, I->getValueType(), false, GlobalValue::ExternalLinkage, @@ -96,7 +98,7 @@ Module *llvm::CloneModule( } auto *GA = GlobalAlias::create(I->getValueType(), I->getType()->getPointerAddressSpace(), - I->getLinkage(), I->getName(), New); + I->getLinkage(), I->getName(), New.get()); GA->copyAttributesFrom(&*I); VMap[&*I] = GA; } @@ -168,7 +170,7 @@ Module *llvm::CloneModule( extern "C" { LLVMModuleRef LLVMCloneModule(LLVMModuleRef M) { - return wrap(CloneModule(unwrap(M))); + return wrap(CloneModule(unwrap(M)).release()); } } diff --git a/lib/Transforms/Utils/InlineFunction.cpp b/lib/Transforms/Utils/InlineFunction.cpp index aee84c07d593..cafd1818fed6 100644 --- a/lib/Transforms/Utils/InlineFunction.cpp +++ b/lib/Transforms/Utils/InlineFunction.cpp @@ -206,11 +206,10 @@ HandleCallsInBlockInlinedThroughInvoke(BasicBlock *BB, BasicBlock *UnwindEdge) { BB->getInstList().pop_back(); // Create the new invoke instruction. 
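The CloneModule change above switches the C++ entry points to owning return values while keeping the C API's raw-pointer contract. A small, hypothetical illustration of that ownership hand-off (ToyModule and the helper names are invented for this sketch):

#include <memory>

struct ToyModule { int ID = 0; };

std::unique_ptr<ToyModule> cloneToyModule(const ToyModule &M) {
  return std::make_unique<ToyModule>(M);   // ownership moves to the caller
}

extern "C" ToyModule *cloneToyModuleC(const ToyModule *M) {
  // C callers own the raw pointer, mirroring LLVMCloneModule's .release().
  return cloneToyModule(*M).release();
}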
- ImmutableCallSite CS(CI); - SmallVector InvokeArgs(CS.arg_begin(), CS.arg_end()); + SmallVector InvokeArgs(CI->arg_begin(), CI->arg_end()); SmallVector OpBundles; - CS.getOperandBundlesAsDefs(OpBundles); + CI->getOperandBundlesAsDefs(OpBundles); // Note: we're round tripping operand bundles through memory here, and that // can potentially be avoided with a cleverer API design that we do not have @@ -1162,7 +1161,9 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, SmallVector OpDefs; for (auto &VH : InlinedFunctionInfo.OperandBundleCallSites) { - Instruction *I = VH; + if (!VH) continue; // instruction was DCE'd after being cloned + + Instruction *I = cast(VH); OpDefs.clear(); @@ -1191,7 +1192,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, MergedDeoptArgs.insert(MergedDeoptArgs.end(), ChildOB.Inputs.begin(), ChildOB.Inputs.end()); - OpDefs.emplace_back(StringRef("deopt"), std::move(MergedDeoptArgs)); + OpDefs.emplace_back("deopt", std::move(MergedDeoptArgs)); } Instruction *NewI = nullptr; diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp index ba79b32ac3d5..391ed6857668 100644 --- a/lib/Transforms/Utils/Local.cpp +++ b/lib/Transforms/Utils/Local.cpp @@ -1210,8 +1210,11 @@ static void changeToUnreachable(Instruction *I, bool UseLLVMTrap) { /// changeToCall - Convert the specified invoke into a normal call. static void changeToCall(InvokeInst *II) { - SmallVector Args(II->op_begin(), II->op_end() - 3); - CallInst *NewCall = CallInst::Create(II->getCalledValue(), Args, "", II); + SmallVector Args(II->arg_begin(), II->arg_end()); + SmallVector OpBundles; + II->getOperandBundlesAsDefs(OpBundles); + CallInst *NewCall = CallInst::Create(II->getCalledValue(), Args, OpBundles, + "", II); NewCall->takeName(II); NewCall->setCallingConv(II->getCallingConv()); NewCall->setAttributes(II->getAttributes()); diff --git a/lib/Transforms/Utils/LoopUnroll.cpp b/lib/Transforms/Utils/LoopUnroll.cpp index ad4c388e4066..44dde1b51cfc 100644 --- a/lib/Transforms/Utils/LoopUnroll.cpp +++ b/lib/Transforms/Utils/LoopUnroll.cpp @@ -221,6 +221,12 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, // Are we eliminating the loop control altogether? bool CompletelyUnroll = Count == TripCount; + SmallVector ExitBlocks; + L->getExitBlocks(ExitBlocks); + Loop *ParentL = L->getParentLoop(); + bool AllExitsAreInsideParentLoop = !ParentL || + std::all_of(ExitBlocks.begin(), ExitBlocks.end(), + [&](BasicBlock *BB) { return ParentL->contains(BB); }); // We assume a run-time trip count if the compiler cannot // figure out the loop trip count and the unroll-runtime @@ -544,7 +550,7 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, if (!OuterL && !CompletelyUnroll) OuterL = L; if (OuterL) { - simplifyLoop(OuterL, DT, LI, PP, SE, AC); + bool Simplified = simplifyLoop(OuterL, DT, LI, PP, SE, AC); // LCSSA must be performed on the outermost affected loop. 
The unrolled // loop's last loop latch is guaranteed to be in the outermost loop after @@ -554,7 +560,11 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, while (OuterL->getParentLoop() != LatchLoop) OuterL = OuterL->getParentLoop(); - formLCSSARecursively(*OuterL, *DT, LI, SE); + if (CompletelyUnroll && (!AllExitsAreInsideParentLoop || Simplified)) + formLCSSARecursively(*OuterL, *DT, LI, SE); + else + assert(OuterL->isLCSSAForm(*DT) && + "Loops should be in LCSSA form after loop-unroll."); } } diff --git a/lib/Transforms/Utils/LoopVersioning.cpp b/lib/Transforms/Utils/LoopVersioning.cpp index cc3ff5d80d42..9a2a06cf6891 100644 --- a/lib/Transforms/Utils/LoopVersioning.cpp +++ b/lib/Transforms/Utils/LoopVersioning.cpp @@ -32,7 +32,7 @@ LoopVersioning::LoopVersioning(const LoopAccessInfo &LAI, Loop *L, LoopInfo *LI, assert(L->getLoopPreheader() && "No preheader"); if (UseLAIChecks) { setAliasChecks(LAI.getRuntimePointerChecking()->getChecks()); - setSCEVChecks(LAI.Preds); + setSCEVChecks(LAI.PSE.getUnionPredicate()); } } @@ -58,7 +58,7 @@ void LoopVersioning::versionLoop( LAI.addRuntimeChecks(RuntimeCheckBB->getTerminator(), AliasChecks); assert(MemRuntimeCheck && "called even though needsAnyChecking = false"); - const SCEVUnionPredicate &Pred = LAI.Preds; + const SCEVUnionPredicate &Pred = LAI.PSE.getUnionPredicate(); SCEVExpander Exp(*SE, RuntimeCheckBB->getModule()->getDataLayout(), "scev.check"); SCEVRuntimeCheck = diff --git a/lib/Transforms/Utils/ModuleUtils.cpp b/lib/Transforms/Utils/ModuleUtils.cpp index c5abf76d93fa..9ec28a3f3d47 100644 --- a/lib/Transforms/Utils/ModuleUtils.cpp +++ b/lib/Transforms/Utils/ModuleUtils.cpp @@ -43,9 +43,9 @@ static void appendToGlobalArray(const char *Array, } GVCtor->eraseFromParent(); } else { - // Use a simple two-field struct if there isn't one already. + // Use the new three-field struct if there isn't one already. EltTy = StructType::get(IRB.getInt32Ty(), PointerType::getUnqual(FnTy), - nullptr); + IRB.getInt8PtrTy(), nullptr); } // Build a 2 or 3 field global_ctor entry. We don't take a comdat key. diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp index 83afb1a65ac0..df75ed96893d 100644 --- a/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -1058,6 +1058,31 @@ Value *LibCallSimplifier::optimizeCos(CallInst *CI, IRBuilder<> &B) { return Ret; } +static Value *getPow(Value *InnerChain[33], unsigned Exp, IRBuilder<> &B) { + // Multiplications calculated using Addition Chains. + // Refer: http://wwwhomes.uni-bielefeld.de/achim/addition_chain.html + + assert(Exp != 0 && "Incorrect exponent 0 not handled"); + + if (InnerChain[Exp]) + return InnerChain[Exp]; + + static const unsigned AddChain[33][2] = { + {0, 0}, // Unused. + {0, 0}, // Unused (base case = pow1). + {1, 1}, // Unused (pre-computed). 
+ {1, 2}, {2, 2}, {2, 3}, {3, 3}, {2, 5}, {4, 4}, + {1, 8}, {5, 5}, {1, 10}, {6, 6}, {4, 9}, {7, 7}, + {3, 12}, {8, 8}, {8, 9}, {2, 16}, {1, 18}, {10, 10}, + {6, 15}, {11, 11}, {3, 20}, {12, 12}, {8, 17}, {13, 13}, + {3, 24}, {14, 14}, {4, 25}, {15, 15}, {3, 28}, {16, 16}, + }; + + InnerChain[Exp] = B.CreateFMul(getPow(InnerChain, AddChain[Exp][0], B), + getPow(InnerChain, AddChain[Exp][1], B)); + return InnerChain[Exp]; +} + Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); Value *Ret = nullptr; @@ -1156,6 +1181,32 @@ Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) { return B.CreateFMul(Op1, Op1, "pow2"); if (Op2C->isExactlyValue(-1.0)) // pow(x, -1.0) -> 1.0/x return B.CreateFDiv(ConstantFP::get(CI->getType(), 1.0), Op1, "powrecip"); + + // In -ffast-math, generate repeated fmul instead of generating pow(x, n). + if (unsafeFPMath) { + APFloat V = abs(Op2C->getValueAPF()); + // We limit to a max of 7 fmul(s). Thus max exponent is 32. + // This transformation applies to integer exponents only. + if (V.compare(APFloat(V.getSemantics(), 32.0)) == APFloat::cmpGreaterThan || + !V.isInteger()) + return nullptr; + + // We will memoize intermediate products of the Addition Chain. + Value *InnerChain[33] = {nullptr}; + InnerChain[1] = Op1; + InnerChain[2] = B.CreateFMul(Op1, Op1); + + // We cannot readily convert a non-double type (like float) to a double. + // So we first convert V to something which could be converted to double. + bool ignored; + V.convert(APFloat::IEEEdouble, APFloat::rmTowardZero, &ignored); + Value *FMul = getPow(InnerChain, V.convertToDouble(), B); + // For negative exponents simply compute the reciprocal. + if (Op2C->isNegative()) + FMul = B.CreateFDiv(ConstantFP::get(CI->getType(), 1.0), FMul); + return FMul; + } + return nullptr; } diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index c5b8b5b073d6..9adc80c8bd0f 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -310,15 +310,16 @@ static GetElementPtrInst *getGEPInstruction(Value *Ptr) { /// and reduction variables that were found to a given vectorization factor. class InnerLoopVectorizer { public: - InnerLoopVectorizer(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI, - DominatorTree *DT, const TargetLibraryInfo *TLI, + InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, + LoopInfo *LI, DominatorTree *DT, + const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, unsigned VecWidth, - unsigned UnrollFactor, SCEVUnionPredicate &Preds) - : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), TLI(TLI), TTI(TTI), - VF(VecWidth), UF(UnrollFactor), Builder(SE->getContext()), + unsigned UnrollFactor) + : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI), + VF(VecWidth), UF(UnrollFactor), Builder(PSE.getSE()->getContext()), Induction(nullptr), OldInduction(nullptr), WidenMap(UnrollFactor), TripCount(nullptr), VectorTripCount(nullptr), Legal(nullptr), - AddedSafetyChecks(false), Preds(Preds) {} + AddedSafetyChecks(false) {} // Perform the actual loop widening (vectorization). // MinimumBitWidths maps scalar integer values to the smallest bitwidth they @@ -486,8 +487,10 @@ class InnerLoopVectorizer { /// The original loop. Loop *OrigLoop; - /// Scev analysis to use. - ScalarEvolution *SE; + /// A wrapper around ScalarEvolution used to add runtime SCEV checks. 
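To make the addition-chain table above concrete, here is a worked example, illustrative only and written with plain doubles rather than the IR fmuls the simplifier emits: pow(x, 13) follows AddChain[13] = {4, 9}, and the memoized recursion bottoms out at the pre-seeded x and x*x entries, for five multiplies in total, within the budget implied by capping the exponent at 32 (at most seven fmuls).

double pow13(double x) {
  double x2 = x * x;    // InnerChain[2], pre-computed before the recursion
  double x4 = x2 * x2;  // AddChain[4]  = {2, 2}
  double x8 = x4 * x4;  // AddChain[8]  = {4, 4}
  double x9 = x * x8;   // AddChain[9]  = {1, 8}
  return x4 * x9;       // AddChain[13] = {4, 9}
}

A negative exponent reuses the same chain and finishes with a single divide, matching the reciprocal case at the end of the optimizePow hunk.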
Applies + /// dynamic knowledge to simplify SCEV expressions and converts them to a + /// more usable form. + PredicatedScalarEvolution &PSE; /// Loop Info. LoopInfo *LI; /// Dominator Tree. @@ -551,23 +554,15 @@ class InnerLoopVectorizer { // Record whether runtime check is added. bool AddedSafetyChecks; - - /// The SCEV predicate containing all the SCEV-related assumptions. - /// The predicate is used to simplify existing expressions in the - /// context of existing SCEV assumptions. Since legality checking is - /// not done here, we don't need to use this predicate to record - /// further assumptions. - SCEVUnionPredicate &Preds; }; class InnerLoopUnroller : public InnerLoopVectorizer { public: - InnerLoopUnroller(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI, - DominatorTree *DT, const TargetLibraryInfo *TLI, - const TargetTransformInfo *TTI, unsigned UnrollFactor, - SCEVUnionPredicate &Preds) - : InnerLoopVectorizer(OrigLoop, SE, LI, DT, TLI, TTI, 1, UnrollFactor, - Preds) {} + InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE, + LoopInfo *LI, DominatorTree *DT, + const TargetLibraryInfo *TLI, + const TargetTransformInfo *TTI, unsigned UnrollFactor) + : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, 1, UnrollFactor) {} private: void scalarizeInstruction(Instruction *Instr, @@ -789,9 +784,9 @@ class InterleaveGroup { /// between the member and the group in a map. class InterleavedAccessInfo { public: - InterleavedAccessInfo(ScalarEvolution *SE, Loop *L, DominatorTree *DT, - SCEVUnionPredicate &Preds) - : SE(SE), TheLoop(L), DT(DT), Preds(Preds) {} + InterleavedAccessInfo(PredicatedScalarEvolution &PSE, Loop *L, + DominatorTree *DT) + : PSE(PSE), TheLoop(L), DT(DT) {} ~InterleavedAccessInfo() { SmallSet DelSet; @@ -821,17 +816,14 @@ class InterleavedAccessInfo { } private: - ScalarEvolution *SE; + /// A wrapper around ScalarEvolution, used to add runtime SCEV checks. + /// Simplifies SCEV expressions in the context of existing SCEV assumptions. + /// The interleaved access analysis can also add new predicates (for example + /// by versioning strides of pointers). + PredicatedScalarEvolution &PSE; Loop *TheLoop; DominatorTree *DT; - /// The SCEV predicate containing all the SCEV-related assumptions. - /// The predicate is used to simplify SCEV expressions in the - /// context of existing SCEV assumptions. The interleaved access - /// analysis can also add new predicates (for example by versioning - /// strides of pointers). - SCEVUnionPredicate &Preds; - /// Holds the relationships between the members and the interleave group. DenseMap InterleaveGroupMap; @@ -1189,18 +1181,17 @@ static void emitMissedWarning(Function *F, Loop *L, /// induction variable and the different reduction variables. 
class LoopVectorizationLegality { public: - LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, DominatorTree *DT, - TargetLibraryInfo *TLI, AliasAnalysis *AA, - Function *F, const TargetTransformInfo *TTI, + LoopVectorizationLegality(Loop *L, PredicatedScalarEvolution &PSE, + DominatorTree *DT, TargetLibraryInfo *TLI, + AliasAnalysis *AA, Function *F, + const TargetTransformInfo *TTI, LoopAccessAnalysis *LAA, LoopVectorizationRequirements *R, - const LoopVectorizeHints *H, - SCEVUnionPredicate &Preds) - : NumPredStores(0), TheLoop(L), SE(SE), TLI(TLI), TheFunction(F), - TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr), - InterleaveInfo(SE, L, DT, Preds), Induction(nullptr), - WidestIndTy(nullptr), HasFunNoNaNAttr(false), Requirements(R), Hints(H), - Preds(Preds) {} + const LoopVectorizeHints *H) + : NumPredStores(0), TheLoop(L), PSE(PSE), TLI(TLI), TheFunction(F), + TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr), InterleaveInfo(PSE, L, DT), + Induction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false), + Requirements(R), Hints(H) {} /// ReductionList contains the reduction descriptors for all /// of the reductions that were found in the loop. @@ -1347,8 +1338,12 @@ class LoopVectorizationLegality { /// The loop that we evaluate. Loop *TheLoop; - /// Scev analysis. - ScalarEvolution *SE; + /// A wrapper around ScalarEvolution used to add runtime SCEV checks. + /// Applies dynamic knowledge to simplify SCEV expressions in the context + /// of existing SCEV assumptions. The analysis will also add a minimal set + /// of new predicates if this is required to enable vectorization and + /// unrolling. + PredicatedScalarEvolution &PSE; /// Target Library Info. TargetLibraryInfo *TLI; /// Parent function @@ -1403,13 +1398,6 @@ class LoopVectorizationLegality { /// While vectorizing these instructions we have to generate a /// call to the appropriate masked intrinsic SmallPtrSet MaskedOp; - - /// The SCEV predicate containing all the SCEV-related assumptions. - /// The predicate is used to simplify SCEV expressions in the - /// context of existing SCEV assumptions. The analysis will also - /// add a minimal set of new predicates if this is required to - /// enable vectorization/unrolling. - SCEVUnionPredicate &Preds; }; /// LoopVectorizationCostModel - estimates the expected speedups due to @@ -1427,8 +1415,7 @@ class LoopVectorizationCostModel { const TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, const Function *F, const LoopVectorizeHints *Hints, - SmallPtrSetImpl &ValuesToIgnore, - SCEVUnionPredicate &Preds) + SmallPtrSetImpl &ValuesToIgnore) : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB), TheFunction(F), Hints(Hints), ValuesToIgnore(ValuesToIgnore) {} @@ -1758,12 +1745,12 @@ struct LoopVectorize : public FunctionPass { } } - SCEVUnionPredicate Preds; + PredicatedScalarEvolution PSE(*SE); // Check if it is legal to vectorize the loop. LoopVectorizationRequirements Requirements; - LoopVectorizationLegality LVL(L, SE, DT, TLI, AA, F, TTI, LAA, - &Requirements, &Hints, Preds); + LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, TTI, LAA, + &Requirements, &Hints); if (!LVL.canVectorize()) { DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); emitMissedWarning(F, L, Hints); @@ -1781,8 +1768,8 @@ struct LoopVectorize : public FunctionPass { } // Use the cost model. 
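The LoopVectorize changes in this hunk replace the separately threaded SCEVUnionPredicate with a single PredicatedScalarEvolution wrapper that owns both the analysis handle and the assumptions its queries accumulate. A toy, self-contained analogue of that design follows; it is not the real PSE API, and the LLVM calls mentioned in the comments are only those visible in this patch.

#include <string>
#include <vector>

struct ToyPredicateSet {
  std::vector<std::string> Preds;
  bool isAlwaysTrue() const { return Preds.empty(); }
  std::size_t getComplexity() const { return Preds.size(); }
};

class ToyPredicatedAnalysis {
  ToyPredicateSet Union;

public:
  // A query may record the assumption that makes its answer valid, the way
  // PSE.getSCEV() can add stride or no-wrap predicates behind the scenes.
  long getValue(long V) {
    if (V < 0)
      Union.Preds.push_back("value is non-negative at run time");
    return V < 0 ? -V : V;
  }
  const ToyPredicateSet &getUnionPredicate() const { return Union; }
  void addPredicate(const ToyPredicateSet &Other) {
    Union.Preds.insert(Union.Preds.end(), Other.Preds.begin(),
                       Other.Preds.end());
  }
};

Consumers then bound the run-time cost with a single complexity check, which is what the later comparison of PSE.getUnionPredicate().getComplexity() against SCEVThreshold does.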
- LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, TLI, DB, AC, F, &Hints, - ValuesToIgnore, Preds); + LoopVectorizationCostModel CM(L, PSE.getSE(), LI, &LVL, *TTI, TLI, DB, AC, + F, &Hints, ValuesToIgnore); // Check the function attributes to find out if this function should be // optimized for size. @@ -1893,7 +1880,7 @@ struct LoopVectorize : public FunctionPass { assert(IC > 1 && "interleave count should not be 1 or 0"); // If we decided that it is not legal to vectorize the loop then // interleave it. - InnerLoopUnroller Unroller(L, SE, LI, DT, TLI, TTI, IC, Preds); + InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, IC); Unroller.vectorize(&LVL, CM.MinBWs); emitOptimizationRemark(F->getContext(), LV_NAME, *F, L->getStartLoc(), @@ -1901,7 +1888,7 @@ struct LoopVectorize : public FunctionPass { Twine(IC) + ")"); } else { // If we decided that it is *legal* to vectorize the loop then do it. - InnerLoopVectorizer LB(L, SE, LI, DT, TLI, TTI, VF.Width, IC, Preds); + InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, VF.Width, IC); LB.vectorize(&LVL, CM.MinBWs); ++LoopsVectorized; @@ -2002,6 +1989,7 @@ Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { assert(Ptr->getType()->isPointerTy() && "Unexpected non-ptr"); + auto *SE = PSE.getSE(); // Make sure that the pointer does not point to structs. if (Ptr->getType()->getPointerElementType()->isAggregateType()) return 0; @@ -2031,7 +2019,7 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { // Make sure that all of the index operands are loop invariant. for (unsigned i = 1; i < NumOperands; ++i) - if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop)) + if (!SE->isLoopInvariant(PSE.getSCEV(Gep->getOperand(i)), TheLoop)) return 0; InductionDescriptor II = Inductions[Phi]; @@ -2044,14 +2032,14 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { // operand. for (unsigned i = 0; i != NumOperands; ++i) if (i != InductionOperand && - !SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop)) + !SE->isLoopInvariant(PSE.getSCEV(Gep->getOperand(i)), TheLoop)) return 0; // We can emit wide load/stores only if the last non-zero index is the // induction variable. const SCEV *Last = nullptr; if (!Strides.count(Gep)) - Last = SE->getSCEV(Gep->getOperand(InductionOperand)); + Last = PSE.getSCEV(Gep->getOperand(InductionOperand)); else { // Because of the multiplication by a stride we can have a s/zext cast. // We are going to replace this stride by 1 so the cast is safe to ignore. @@ -2062,7 +2050,7 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { // %idxprom = zext i32 %mul to i64 << Safe cast. // %arrayidx = getelementptr inbounds i32* %B, i64 %idxprom // - Last = replaceSymbolicStrideSCEV(SE, Strides, Preds, + Last = replaceSymbolicStrideSCEV(PSE, Strides, Gep->getOperand(InductionOperand), Gep); if (const SCEVCastExpr *C = dyn_cast(Last)) Last = @@ -2420,8 +2408,9 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { Ptr = Builder.Insert(Gep2); } else if (Gep) { setDebugLocFromInst(Builder, Gep); - assert(SE->isLoopInvariant(SE->getSCEV(Gep->getPointerOperand()), - OrigLoop) && "Base ptr must be invariant"); + assert(PSE.getSE()->isLoopInvariant(PSE.getSCEV(Gep->getPointerOperand()), + OrigLoop) && + "Base ptr must be invariant"); // The last index does not have to be the induction. It can be // consecutive and be a function of the index. 
For example A[I+1]; @@ -2438,7 +2427,8 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { if (i == InductionOperand || (GepOperandInst && OrigLoop->contains(GepOperandInst))) { assert((i == InductionOperand || - SE->isLoopInvariant(SE->getSCEV(GepOperandInst), OrigLoop)) && + PSE.getSE()->isLoopInvariant(PSE.getSCEV(GepOperandInst), + OrigLoop)) && "Must be last index or loop invariant"); VectorParts &GEPParts = getVectorValue(GepOperand); @@ -2658,6 +2648,7 @@ Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); // Find the loop boundaries. + ScalarEvolution *SE = PSE.getSE(); const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(OrigLoop); assert(BackedgeTakenCount != SE->getCouldNotCompute() && "Invalid loop count"); @@ -2765,8 +2756,10 @@ void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { // Generate the code to check that the SCEV assumptions that we made. // We want the new basic block to start at the first instruction in a // sequence of instructions that form a check. - SCEVExpander Exp(*SE, Bypass->getModule()->getDataLayout(), "scev.check"); - Value *SCEVCheck = Exp.expandCodeForPredicate(&Preds, BB->getTerminator()); + SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(), + "scev.check"); + Value *SCEVCheck = + Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator()); if (auto *C = dyn_cast(SCEVCheck)) if (C->isZero()) @@ -3785,8 +3778,9 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { // Widen selects. // If the selector is loop invariant we can create a select // instruction with a scalar condition. Otherwise, use vector-select. - bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(it->getOperand(0)), - OrigLoop); + auto *SE = PSE.getSE(); + bool InvariantCond = + SE->isLoopInvariant(PSE.getSCEV(it->getOperand(0)), OrigLoop); setDebugLocFromInst(Builder, &*it); // The condition can be loop invariant but still defined inside the @@ -3967,7 +3961,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { void InnerLoopVectorizer::updateAnalysis() { // Forget the original basic block. - SE->forgetLoop(OrigLoop); + PSE.getSE()->forgetLoop(OrigLoop); // Update the dominator tree information. assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) && @@ -4119,10 +4113,10 @@ bool LoopVectorizationLegality::canVectorize() { } // ScalarEvolution needs to be able to find the exit count. 
- const SCEV *ExitCount = SE->getBackedgeTakenCount(TheLoop); - if (ExitCount == SE->getCouldNotCompute()) { - emitAnalysis(VectorizationReport() << - "could not determine number of loop iterations"); + const SCEV *ExitCount = PSE.getSE()->getBackedgeTakenCount(TheLoop); + if (ExitCount == PSE.getSE()->getCouldNotCompute()) { + emitAnalysis(VectorizationReport() + << "could not determine number of loop iterations"); DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n"); return false; } @@ -4162,7 +4156,7 @@ bool LoopVectorizationLegality::canVectorize() { if (Hints->getForce() == LoopVectorizeHints::FK_Enabled) SCEVThreshold = PragmaVectorizeSCEVCheckThreshold; - if (Preds.getComplexity() > SCEVThreshold) { + if (PSE.getUnionPredicate().getComplexity() > SCEVThreshold) { emitAnalysis(VectorizationReport() << "Too many SCEV assumptions need to be made and checked " << "at runtime"); @@ -4268,7 +4262,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { } InductionDescriptor ID; - if (InductionDescriptor::isInductionPHI(Phi, SE, ID)) { + if (InductionDescriptor::isInductionPHI(Phi, PSE.getSE(), ID)) { Inductions[Phi] = ID; // Get the widest type. if (!WidestIndTy) @@ -4337,7 +4331,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // second argument is the same (i.e. loop invariant) if (CI && hasVectorInstrinsicScalarOpd(getIntrinsicIDForCall(CI, TLI), 1)) { - if (!SE->isLoopInvariant(SE->getSCEV(CI->getOperand(1)), TheLoop)) { + auto *SE = PSE.getSE(); + if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(1)), TheLoop)) { emitAnalysis(VectorizationReport(&*it) << "intrinsic instruction cannot be vectorized"); DEBUG(dbgs() << "LV: Found unvectorizable intrinsic " << *CI << "\n"); @@ -4410,7 +4405,7 @@ void LoopVectorizationLegality::collectStridedAccess(Value *MemAccess) { else return; - Value *Stride = getStrideFromPointer(Ptr, SE, TheLoop); + Value *Stride = getStrideFromPointer(Ptr, PSE.getSE(), TheLoop); if (!Stride) return; @@ -4474,7 +4469,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() { } Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks()); - Preds.add(&LAI->Preds); + PSE.addPredicate(LAI->PSE.getUnionPredicate()); return true; } @@ -4589,7 +4584,7 @@ void InterleavedAccessInfo::collectConstStridedAccesses( StoreInst *SI = dyn_cast(I); Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand(); - int Stride = isStridedPtr(SE, Ptr, TheLoop, Strides, Preds); + int Stride = isStridedPtr(PSE, Ptr, TheLoop, Strides); // The factor of the corresponding interleave group. unsigned Factor = std::abs(Stride); @@ -4598,7 +4593,7 @@ void InterleavedAccessInfo::collectConstStridedAccesses( if (Factor < 2 || Factor > MaxInterleaveGroupFactor) continue; - const SCEV *Scev = replaceSymbolicStrideSCEV(SE, Strides, Preds, Ptr); + const SCEV *Scev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr); PointerType *PtrTy = dyn_cast(Ptr->getType()); unsigned Size = DL.getTypeAllocSize(PtrTy->getElementType()); @@ -4685,8 +4680,8 @@ void InterleavedAccessInfo::analyzeInterleaving( continue; // Calculate the distance and prepare for the rule 3. - const SCEVConstant *DistToA = - dyn_cast(SE->getMinusSCEV(DesB.Scev, DesA.Scev)); + const SCEVConstant *DistToA = dyn_cast( + PSE.getSE()->getMinusSCEV(DesB.Scev, DesA.Scev)); if (!DistToA) continue; @@ -5188,7 +5183,7 @@ LoopVectorizationCostModel::calculateRegisterUsage( continue; } - // Count the number of live intevals. + // Count the number of live intervals. 
unsigned RegUsage = 0; for (auto Inst : OpenIntervals) RegUsage += GetRegUsage(Inst->getType(), VFs[j]); diff --git a/test/Analysis/CostModel/X86/reduction.ll b/test/Analysis/CostModel/X86/reduction.ll index 78e65aee1460..aaafe07c1eb8 100644 --- a/test/Analysis/CostModel/X86/reduction.ll +++ b/test/Analysis/CostModel/X86/reduction.ll @@ -33,7 +33,7 @@ define fastcc i32 @reduction_cost_int(<8 x i32> %rdx) { %bin.rdx.3 = add <8 x i32> %bin.rdx.2, %rdx.shuf.3 ; CHECK-LABEL: reduction_cost_int -; CHECK: cost of 23 {{.*}} extractelement +; CHECK: cost of 17 {{.*}} extractelement %r = extractelement <8 x i32> %bin.rdx.3, i32 0 ret i32 %r diff --git a/test/Analysis/CostModel/X86/sitofp.ll b/test/Analysis/CostModel/X86/sitofp.ll index 9913a4896912..9f0c4065c178 100644 --- a/test/Analysis/CostModel/X86/sitofp.ll +++ b/test/Analysis/CostModel/X86/sitofp.ll @@ -248,13 +248,13 @@ define <2 x double> @sitofpv2i64v2double(<2 x i64> %a) { ; SSE2: cost of 20 {{.*}} sitofp ; ; AVX1-LABEL: sitofpv2i64v2double - ; AVX1: cost of 4 {{.*}} sitofp + ; AVX1: cost of 20 {{.*}} sitofp ; ; AVX2-LABEL: sitofpv2i64v2double - ; AVX2: cost of 4 {{.*}} sitofp + ; AVX2: cost of 20 {{.*}} sitofp ; ; AVX512F-LABEL: sitofpv2i64v2double - ; AVX512F: cost of 4 {{.*}} sitofp + ; AVX512F: cost of 20 {{.*}} sitofp %1 = sitofp <2 x i64> %a to <2 x double> ret <2 x double> %1 } diff --git a/test/Analysis/CostModel/X86/sse-itoi.ll b/test/Analysis/CostModel/X86/sse-itoi.ll new file mode 100644 index 000000000000..6429e4fae097 --- /dev/null +++ b/test/Analysis/CostModel/X86/sse-itoi.ll @@ -0,0 +1,353 @@ +; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+sse2 -cost-model -analyze < %s | FileCheck --check-prefix=SSE2 %s +; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+sse4.1 -cost-model -analyze < %s | FileCheck --check-prefix=SSE41 %s + +define void @zext_v16i16_to_v16i32(<16 x i16>* %a) { +; SSE2: zext_v16i16_to_v16i32 +; SSE2: cost of 6 {{.*}} zext +; +; SSE41: zext_v16i16_to_v16i32 +; SSE41: cost of 4 {{.*}} zext +; + %1 = load <16 x i16>, <16 x i16>* %a + %2 = zext <16 x i16> %1 to <16 x i32> + store <16 x i32> %2, <16 x i32>* undef, align 4 + ret void +} + +define void @sext_v16i16_to_v16i32(<16 x i16>* %a) { +; SSE2: sext_v16i16_to_v16i32 +; SSE2: cost of 8 {{.*}} sext +; +; SSE41: sext_v16i16_to_v16i32 +; SSE41: cost of 4 {{.*}} sext +; + %1 = load <16 x i16>, <16 x i16>* %a + %2 = sext <16 x i16> %1 to <16 x i32> + store <16 x i32> %2, <16 x i32>* undef, align 4 + ret void +} + +define void @zext_v8i16_to_v8i32(<8 x i16>* %a) { +; SSE2: zext_v8i16_to_v8i32 +; SSE2: cost of 3 {{.*}} zext +; +; SSE41: zext_v8i16_to_v8i32 +; SSE41: cost of 2 {{.*}} zext +; + %1 = load <8 x i16>, <8 x i16>* %a + %2 = zext <8 x i16> %1 to <8 x i32> + store <8 x i32> %2, <8 x i32>* undef, align 4 + ret void +} + +define void @sext_v8i16_to_v8i32(<8 x i16>* %a) { +; SSE2: sext_v8i16_to_v8i32 +; SSE2: cost of 4 {{.*}} sext +; +; SSE41: sext_v8i16_to_v8i32 +; SSE41: cost of 2 {{.*}} sext +; + %1 = load <8 x i16>, <8 x i16>* %a + %2 = sext <8 x i16> %1 to <8 x i32> + store <8 x i32> %2, <8 x i32>* undef, align 4 + ret void +} + +define void @zext_v4i16_to_v4i32(<4 x i16>* %a) { +; SSE2: zext_v4i16_to_v4i32 +; SSE2: cost of 1 {{.*}} zext +; +; SSE41: zext_v4i16_to_v4i32 +; SSE41: cost of 1 {{.*}} zext +; + %1 = load <4 x i16>, <4 x i16>* %a + %2 = zext <4 x i16> %1 to <4 x i32> + store <4 x i32> %2, <4 x i32>* undef, align 4 + ret void +} + +define void @sext_v4i16_to_v4i32(<4 x i16>* %a) { +; SSE2: sext_v4i16_to_v4i32 +; SSE2: cost of 2 {{.*}} sext +; +; 
SSE41: sext_v4i16_to_v4i32 +; SSE41: cost of 1 {{.*}} sext +; + %1 = load <4 x i16>, <4 x i16>* %a + %2 = sext <4 x i16> %1 to <4 x i32> + store <4 x i32> %2, <4 x i32>* undef, align 4 + ret void +} + +define void @zext_v16i8_to_v16i32(<16 x i8>* %a) { +; SSE2: zext_v16i8_to_v16i32 +; SSE2: cost of 9 {{.*}} zext +; +; SSE41: zext_v16i8_to_v16i32 +; SSE41: cost of 4 {{.*}} zext +; + %1 = load <16 x i8>, <16 x i8>* %a + %2 = zext <16 x i8> %1 to <16 x i32> + store <16 x i32> %2, <16 x i32>* undef, align 4 + ret void +} + +define void @sext_v16i8_to_v16i32(<16 x i8>* %a) { +; SSE2: sext_v16i8_to_v16i32 +; SSE2: cost of 12 {{.*}} sext +; +; SSE41: sext_v16i8_to_v16i32 +; SSE41: cost of 4 {{.*}} sext +; + %1 = load <16 x i8>, <16 x i8>* %a + %2 = sext <16 x i8> %1 to <16 x i32> + store <16 x i32> %2, <16 x i32>* undef, align 4 + ret void +} + +define void @zext_v8i8_to_v8i32(<8 x i8>* %a) { +; SSE2: zext_v8i8_to_v8i32 +; SSE2: cost of 6 {{.*}} zext +; +; SSE41: zext_v8i8_to_v8i32 +; SSE41: cost of 2 {{.*}} zext +; + %1 = load <8 x i8>, <8 x i8>* %a + %2 = zext <8 x i8> %1 to <8 x i32> + store <8 x i32> %2, <8 x i32>* undef, align 4 + ret void +} + +define void @sext_v8i8_to_v8i32(<8 x i8>* %a) { +; SSE2: sext_v8i8_to_v8i32 +; SSE2: cost of 6 {{.*}} sext +; +; SSE41: sext_v8i8_to_v8i32 +; SSE41: cost of 2 {{.*}} sext +; + %1 = load <8 x i8>, <8 x i8>* %a + %2 = sext <8 x i8> %1 to <8 x i32> + store <8 x i32> %2, <8 x i32>* undef, align 4 + ret void +} + +define void @zext_v4i8_to_v4i32(<4 x i8>* %a) { +; SSE2: zext_v4i8_to_v4i32 +; SSE2: cost of 2 {{.*}} zext +; +; SSE41: zext_v4i8_to_v4i32 +; SSE41: cost of 1 {{.*}} zext +; + %1 = load <4 x i8>, <4 x i8>* %a + %2 = zext <4 x i8> %1 to <4 x i32> + store <4 x i32> %2, <4 x i32>* undef, align 4 + ret void +} + +define void @sext_v4i8_to_v4i32(<4 x i8>* %a) { +; SSE2: sext_v4i8_to_v4i32 +; SSE2: cost of 3 {{.*}} sext +; +; SSE41: sext_v4i8_to_v4i32 +; SSE41: cost of 1 {{.*}} sext +; + %1 = load <4 x i8>, <4 x i8>* %a + %2 = sext <4 x i8> %1 to <4 x i32> + store <4 x i32> %2, <4 x i32>* undef, align 4 + ret void +} + +define void @zext_v16i8_to_v16i16(<16 x i8>* %a) { +; SSE2: zext_v16i8_to_v16i16 +; SSE2: cost of 3 {{.*}} zext +; +; SSE41: zext_v16i8_to_v16i16 +; SSE41: cost of 2 {{.*}} zext +; + %1 = load <16 x i8>, <16 x i8>* %a + %2 = zext <16 x i8> %1 to <16 x i16> + store <16 x i16> %2, <16 x i16>* undef, align 4 + ret void +} + +define void @sext_v16i8_to_v16i16(<16 x i8>* %a) { +; SSE2: sext_v16i8_to_v16i16 +; SSE2: cost of 4 {{.*}} sext +; +; SSE41: sext_v16i8_to_v16i16 +; SSE41: cost of 2 {{.*}} sext +; + %1 = load <16 x i8>, <16 x i8>* %a + %2 = sext <16 x i8> %1 to <16 x i16> + store <16 x i16> %2, <16 x i16>* undef, align 4 + ret void +} + +define void @zext_v8i8_to_v8i16(<8 x i8>* %a) { +; SSE2: zext_v8i8_to_v8i16 +; SSE2: cost of 1 {{.*}} zext +; +; SSE41: zext_v8i8_to_v8i16 +; SSE41: cost of 1 {{.*}} zext +; + %1 = load <8 x i8>, <8 x i8>* %a + %2 = zext <8 x i8> %1 to <8 x i16> + store <8 x i16> %2, <8 x i16>* undef, align 4 + ret void +} + +define void @sext_v8i8_to_v8i16(<8 x i8>* %a) { +; SSE2: sext_v8i8_to_v8i16 +; SSE2: cost of 2 {{.*}} sext +; +; SSE41: sext_v8i8_to_v8i16 +; SSE41: cost of 1 {{.*}} sext +; + %1 = load <8 x i8>, <8 x i8>* %a + %2 = sext <8 x i8> %1 to <8 x i16> + store <8 x i16> %2, <8 x i16>* undef, align 4 + ret void +} + +define void @zext_v4i8_to_v4i16(<4 x i8>* %a) { +; SSE2: zext_v4i8_to_v4i16 +; SSE2: cost of 1 {{.*}} zext +; +; SSE41: zext_v4i8_to_v4i16 +; SSE41: cost of 1 {{.*}} zext +; + %1 = load <4 
x i8>, <4 x i8>* %a + %2 = zext <4 x i8> %1 to <4 x i16> + store <4 x i16> %2, <4 x i16>* undef, align 4 + ret void +} + +define void @sext_v4i8_to_v4i16(<4 x i8>* %a) { +; SSE2: sext_v4i8_to_v4i16 +; SSE2: cost of 6 {{.*}} sext +; +; SSE41: sext_v4i8_to_v4i16 +; SSE41: cost of 2 {{.*}} sext +; + %1 = load <4 x i8>, <4 x i8>* %a + %2 = sext <4 x i8> %1 to <4 x i16> + store <4 x i16> %2, <4 x i16>* undef, align 4 + ret void +} + +define void @truncate_v16i32_to_v16i16(<16 x i32>* %a) { +; SSE2: truncate_v16i32_to_v16i16 +; SSE2: cost of 14 {{.*}} trunc +; +; SSE41: truncate_v16i32_to_v16i16 +; SSE41: cost of 6 {{.*}} trunc +; + %1 = load <16 x i32>, <16 x i32>* %a + %2 = trunc <16 x i32> %1 to <16 x i16> + store <16 x i16> %2, <16 x i16>* undef, align 4 + ret void +} + +define void @truncate_v8i32_to_v8i16(<8 x i32>* %a) { +; SSE2: truncate_v8i32_to_v8i16 +; SSE2: cost of 7 {{.*}} trunc +; +; SSE41: truncate_v8i32_to_v8i16 +; SSE41: cost of 3 {{.*}} trunc +; + %1 = load <8 x i32>, <8 x i32>* %a + %2 = trunc <8 x i32> %1 to <8 x i16> + store <8 x i16> %2, <8 x i16>* undef, align 4 + ret void +} + +define void @truncate_v4i32_to_v4i16(<4 x i32>* %a) { +; SSE2: truncate_v4i32_to_v4i16 +; SSE2: cost of 3 {{.*}} trunc +; +; SSE41: truncate_v4i32_to_v4i16 +; SSE41: cost of 1 {{.*}} trunc +; + %1 = load <4 x i32>, <4 x i32>* %a + %2 = trunc <4 x i32> %1 to <4 x i16> + store <4 x i16> %2, <4 x i16>* undef, align 4 + ret void +} + +define void @truncate_v16i32_to_v16i8(<16 x i32>* %a) { +; SSE2: truncate_v16i32_to_v16i8 +; SSE2: cost of 31 {{.*}} trunc +; +; SSE41: truncate_v16i32_to_v16i8 +; SSE41: cost of 30 {{.*}} trunc +; + %1 = load <16 x i32>, <16 x i32>* %a + %2 = trunc <16 x i32> %1 to <16 x i8> + store <16 x i8> %2, <16 x i8>* undef, align 4 + ret void +} + +define void @truncate_v8i32_to_v8i8(<8 x i32>* %a) { +; SSE2: truncate_v8i32_to_v8i8 +; SSE2: cost of 4 {{.*}} trunc +; +; SSE41: truncate_v8i32_to_v8i8 +; SSE41: cost of 3 {{.*}} trunc +; + %1 = load <8 x i32>, <8 x i32>* %a + %2 = trunc <8 x i32> %1 to <8 x i8> + store <8 x i8> %2, <8 x i8>* undef, align 4 + ret void +} + +define void @truncate_v4i32_to_v4i8(<4 x i32>* %a) { +; SSE2: truncate_v4i32_to_v4i8 +; SSE2: cost of 3 {{.*}} trunc +; +; SSE41: truncate_v4i32_to_v4i8 +; SSE41: cost of 1 {{.*}} trunc +; + %1 = load <4 x i32>, <4 x i32>* %a + %2 = trunc <4 x i32> %1 to <4 x i8> + store <4 x i8> %2, <4 x i8>* undef, align 4 + ret void +} + +define void @truncate_v16i16_to_v16i8(<16 x i16>* %a) { +; SSE2: truncate_v16i16_to_v16i8 +; SSE2: cost of 3 {{.*}} trunc +; +; SSE41: truncate_v16i16_to_v16i8 +; SSE41: cost of 3 {{.*}} trunc +; + %1 = load <16 x i16>, <16 x i16>* %a + %2 = trunc <16 x i16> %1 to <16 x i8> + store <16 x i8> %2, <16 x i8>* undef, align 4 + ret void +} + +define void @truncate_v8i16_to_v8i8(<8 x i16>* %a) { +; SSE2: truncate_v8i16_to_v8i8 +; SSE2: cost of 2 {{.*}} trunc +; +; SSE41: truncate_v8i16_to_v8i8 +; SSE41: cost of 1 {{.*}} trunc +; + %1 = load <8 x i16>, <8 x i16>* %a + %2 = trunc <8 x i16> %1 to <8 x i8> + store <8 x i8> %2, <8 x i8>* undef, align 4 + ret void +} + +define void @truncate_v4i16_to_v4i8(<4 x i16>* %a) { +; SSE2: truncate_v4i16_to_v4i8 +; SSE2: cost of 4 {{.*}} trunc +; +; SSE41: truncate_v4i16_to_v4i8 +; SSE41: cost of 2 {{.*}} trunc +; + %1 = load <4 x i16>, <4 x i16>* %a + %2 = trunc <4 x i16> %1 to <4 x i8> + store <4 x i8> %2, <4 x i8>* undef, align 4 + ret void +} diff --git a/test/Assembler/debug-info.ll b/test/Assembler/debug-info.ll index 91dfe561a2fe..86630840dc20 100644 --- 
a/test/Assembler/debug-info.ll +++ b/test/Assembler/debug-info.ll @@ -1,8 +1,8 @@ ; RUN: llvm-as < %s | llvm-dis | llvm-as | llvm-dis | FileCheck %s ; RUN: verify-uselistorder %s -; CHECK: !named = !{!0, !0, !1, !2, !3, !4, !5, !6, !7, !8, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !27} -!named = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30} +; CHECK: !named = !{!0, !0, !1, !2, !3, !4, !5, !6, !7, !8, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !27, !28, !29, !30, !31} +!named = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34} ; CHECK: !0 = !DISubrange(count: 3) ; CHECK-NEXT: !1 = !DISubrange(count: 3, lowerBound: 4) @@ -63,10 +63,19 @@ !25 = !DICompositeType(tag: DW_TAG_structure_type) !26 = !DICompositeType(tag: DW_TAG_structure_type, runtimeLang: 6) -; !25 = !{!7, !7} -; !26 = !DISubroutineType(flags: DIFlagPublic | DIFlagStaticMember, types: !25) -; !27 = !DISubroutineType(types: !25) +; CHECK-NEXT: !25 = !{!6, !6} +; CHECK-NEXT: !26 = !DISubroutineType(flags: DIFlagPublic | DIFlagStaticMember, types: !25) +; CHECK-NEXT: !27 = !DISubroutineType(types: !25) !27 = !{!7, !7} !28 = !DISubroutineType(flags: DIFlagPublic | DIFlagStaticMember, types: !27) !29 = !DISubroutineType(flags: 0, types: !27) !30 = !DISubroutineType(types: !27) + +; CHECK-NEXT: !28 = !DIMacro(type: DW_MACINFO_define, line: 9, name: "Name", value: "Value") +; CHECK-NEXT: !29 = distinct !{!28} +; CHECK-NEXT: !30 = !DIMacroFile(line: 9, file: !12, nodes: !29) +; CHECK-NEXT: !31 = !DIMacroFile(line: 11, file: !12) +!31 = !DIMacro(type: DW_MACINFO_define, line: 9, name: "Name", value: "Value") +!32 = distinct !{!31} +!33 = !DIMacroFile(line: 9, file: !14, nodes: !32) +!34 = !DIMacroFile(type: DW_MACINFO_start_file, line: 11, file: !14) diff --git a/test/Assembler/dicompileunit.ll b/test/Assembler/dicompileunit.ll index ba6731827cdb..92fa61fe6b90 100644 --- a/test/Assembler/dicompileunit.ll +++ b/test/Assembler/dicompileunit.ll @@ -1,8 +1,8 @@ ; RUN: llvm-as < %s | llvm-dis | llvm-as | llvm-dis | FileCheck %s ; RUN: verify-uselistorder %s -; CHECK: !named = !{!0, !1, !2, !3, !4, !5, !6, !7, !8} -!named = !{!0, !1, !2, !3, !4, !5, !6, !7, !8} +; CHECK: !named = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9} +!named = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9} !0 = distinct !{} !1 = !DIFile(filename: "path/to/file", directory: "/path/to/dir") @@ -11,15 +11,16 @@ !4 = distinct !{} !5 = distinct !{} !6 = distinct !{} +!7 = distinct !{} -; CHECK: !7 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang", isOptimized: true, flags: "-O2", runtimeVersion: 2, splitDebugFilename: "abc.debug", emissionKind: 3, enums: !2, retainedTypes: !3, subprograms: !4, globals: !5, imports: !6, dwoId: 42) -!7 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang", +; CHECK: !8 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang", isOptimized: true, flags: "-O2", runtimeVersion: 2, splitDebugFilename: "abc.debug", emissionKind: 3, enums: !2, retainedTypes: !3, subprograms: !4, globals: !5, imports: !6, macros: !7, dwoId: 42) +!8 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang", isOptimized: true, flags: "-O2", runtimeVersion: 2, 
splitDebugFilename: "abc.debug", emissionKind: 3, enums: !2, retainedTypes: !3, subprograms: !4, - globals: !5, imports: !6, dwoId: 42) + globals: !5, imports: !6, macros: !7, dwoId: 42) -; CHECK: !8 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, isOptimized: false, runtimeVersion: 0, emissionKind: 0) -!8 = distinct !DICompileUnit(language: 12, file: !1, producer: "", +; CHECK: !9 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, isOptimized: false, runtimeVersion: 0, emissionKind: 0) +!9 = distinct !DICompileUnit(language: 12, file: !1, producer: "", isOptimized: false, flags: "", runtimeVersion: 0, splitDebugFilename: "", emissionKind: 0) diff --git a/test/CodeGen/AArch64/aarch64-interleaved-accesses.ll b/test/CodeGen/AArch64/aarch64-interleaved-accesses.ll index 545aeda88602..1bc2a3ccb1ca 100644 --- a/test/CodeGen/AArch64/aarch64-interleaved-accesses.ll +++ b/test/CodeGen/AArch64/aarch64-interleaved-accesses.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=aarch64 -aarch64-neon-syntax=generic -lower-interleaved-accesses=true < %s | FileCheck %s -check-prefix=NEON -; RUN: llc -march=aarch64 -mattr=-neon -lower-interleaved-accesses=true < %s | FileCheck %s -check-prefix=NONEON +; RUN: llc -mtriple=aarch64 -lower-interleaved-accesses=true < %s | FileCheck %s -check-prefix=NEON +; RUN: llc -mtriple=aarch64 -lower-interleaved-accesses=true -mattr=-neon < %s | FileCheck %s -check-prefix=NONEON ; NEON-LABEL: load_factor2: ; NEON: ld2 { v0.8b, v1.8b }, [x0] @@ -232,3 +232,39 @@ define void @store_undef_mask_factor4(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, < store <16 x i32> %interleaved.vec, <16 x i32>* %base, align 4 ret void } + +; Check that we do something sane with illegal types. + +; NEON-LABEL: load_illegal_factor2: +; NEON: BB#0: +; NEON-NEXT: ldr q[[V:[0-9]+]], [x0] +; NEON-NEXT: uzp1 v0.4s, v[[V]].4s, v{{.*}}.4s +; NEON-NEXT: ret +; NONEON-LABEL: load_illegal_factor2: +; NONEON: BB#0: +; NONEON-NEXT: ldr s0, [x0] +; NONEON-NEXT: ldr s1, [x0, #8] +; NONEON-NEXT: ret +define <3 x float> @load_illegal_factor2(<3 x float>* %p) nounwind { + %tmp1 = load <3 x float>, <3 x float>* %p, align 16 + %tmp2 = shufflevector <3 x float> %tmp1, <3 x float> undef, <3 x i32> + ret <3 x float> %tmp2 +} + +; NEON-LABEL: store_illegal_factor2: +; NEON: BB#0: +; NEON-NEXT: uzp1 v0.4s, v0.4s, v{{.*}}.4s +; NEON-NEXT: st1 { v0.d }[0], [x0] +; NEON-NEXT: ret +; NONEON-LABEL: store_illegal_factor2: +; NONEON: BB#0: +; NONEON-NEXT: fmov w[[ELT2:[0-9]+]], s2 +; NONEON-NEXT: fmov w[[RES:[0-9]+]], s0 +; NONEON-NEXT: bfi x[[RES]], x[[ELT2]], #32, #32 +; NONEON-NEXT: str x[[RES]], [x0] +; NONEON-NEXT: ret +define void @store_illegal_factor2(<3 x float>* %p, <3 x float> %v) nounwind { + %tmp1 = shufflevector <3 x float> %v, <3 x float> undef, <3 x i32> + store <3 x float> %tmp1, <3 x float>* %p, align 16 + ret void +} diff --git a/test/CodeGen/AArch64/arm64-fast-isel-br.ll b/test/CodeGen/AArch64/arm64-fast-isel-br.ll index 0ef7b143df80..55c9c6036ed5 100644 --- a/test/CodeGen/AArch64/arm64-fast-isel-br.ll +++ b/test/CodeGen/AArch64/arm64-fast-isel-br.ll @@ -94,9 +94,7 @@ entry: store i32 %c, i32* %c.addr, align 4 store i64 %d, i64* %d.addr, align 8 %0 = load i16, i16* %b.addr, align 2 -; CHECK: and w0, w0, #0x1 -; CHECK: cmp w0, #0 -; CHECK: b.eq LBB4_2 +; CHECK: tbz w0, #0, LBB4_2 %conv = trunc i16 %0 to i1 br i1 %conv, label %if.then, label %if.end @@ -106,9 +104,7 @@ if.then: ; preds = %entry if.end: ; preds = %if.then, %entry %1 = load i32, i32* %c.addr, align 4 -; CHECK: and w[[REG:[0-9]+]], 
w{{[0-9]+}}, #0x1 -; CHECK: cmp w[[REG]], #0 -; CHECK: b.eq LBB4_4 +; CHECK: tbz w{{[0-9]+}}, #0, LBB4_4 %conv1 = trunc i32 %1 to i1 br i1 %conv1, label %if.then3, label %if.end4 @@ -118,8 +114,7 @@ if.then3: ; preds = %if.end if.end4: ; preds = %if.then3, %if.end %2 = load i64, i64* %d.addr, align 8 -; CHECK: cmp w{{[0-9]+}}, #0 -; CHECK: b.eq LBB4_6 +; CHECK: tbz w{{[0-9]+}}, #0, LBB4_6 %conv5 = trunc i64 %2 to i1 br i1 %conv5, label %if.then7, label %if.end8 @@ -139,9 +134,7 @@ define i32 @trunc64(i64 %foo) nounwind { ; CHECK: trunc64 ; CHECK: and [[REG1:x[0-9]+]], x0, #0x1 ; CHECK: mov x[[REG2:[0-9]+]], [[REG1]] -; CHECK: and [[REG3:w[0-9]+]], w[[REG2]], #0x1 -; CHECK: cmp [[REG3]], #0 -; CHECK: b.eq LBB5_2 +; CHECK: tbz w[[REG2]], #0, LBB5_2 %a = and i64 %foo, 1 %b = trunc i64 %a to i1 br i1 %b, label %if.then, label %if.else diff --git a/test/CodeGen/AArch64/cxx-tlscc.ll b/test/CodeGen/AArch64/cxx-tlscc.ll new file mode 100644 index 000000000000..39f6c0fbec94 --- /dev/null +++ b/test/CodeGen/AArch64/cxx-tlscc.ll @@ -0,0 +1,77 @@ +; RUN: llc < %s -mtriple=aarch64-apple-ios | FileCheck %s +; RUN: llc < %s -mtriple=aarch64-apple-ios -enable-shrink-wrap=true | FileCheck --check-prefix=CHECK %s +; Shrink wrapping currently does not kick in because we have a TLS CALL +; in the entry block and it will clobber the link register. + +%struct.S = type { i8 } + +@sg = internal thread_local global %struct.S zeroinitializer, align 1 +@__dso_handle = external global i8 +@__tls_guard = internal thread_local unnamed_addr global i1 false + +declare %struct.S* @_ZN1SC1Ev(%struct.S* returned) +declare %struct.S* @_ZN1SD1Ev(%struct.S* returned) +declare i32 @_tlv_atexit(void (i8*)*, i8*, i8*) + +define cxx_fast_tlscc nonnull %struct.S* @_ZTW2sg() { + %.b.i = load i1, i1* @__tls_guard, align 1 + br i1 %.b.i, label %__tls_init.exit, label %init.i + +init.i: + store i1 true, i1* @__tls_guard, align 1 + %call.i.i = tail call %struct.S* @_ZN1SC1Ev(%struct.S* nonnull @sg) + %1 = tail call i32 @_tlv_atexit(void (i8*)* nonnull bitcast (%struct.S* (%struct.S*)* @_ZN1SD1Ev to void (i8*)*), i8* nonnull getelementptr inbounds (%struct.S, %struct.S* @sg, i64 0, i32 0), i8* nonnull @__dso_handle) + br label %__tls_init.exit + +__tls_init.exit: + ret %struct.S* @sg +} + +; CHECK-LABEL: _ZTW2sg +; CHECK-DAG: stp d31, d30 +; CHECK-DAG: stp d29, d28 +; CHECK-DAG: stp d27, d26 +; CHECK-DAG: stp d25, d24 +; CHECK-DAG: stp d23, d22 +; CHECK-DAG: stp d21, d20 +; CHECK-DAG: stp d19, d18 +; CHECK-DAG: stp d17, d16 +; CHECK-DAG: stp d7, d6 +; CHECK-DAG: stp d5, d4 +; CHECK-DAG: stp d3, d2 +; CHECK-DAG: stp d1, d0 +; CHECK-DAG: stp x20, x19 +; CHECK-DAG: stp x14, x13 +; CHECK-DAG: stp x12, x11 +; CHECK-DAG: stp x10, x9 +; CHECK-DAG: stp x8, x7 +; CHECK-DAG: stp x6, x5 +; CHECK-DAG: stp x4, x3 +; CHECK-DAG: stp x2, x1 +; CHECK-DAG: stp x29, x30 +; CHECK: blr +; CHECK: tbnz w{{.*}}, #0, [[BB_end:.?LBB0_[0-9]+]] +; CHECK: blr +; CHECK: tlv_atexit +; CHECK: [[BB_end]]: +; CHECK: blr +; CHECK-DAG: ldp x2, x1 +; CHECK-DAG: ldp x4, x3 +; CHECK-DAG: ldp x6, x5 +; CHECK-DAG: ldp x8, x7 +; CHECK-DAG: ldp x10, x9 +; CHECK-DAG: ldp x12, x11 +; CHECK-DAG: ldp x14, x13 +; CHECK-DAG: ldp x20, x19 +; CHECK-DAG: ldp d1, d0 +; CHECK-DAG: ldp d3, d2 +; CHECK-DAG: ldp d5, d4 +; CHECK-DAG: ldp d7, d6 +; CHECK-DAG: ldp d17, d16 +; CHECK-DAG: ldp d19, d18 +; CHECK-DAG: ldp d21, d20 +; CHECK-DAG: ldp d23, d22 +; CHECK-DAG: ldp d25, d24 +; CHECK-DAG: ldp d27, d26 +; CHECK-DAG: ldp d29, d28 +; CHECK-DAG: ldp d31, d30 diff --git 
a/test/CodeGen/AArch64/divrem.ll b/test/CodeGen/AArch64/divrem.ll new file mode 100644 index 000000000000..9f648eb63eac --- /dev/null +++ b/test/CodeGen/AArch64/divrem.ll @@ -0,0 +1,22 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu < %s -mattr=+neon | FileCheck %s + +; SDIVREM/UDIVREM DAG nodes are generated but expanded when lowering and +; should not generate select error. +define <2 x i32> @test_udivrem(<2 x i32> %x, < 2 x i32> %y, < 2 x i32>* %z) { +; CHECK-LABEL: test_udivrem +; CHECK-DAG: udivrem +; CHECK-NOT: LLVM ERROR: Cannot select + %div = udiv <2 x i32> %x, %y + store <2 x i32> %div, <2 x i32>* %z + %1 = urem <2 x i32> %x, %y + ret <2 x i32> %1 +} + +define <4 x i32> @test_sdivrem(<4 x i32> %x, <4 x i32>* %y) { +; CHECK-LABEL: test_sdivrem +; CHECK-DAG: sdivrem + %div = sdiv <4 x i32> %x, < i32 20, i32 20, i32 20, i32 20 > + store <4 x i32> %div, <4 x i32>* %y + %1 = srem <4 x i32> %x, < i32 20, i32 20, i32 20, i32 20 > + ret <4 x i32> %1 +} diff --git a/test/CodeGen/AArch64/fast-isel-branch-cond-mask.ll b/test/CodeGen/AArch64/fast-isel-branch-cond-mask.ll index c018b2778b04..55fbf63319ee 100644 --- a/test/CodeGen/AArch64/fast-isel-branch-cond-mask.ll +++ b/test/CodeGen/AArch64/fast-isel-branch-cond-mask.ll @@ -4,8 +4,7 @@ define void @test(i64 %a, i64 %b, i2* %c) { ; CHECK-LABEL: test ; CHECK: and [[REG1:w[0-9]+]], w8, #0x3 ; CHECK-NEXT: strb [[REG1]], {{\[}}x2{{\]}} -; CHECK: and [[REG2:w[0-9]+]], w8, #0x1 -; CHECK-NEXT: cmp [[REG2]], #0 +; CHECK-NEXT: tbz w9, #0, %1 = trunc i64 %a to i2 %2 = trunc i64 %b to i1 ; Force fast-isel to fall back to SDAG. diff --git a/test/CodeGen/AArch64/fast-isel-branch-cond-split.ll b/test/CodeGen/AArch64/fast-isel-branch-cond-split.ll index 5248b9253e7a..e04a62b85c8e 100644 --- a/test/CodeGen/AArch64/fast-isel-branch-cond-split.ll +++ b/test/CodeGen/AArch64/fast-isel-branch-cond-split.ll @@ -44,9 +44,7 @@ bb4: ; CHECK-NEXT: cmp w1, #0 ; CHECK-NEXT: cset w9, eq ; CHECK-NEXT: orr w8, w8, w9 -; CHECK-NEXT: and w8, w8, #0x1 -; CHECK-NEXT: cmp w8, #0 -; CHECK-NEXT: b.ne +; CHECK-NEXT: tbnz w8, #0, define i64 @test_or_unpredictable(i32 %a, i32 %b) { bb1: %0 = icmp eq i32 %a, 0 @@ -68,9 +66,7 @@ bb4: ; CHECK-NEXT: cmp w1, #0 ; CHECK-NEXT: cset w9, ne ; CHECK-NEXT: and w8, w8, w9 -; CHECK-NEXT: and w8, w8, #0x1 -; CHECK-NEXT: cmp w8, #0 -; CHECK-NEXT: b.eq +; CHECK-NEXT: tbz w8, #0, define i64 @test_and_unpredictable(i32 %a, i32 %b) { bb1: %0 = icmp ne i32 %a, 0 diff --git a/test/CodeGen/AArch64/fold-constants.ll b/test/CodeGen/AArch64/fold-constants.ll index 3f70f0a7e9f9..c0fec4d171cd 100644 --- a/test/CodeGen/AArch64/fold-constants.ll +++ b/test/CodeGen/AArch64/fold-constants.ll @@ -16,3 +16,19 @@ entry: %vget_lane = extractelement <1 x i64> %4, i32 0 ret i64 %vget_lane } + +; PR25763 - folding constant vector comparisons with sign-extended result +define <8 x i16> @dotests_458() { +; CHECK-LABEL: dotests_458 +; CHECK: movi d0, #0x00000000ff0000 +; CHECK-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-NEXT: ret +entry: + %vclz_v.i = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> , i1 false) #6 + %vsra_n = lshr <8 x i8> %vclz_v.i, + %name_6 = or <8 x i8> %vsra_n, + %cmp.i603 = icmp slt <8 x i8> %name_6, + %vmovl.i4.i = sext <8 x i1> %cmp.i603 to <8 x i16> + ret <8 x i16> %vmovl.i4.i +} +declare <8 x i8> @llvm.ctlz.v8i8(<8 x i8>, i1) diff --git a/test/CodeGen/AArch64/fp16-v4-instructions.ll b/test/CodeGen/AArch64/fp16-v4-instructions.ll index 0dbda152fca9..f6e4bdf73459 100644 --- a/test/CodeGen/AArch64/fp16-v4-instructions.ll +++ 
b/test/CodeGen/AArch64/fp16-v4-instructions.ll @@ -130,7 +130,6 @@ define <4 x i16> @bitcast_h_to_i(float, <4 x half> %a) { ret <4 x i16> %2 } - define <4 x half> @sitofp_i8(<4 x i8> %a) #0 { ; CHECK-LABEL: sitofp_i8: ; CHECK-NEXT: shl [[OP1:v[0-9]+\.4h]], v0.4h, #8 @@ -218,4 +217,54 @@ define <4 x half> @uitofp_i64(<4 x i64> %a) #0 { ret <4 x half> %1 } +define void @test_insert_at_zero(half %a, <4 x half>* %b) #0 { +; CHECK-LABEL: test_insert_at_zero: +; CHECK-NEXT: str d0, [x0] +; CHECK-NEXT: ret + %1 = insertelement <4 x half> undef, half %a, i64 0 + store <4 x half> %1, <4 x half>* %b, align 4 + ret void +} + +define <4 x i8> @fptosi_i8(<4 x half> %a) #0 { +; CHECK-LABEL: fptosi_i8: +; CHECK-NEXT: fcvtl [[REG1:v[0-9]+\.4s]], v0.4h +; CHECK-NEXT: fcvtzs [[REG2:v[0-9]+\.4s]], [[REG1]] +; CHECK-NEXT: xtn v0.4h, [[REG2]] +; CHECK-NEXT: ret + %1 = fptosi<4 x half> %a to <4 x i8> + ret <4 x i8> %1 +} + +define <4 x i16> @fptosi_i16(<4 x half> %a) #0 { +; CHECK-LABEL: fptosi_i16: +; CHECK-NEXT: fcvtl [[REG1:v[0-9]+\.4s]], v0.4h +; CHECK-NEXT: fcvtzs [[REG2:v[0-9]+\.4s]], [[REG1]] +; CHECK-NEXT: xtn v0.4h, [[REG2]] +; CHECK-NEXT: ret + %1 = fptosi<4 x half> %a to <4 x i16> + ret <4 x i16> %1 +} + +define <4 x i8> @fptoui_i8(<4 x half> %a) #0 { +; CHECK-LABEL: fptoui_i8: +; CHECK-NEXT: fcvtl [[REG1:v[0-9]+\.4s]], v0.4h +; NOTE: fcvtzs selected here because the xtn shaves the sign bit +; CHECK-NEXT: fcvtzs [[REG2:v[0-9]+\.4s]], [[REG1]] +; CHECK-NEXT: xtn v0.4h, [[REG2]] +; CHECK-NEXT: ret + %1 = fptoui<4 x half> %a to <4 x i8> + ret <4 x i8> %1 +} + +define <4 x i16> @fptoui_i16(<4 x half> %a) #0 { +; CHECK-LABEL: fptoui_i16: +; CHECK-NEXT: fcvtl [[REG1:v[0-9]+\.4s]], v0.4h +; CHECK-NEXT: fcvtzu [[REG2:v[0-9]+\.4s]], [[REG1]] +; CHECK-NEXT: xtn v0.4h, [[REG2]] +; CHECK-NEXT: ret + %1 = fptoui<4 x half> %a to <4 x i16> + ret <4 x i16> %1 +} + attributes #0 = { nounwind } diff --git a/test/CodeGen/AArch64/fp16-v8-instructions.ll b/test/CodeGen/AArch64/fp16-v8-instructions.ll index 10a8c22d6f7e..137d1f358a30 100644 --- a/test/CodeGen/AArch64/fp16-v8-instructions.ll +++ b/test/CodeGen/AArch64/fp16-v8-instructions.ll @@ -358,4 +358,67 @@ define <8 x half> @uitofp_i64(<8 x i64> %a) #0 { ret <8 x half> %1 } +define void @test_insert_at_zero(half %a, <8 x half>* %b) #0 { +; CHECK-LABEL: test_insert_at_zero: +; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: ret + %1 = insertelement <8 x half> undef, half %a, i64 0 + store <8 x half> %1, <8 x half>* %b, align 4 + ret void +} + +define <8 x i8> @fptosi_i8(<8 x half> %a) #0 { +; CHECK-LABEL: fptosi_i8: +; CHECK-DAG: fcvtl [[LO:v[0-9]+\.4s]], v0.4h +; CHECK-DAG: fcvtl2 [[HI:v[0-9]+\.4s]], v0.8h +; CHECK-DAG: fcvtzs [[LOF32:v[0-9]+\.4s]], [[LO]] +; CHECK-DAG: xtn [[I16:v[0-9]+]].4h, [[LOF32]] +; CHECK-DAG: fcvtzs [[HIF32:v[0-9]+\.4s]], [[HI]] +; CHECK-DAG: xtn2 [[I16]].8h, [[HIF32]] +; CHECK-NEXT: xtn v0.8b, [[I16]].8h +; CHECK-NEXT: ret + %1 = fptosi<8 x half> %a to <8 x i8> + ret <8 x i8> %1 +} + +define <8 x i16> @fptosi_i16(<8 x half> %a) #0 { +; CHECK-LABEL: fptosi_i16: +; CHECK-DAG: fcvtl [[LO:v[0-9]+\.4s]], v0.4h +; CHECK-DAG: fcvtl2 [[HI:v[0-9]+\.4s]], v0.8h +; CHECK-DAG: fcvtzs [[LOF32:v[0-9]+\.4s]], [[LO]] +; CHECK-DAG: xtn [[I16:v[0-9]+]].4h, [[LOF32]] +; CHECK-DAG: fcvtzs [[HIF32:v[0-9]+\.4s]], [[HI]] +; CHECK-NEXT: xtn2 [[I16]].8h, [[HIF32]] +; CHECK-NEXT: ret + %1 = fptosi<8 x half> %a to <8 x i16> + ret <8 x i16> %1 +} + +define <8 x i8> @fptoui_i8(<8 x half> %a) #0 { +; CHECK-LABEL: fptoui_i8: +; CHECK-DAG: fcvtl [[LO:v[0-9]+\.4s]], v0.4h +; 
CHECK-DAG: fcvtl2 [[HI:v[0-9]+\.4s]], v0.8h +; CHECK-DAG: fcvtzu [[LOF32:v[0-9]+\.4s]], [[LO]] +; CHECK-DAG: xtn [[I16:v[0-9]+]].4h, [[LOF32]] +; CHECK-DAG: fcvtzu [[HIF32:v[0-9]+\.4s]], [[HI]] +; CHECK-DAG: xtn2 [[I16]].8h, [[HIF32]] +; CHECK-NEXT: xtn v0.8b, [[I16]].8h +; CHECK-NEXT: ret + %1 = fptoui<8 x half> %a to <8 x i8> + ret <8 x i8> %1 +} + +define <8 x i16> @fptoui_i16(<8 x half> %a) #0 { +; CHECK-LABEL: fptoui_i16: +; CHECK-DAG: fcvtl [[LO:v[0-9]+\.4s]], v0.4h +; CHECK-DAG: fcvtl2 [[HI:v[0-9]+\.4s]], v0.8h +; CHECK-DAG: fcvtzu [[LOF32:v[0-9]+\.4s]], [[LO]] +; CHECK-DAG: xtn [[I16:v[0-9]+]].4h, [[LOF32]] +; CHECK-DAG: fcvtzu [[HIF32:v[0-9]+\.4s]], [[HI]] +; CHECK-NEXT: xtn2 [[I16]].8h, [[HIF32]] +; CHECK-NEXT: ret + %1 = fptoui<8 x half> %a to <8 x i16> + ret <8 x i16> %1 +} + attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/global-constant.ll b/test/CodeGen/AMDGPU/global-constant.ll new file mode 100644 index 000000000000..b52a0a103130 --- /dev/null +++ b/test/CodeGen/AMDGPU/global-constant.ll @@ -0,0 +1,25 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +@readonly = private unnamed_addr addrspace(2) constant [4 x float] [float 0.0, float 1.0, float 2.0, float 3.0] +@readonly2 = private unnamed_addr addrspace(2) constant [4 x float] [float 4.0, float 5.0, float 6.0, float 7.0] + +; GCN-LABEL: {{^}}main: +; GCN: s_getpc_b64 s{{\[}}[[PC0_LO:[0-9]+]]:[[PC0_HI:[0-9]+]]{{\]}} +; GCN-NEXT: s_add_u32 s{{[0-9]+}}, s[[PC0_LO]], readonly +; GCN: s_addc_u32 s{{[0-9]+}}, s[[PC0_HI]], 0 +; GCN: s_getpc_b64 s{{\[}}[[PC1_LO:[0-9]+]]:[[PC1_HI:[0-9]+]]{{\]}} +; GCN-NEXT: s_add_u32 s{{[0-9]+}}, s[[PC1_LO]], readonly +; GCN: s_addc_u32 s{{[0-9]+}}, s[[PC1_HI]], 0 +; GCN: .text +; GCN: readonly: +; GCN: readonly2: +define void @main(i32 %index, float addrspace(1)* %out) { + %ptr = getelementptr [4 x float], [4 x float] addrspace(2) * @readonly, i32 0, i32 %index + %val = load float, float addrspace(2)* %ptr + store float %val, float addrspace(1)* %out + %ptr2 = getelementptr [4 x float], [4 x float] addrspace(2) * @readonly2, i32 0, i32 %index + %val2 = load float, float addrspace(2)* %ptr2 + store float %val2, float addrspace(1)* %out + ret void +} + diff --git a/test/CodeGen/AMDGPU/image-attributes.ll b/test/CodeGen/AMDGPU/image-attributes.ll index 7a5a7346865f..5906b2f15709 100644 --- a/test/CodeGen/AMDGPU/image-attributes.ll +++ b/test/CodeGen/AMDGPU/image-attributes.ll @@ -6,7 +6,7 @@ ; FUNC-LABEL: {{^}}width_2d: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[2].Z +; EG: MOV * [[VAL]], KC0[2].Z define void @width_2d (%opencl.image2d_t addrspace(1)* %in, i32 addrspace(1)* %out) { entry: @@ -19,7 +19,7 @@ entry: ; FUNC-LABEL: {{^}}width_3d: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[2].Z +; EG: MOV * [[VAL]], KC0[2].Z define void @width_3d (%opencl.image3d_t addrspace(1)* %in, i32 addrspace(1)* %out) { entry: @@ -36,7 +36,7 @@ entry: ; FUNC-LABEL: {{^}}height_2d: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[2].W +; EG: MOV * [[VAL]], KC0[2].W define void @height_2d (%opencl.image2d_t addrspace(1)* %in, i32 addrspace(1)* %out) { entry: @@ -49,7 +49,7 @@ entry: ; FUNC-LABEL: {{^}}height_3d: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[2].W +; EG: MOV * [[VAL]], KC0[2].W define void @height_3d (%opencl.image3d_t addrspace(1)* %in, i32 addrspace(1)* %out) { entry: @@ -66,7 +66,7 @@ entry: ; FUNC-LABEL: {{^}}depth_3d: 
; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[3].X +; EG: MOV * [[VAL]], KC0[3].X define void @depth_3d (%opencl.image3d_t addrspace(1)* %in, i32 addrspace(1)* %out) { entry: @@ -83,7 +83,7 @@ entry: ; FUNC-LABEL: {{^}}data_type_2d: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[3].Y +; EG: MOV * [[VAL]], KC0[3].Y define void @data_type_2d (%opencl.image2d_t addrspace(1)* %in, i32 addrspace(1)* %out) { entry: @@ -96,7 +96,7 @@ entry: ; FUNC-LABEL: {{^}}data_type_3d: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[3].Y +; EG: MOV * [[VAL]], KC0[3].Y define void @data_type_3d (%opencl.image3d_t addrspace(1)* %in, i32 addrspace(1)* %out) { entry: @@ -113,7 +113,7 @@ entry: ; FUNC-LABEL: {{^}}channel_order_2d: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[3].Z +; EG: MOV * [[VAL]], KC0[3].Z define void @channel_order_2d (%opencl.image2d_t addrspace(1)* %in, i32 addrspace(1)* %out) { entry: @@ -126,7 +126,7 @@ entry: ; FUNC-LABEL: {{^}}channel_order_3d: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[3].Z +; EG: MOV * [[VAL]], KC0[3].Z define void @channel_order_3d (%opencl.image3d_t addrspace(1)* %in, i32 addrspace(1)* %out) { entry: @@ -145,7 +145,7 @@ entry: ; ; FUNC-LABEL: {{^}}image_arg_2nd: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[4].Z +; EG: MOV * [[VAL]], KC0[4].Z define void @image_arg_2nd (%opencl.image3d_t addrspace(1)* %in1, i32 %x, %opencl.image2d_t addrspace(1)* %in2, diff --git a/test/CodeGen/AMDGPU/inline-constraints.ll b/test/CodeGen/AMDGPU/inline-constraints.ll new file mode 100644 index 000000000000..78868710c6a2 --- /dev/null +++ b/test/CodeGen/AMDGPU/inline-constraints.ll @@ -0,0 +1,23 @@ +; RUN: llc < %s -march=amdgcn -mcpu=bonaire -verify-machineinstrs | FileCheck --check-prefix=GCN %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=GCN %s + +; GCN-LABEL: {{^}}inline_reg_constraints: +; GCN: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +; GCN: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GCN: flat_load_dwordx4 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GCN: s_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] +; GCN: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] +; GCN: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] +; GCN: s_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] + +define void @inline_reg_constraints(i32 addrspace(1)* %ptr) { +entry: + %v32 = tail call i32 asm sideeffect "flat_load_dword $0, $1", "=v,v"(i32 addrspace(1)* %ptr) + %v64 = tail call <2 x i32> asm sideeffect "flat_load_dwordx2 $0, $1", "=v,v"(i32 addrspace(1)* %ptr) + %v128 = tail call <4 x i32> asm sideeffect "flat_load_dwordx4 $0, $1", "=v,v"(i32 addrspace(1)* %ptr) + %s32 = tail call i32 asm sideeffect "s_load_dword $0, $1", "=s,s"(i32 addrspace(1)* %ptr) + %s64 = tail call <2 x i32> asm sideeffect "s_load_dwordx2 $0, $1", "=s,s"(i32 addrspace(1)* %ptr) + %s128 = tail call <4 x i32> asm sideeffect "s_load_dwordx4 $0, $1", "=s,s"(i32 addrspace(1)* %ptr) + %s256 = tail call <8 x i32> asm sideeffect "s_load_dwordx8 $0, $1", "=s,s"(i32 addrspace(1)* %ptr) + ret void +} diff --git a/test/CodeGen/AMDGPU/literals.ll b/test/CodeGen/AMDGPU/literals.ll index cff1c24f89d6..9d2320cb2d19 100644 --- a/test/CodeGen/AMDGPU/literals.ll +++ b/test/CodeGen/AMDGPU/literals.ll @@ -7,8 +7,8 @@ ; ADD_INT literal.x KC0[2].Z, 5 ; CHECK: 
{{^}}i32_literal: -; CHECK: ADD_INT {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x -; CHECK-NEXT: LSHR +; CHECK: LSHR +; CHECK-NEXT: ADD_INT * {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.y ; CHECK-NEXT: 5 define void @i32_literal(i32 addrspace(1)* %out, i32 %in) { entry: @@ -24,8 +24,8 @@ entry: ; ADD literal.x KC0[2].Z, 5.0 ; CHECK: {{^}}float_literal: -; CHECK: ADD {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x -; CHECK-NEXT: LSHR +; CHECK: LSHR +; CHECK-NEXT: ADD * {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.y ; CHECK-NEXT: 1084227584(5.0 define void @float_literal(float addrspace(1)* %out, float %in) { entry: diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.read.workdim.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.read.workdim.ll index 6dc9d050eee6..2e299e30b8c7 100644 --- a/test/CodeGen/AMDGPU/llvm.AMDGPU.read.workdim.ll +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.read.workdim.ll @@ -4,7 +4,7 @@ ; FUNC-LABEL: {{^}}read_workdim: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[2].Z +; EG: MOV * [[VAL]], KC0[2].Z ; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xb ; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2c diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.trunc.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.trunc.ll index 74792e50017f..a30a8e083eb6 100644 --- a/test/CodeGen/AMDGPU/llvm.AMDGPU.trunc.ll +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.trunc.ll @@ -3,7 +3,7 @@ ; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI %s ; R600: {{^}}amdgpu_trunc: -; R600: TRUNC T{{[0-9]+\.[XYZW]}}, KC0[2].Z +; R600: TRUNC {{\*? *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z ; SI: {{^}}amdgpu_trunc: ; SI: v_trunc_f32 diff --git a/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll b/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll index f2a7256e812d..13ebee41e844 100644 --- a/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll +++ b/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll @@ -5,7 +5,7 @@ ; FUNC-LABEL: {{^}}local_size_x: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[1].Z +; EG: MOV * [[VAL]], KC0[1].Z ; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x6 ; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x18 @@ -23,7 +23,7 @@ entry: ; FUNC-LABEL: {{^}}local_size_y: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[1].W +; EG: MOV * [[VAL]], KC0[1].W ; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x7 ; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1c @@ -38,7 +38,7 @@ entry: ; FUNC-LABEL: {{^}}local_size_z: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[2].X +; EG: MOV * [[VAL]], KC0[2].X ; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8 ; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x20 diff --git a/test/CodeGen/AMDGPU/or.ll b/test/CodeGen/AMDGPU/or.ll index 1c04090b407f..e40f18f040b7 100644 --- a/test/CodeGen/AMDGPU/or.ll +++ b/test/CodeGen/AMDGPU/or.ll @@ -153,7 +153,7 @@ define void @trunc_i64_or_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) { } ; FUNC-LABEL: {{^}}or_i1: -; EG: OR_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], PS}} +; EG: OR_INT * {{\** *}}T{{[0-9]+\.[XYZW], PS, PV\.[XYZW]}} ; SI: s_or_b64 s[{{[0-9]+:[0-9]+}}], vcc, s[{{[0-9]+:[0-9]+}}] define void @or_i1(i32 addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) { diff --git a/test/CodeGen/AMDGPU/set-dx10.ll b/test/CodeGen/AMDGPU/set-dx10.ll index 53694dcffa66..57365a6e1fc3 100644 --- a/test/CodeGen/AMDGPU/set-dx10.ll +++ b/test/CodeGen/AMDGPU/set-dx10.ll @@ -5,8 +5,8 @@ ; 
SET*DX10 instructions. ; CHECK: {{^}}fcmp_une_select_fptosi: -; CHECK: SETNE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, -; CHECK-NEXT: LSHR +; CHECK: LSHR +; CHECK-NEXT: SETNE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y, ; CHECK-NEXT: 1084227584(5.000000e+00) define void @fcmp_une_select_fptosi(i32 addrspace(1)* %out, float %in) { entry: @@ -19,8 +19,8 @@ entry: } ; CHECK: {{^}}fcmp_une_select_i32: -; CHECK: SETNE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, -; CHECK-NEXT: LSHR +; CHECK: LSHR +; CHECK-NEXT: SETNE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y, ; CHECK-NEXT: 1084227584(5.000000e+00) define void @fcmp_une_select_i32(i32 addrspace(1)* %out, float %in) { entry: @@ -31,8 +31,8 @@ entry: } ; CHECK: {{^}}fcmp_oeq_select_fptosi: -; CHECK: SETE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, -; CHECK-NEXT: LSHR +; CHECK: LSHR +; CHECK-NEXT: SETE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y, ; CHECK-NEXT: 1084227584(5.000000e+00) define void @fcmp_oeq_select_fptosi(i32 addrspace(1)* %out, float %in) { entry: @@ -45,8 +45,8 @@ entry: } ; CHECK: {{^}}fcmp_oeq_select_i32: -; CHECK: SETE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, -; CHECK-NEXT: LSHR +; CHECK: LSHR +; CHECK-NEXT: SETE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y, ; CHECK-NEXT: 1084227584(5.000000e+00) define void @fcmp_oeq_select_i32(i32 addrspace(1)* %out, float %in) { entry: @@ -57,8 +57,8 @@ entry: } ; CHECK: {{^}}fcmp_ogt_select_fptosi: -; CHECK: SETGT_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, -; CHECK-NEXT: LSHR +; CHECK: LSHR +; CHECK-NEXT: SETGT_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y, ; CHECK-NEXT: 1084227584(5.000000e+00) define void @fcmp_ogt_select_fptosi(i32 addrspace(1)* %out, float %in) { entry: @@ -71,8 +71,8 @@ entry: } ; CHECK: {{^}}fcmp_ogt_select_i32: -; CHECK: SETGT_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, -; CHECK-NEXT: LSHR +; CHECK: LSHR +; CHECK-NEXT: SETGT_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y, ; CHECK-NEXT: 1084227584(5.000000e+00) define void @fcmp_ogt_select_i32(i32 addrspace(1)* %out, float %in) { entry: @@ -83,8 +83,8 @@ entry: } ; CHECK: {{^}}fcmp_oge_select_fptosi: -; CHECK: SETGE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, -; CHECK-NEXT: LSHR +; CHECK: LSHR +; CHECK-NEXT: SETGE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y, ; CHECK-NEXT: 1084227584(5.000000e+00) define void @fcmp_oge_select_fptosi(i32 addrspace(1)* %out, float %in) { entry: @@ -97,8 +97,8 @@ entry: } ; CHECK: {{^}}fcmp_oge_select_i32: -; CHECK: SETGE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, -; CHECK-NEXT: LSHR +; CHECK: LSHR +; CHECK-NEXT: SETGE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y, ; CHECK-NEXT: 1084227584(5.000000e+00) define void @fcmp_oge_select_i32(i32 addrspace(1)* %out, float %in) { entry: @@ -109,8 +109,8 @@ entry: } ; CHECK: {{^}}fcmp_ole_select_fptosi: -; CHECK: SETGE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z, -; CHECK-NEXT: LSHR +; CHECK: LSHR +; CHECK-NEXT: SETGE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.y, KC0[2].Z, ; CHECK-NEXT: 1084227584(5.000000e+00) define void @fcmp_ole_select_fptosi(i32 addrspace(1)* %out, float %in) { entry: @@ -123,8 +123,8 @@ entry: } ; CHECK: {{^}}fcmp_ole_select_i32: -; CHECK: SETGE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z, -; CHECK-NEXT: LSHR +; CHECK: LSHR +; CHECK-NEXT: SETGE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.y, KC0[2].Z, ; 
CHECK-NEXT: 1084227584(5.000000e+00) define void @fcmp_ole_select_i32(i32 addrspace(1)* %out, float %in) { entry: @@ -135,8 +135,8 @@ entry: } ; CHECK: {{^}}fcmp_olt_select_fptosi: -; CHECK: SETGT_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z, -; CHECK-NEXT: LSHR +; CHECK: LSHR +; CHECK-NEXT: SETGT_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.y, KC0[2].Z, ; CHECK-NEXT: 1084227584(5.000000e+00) define void @fcmp_olt_select_fptosi(i32 addrspace(1)* %out, float %in) { entry: @@ -149,8 +149,8 @@ entry: } ; CHECK: {{^}}fcmp_olt_select_i32: -; CHECK: SETGT_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z, -; CHECK-NEXT: LSHR +; CHECK: LSHR +; CHECK-NEXT: SETGT_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.y, KC0[2].Z, ; CHECK-NEXT: 1084227584(5.000000e+00) define void @fcmp_olt_select_i32(i32 addrspace(1)* %out, float %in) { entry: diff --git a/test/CodeGen/AMDGPU/sext-in-reg.ll b/test/CodeGen/AMDGPU/sext-in-reg.ll index 95fcfdbdecae..23ae3b967971 100644 --- a/test/CodeGen/AMDGPU/sext-in-reg.ll +++ b/test/CodeGen/AMDGPU/sext-in-reg.ll @@ -12,8 +12,8 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone ; SI: buffer_store_dword [[EXTRACT]], ; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]] -; EG: BFE_INT [[RES]], {{.*}}, 0.0, 1 -; EG-NEXT: LSHR * [[ADDR]] +; EG: LSHR * [[ADDR]] +; EG: BFE_INT * [[RES]], {{.*}}, 0.0, 1 define void @sext_in_reg_i1_i32(i32 addrspace(1)* %out, i32 %in) { %shl = shl i32 %in, 31 %sext = ashr i32 %shl, 31 diff --git a/test/CodeGen/AMDGPU/shl.ll b/test/CodeGen/AMDGPU/shl.ll index bf08e66f3304..55db80731c90 100644 --- a/test/CodeGen/AMDGPU/shl.ll +++ b/test/CodeGen/AMDGPU/shl.ll @@ -53,14 +53,14 @@ define void @shl_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in ret void } -;EG: {{^}}shl_i64: +;EG-LABEL: {{^}}shl_i64: ;EG: SUB_INT {{\*? *}}[[COMPSH:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHIFT:T[0-9]+\.[XYZW]]] ;EG: LSHR {{\* *}}[[TEMP:T[0-9]+\.[XYZW]]], [[OPLO:T[0-9]+\.[XYZW]]], {{[[COMPSH]]|PV.[XYZW]}} -;EG: LSHR {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1 -;EG_CHECK-DAG: ADD_INT {{\*? *}}[[BIGSH:T[0-9]+\.[XYZW]]], [[SHIFT]], literal +;EG-DAG: ADD_INT {{\*? *}}[[BIGSH:T[0-9]+\.[XYZW]]], [[SHIFT]], literal +;EG-DAG: LSHR {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1 ;EG-DAG: LSHL {{\*? *}}[[HISMTMP:T[0-9]+\.[XYZW]]], [[OPHI:T[0-9]+\.[XYZW]]], [[SHIFT]] -;EG-DAG: OR_INT {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], {{[[HISMTMP]]|PV.[XYZW]}}, {{[[OVERF]]|PV.[XYZW]}} -;EG-DAG: LSHL {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], [[OPLO]], {{PS|[[SHIFT]]}} +;EG-DAG: OR_INT {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], {{[[HISMTMP]]|PV.[XYZW]|PS}}, {{[[OVERF]]|PV.[XYZW]}} +;EG-DAG: LSHL {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], [[OPLO]], {{PS|[[SHIFT]]|PV.[XYZW]}} ;EG-DAG: SETGT_UINT {{\*? *}}[[RESC:T[0-9]+\.[XYZW]]], [[SHIFT]], literal ;EG-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}} ;EG-DAG: CNDE_INT {{\*? *}}[[RESHI:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW], .*}}, 0.0 @@ -80,7 +80,7 @@ define void @shl_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { ret void } -;EG: {{^}}shl_v2i64: +;EG-LABEL: {{^}}shl_v2i64: ;EG-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]] ;EG-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]] ;EG-DAG: LSHR {{\*? 
*}}[[COMPSHA]] diff --git a/test/CodeGen/AMDGPU/sra.ll b/test/CodeGen/AMDGPU/sra.ll index bcbc32f4c053..3b59bbfb18c0 100644 --- a/test/CodeGen/AMDGPU/sra.ll +++ b/test/CodeGen/AMDGPU/sra.ll @@ -70,11 +70,11 @@ entry: ;EG-LABEL: {{^}}ashr_i64_2: ;EG: SUB_INT {{\*? *}}[[COMPSH:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHIFT:T[0-9]+\.[XYZW]]] ;EG: LSHL {{\* *}}[[TEMP:T[0-9]+\.[XYZW]]], [[OPHI:T[0-9]+\.[XYZW]]], {{[[COMPSH]]|PV.[XYZW]}} -;EG: LSHL {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1 -;EG_CHECK-DAG: ADD_INT {{\*? *}}[[BIGSH:T[0-9]+\.[XYZW]]], [[SHIFT]], literal +;EG-DAG: ADD_INT {{\*? *}}[[BIGSH:T[0-9]+\.[XYZW]]], [[SHIFT]], literal +;EG-DAG: LSHL {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1 ;EG-DAG: LSHR {{\*? *}}[[LOSMTMP:T[0-9]+\.[XYZW]]], [[OPLO:T[0-9]+\.[XYZW]]], [[SHIFT]] -;EG-DAG: OR_INT {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], {{[[LOSMTMP]]|PV.[XYZW]}}, {{[[OVERF]]|PV.[XYZW]}} -;EG-DAG: ASHR {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], [[OPHI]], {{PS|[[SHIFT]]}} +;EG-DAG: OR_INT {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], {{[[LOSMTMP]]|PV.[XYZW]|PS}}, {{[[OVERF]]|PV.[XYZW]}} +;EG-DAG: ASHR {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], [[OPHI]], {{PS|PV.[XYZW]|[[SHIFT]]}} ;EG-DAG: ASHR {{\*? *}}[[LOBIG:T[0-9]+\.[XYZW]]], [[OPHI]], literal ;EG-DAG: ASHR {{\*? *}}[[HIBIG:T[0-9]+\.[XYZW]]], [[OPHI]], literal ;EG-DAG: SETGT_UINT {{\*? *}}[[RESC:T[0-9]+\.[XYZW]]], [[SHIFT]], literal diff --git a/test/CodeGen/AMDGPU/srl.ll b/test/CodeGen/AMDGPU/srl.ll index ebb2f2db252e..bbd954356322 100644 --- a/test/CodeGen/AMDGPU/srl.ll +++ b/test/CodeGen/AMDGPU/srl.ll @@ -65,14 +65,14 @@ define void @lshr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %i ; EG: SUB_INT {{\*? *}}[[COMPSH:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHIFT:T[0-9]+\.[XYZW]]] ; EG: LSHL {{\* *}}[[TEMP:T[0-9]+\.[XYZW]]], [[OPHI:T[0-9]+\.[XYZW]]], {{[[COMPSH]]|PV.[XYZW]}} -; EG: LSHL {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1 ; EG-DAG: ADD_INT {{\*? *}}[[BIGSH:T[0-9]+\.[XYZW]]], [[SHIFT]], literal +; EG-DAG: LSHL {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1 ; EG-DAG: LSHR {{\*? *}}[[LOSMTMP:T[0-9]+\.[XYZW]]], [[OPLO:T[0-9]+\.[XYZW]]], [[SHIFT]] -; EG-DAG: OR_INT {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], {{[[LOSMTMP]]|PV.[XYZW]}}, {{[[OVERF]]|PV.[XYZW]}} -; EG-DAG: LSHR {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], [[OPHI]], {{PS|[[SHIFT]]}} -; EG-DAG: LSHR {{\*? *}}[[LOBIG:T[0-9]+\.[XYZW]]], [[OPHI]], {{PS|[[SHIFT]]}} +; EG-DAG: OR_INT {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], {{[[LOSMTMP]]|PV.[XYZW]|PS}}, {{[[OVERF]]|PV.[XYZW]}} +; EG-DAG: LSHR {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], [[OPHI]], {{PS|[[SHIFT]]|PV\.[XYZW]}} ; EG-DAG: SETGT_UINT {{\*? *}}[[RESC:T[0-9]+\.[XYZW]]], [[SHIFT]], literal -; EG-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}} +; EG-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]|PS}} +; EG-DAG: LSHR {{\*? *}}[[LOBIG:T[0-9]+\.[XYZW]]], [[OPHI]], [[SHIFT]] ; EG-DAG: CNDE_INT {{\*? 
*}}[[RESHI:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW], .*}}, 0.0 define void @lshr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1 diff --git a/test/CodeGen/AMDGPU/unsupported-cc.ll b/test/CodeGen/AMDGPU/unsupported-cc.ll index 8ab4faf2f145..d120111a71fb 100644 --- a/test/CodeGen/AMDGPU/unsupported-cc.ll +++ b/test/CodeGen/AMDGPU/unsupported-cc.ll @@ -3,8 +3,8 @@ ; These tests are for condition codes that are not supported by the hardware ; CHECK-LABEL: {{^}}slt: -; CHECK: SETGT_INT {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z -; CHECK-NEXT: LSHR +; CHECK: LSHR +; CHECK-NEXT: SETGT_INT {{\** *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z ; CHECK-NEXT: 5(7.006492e-45) define void @slt(i32 addrspace(1)* %out, i32 %in) { entry: @@ -15,8 +15,8 @@ entry: } ; CHECK-LABEL: {{^}}ult_i32: -; CHECK: SETGT_UINT {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z -; CHECK-NEXT: LSHR +; CHECK: LSHR +; CHECK-NEXT: SETGT_UINT {{\** *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z ; CHECK-NEXT: 5(7.006492e-45) define void @ult_i32(i32 addrspace(1)* %out, i32 %in) { entry: @@ -40,8 +40,8 @@ entry: } ; CHECK-LABEL: {{^}}ult_float_native: -; CHECK: SETGE T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x -; CHECK-NEXT: LSHR * +; CHECK: LSHR +; CHECK-NEXT: SETGE {{\*? *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, {{literal\.[xy]}} ; CHECK-NEXT: 1084227584(5.000000e+00) define void @ult_float_native(float addrspace(1)* %out, float %in) { entry: @@ -52,8 +52,8 @@ entry: } ; CHECK-LABEL: {{^}}olt: -; CHECK: SETGT T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z -; CHECK-NEXT: LSHR * +; CHECK: LSHR +; CHECK-NEXT: SETGT {{\*? *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z ; CHECK-NEXT: 1084227584(5.000000e+00) define void @olt(float addrspace(1)* %out, float %in) { entry: @@ -64,8 +64,8 @@ entry: } ; CHECK-LABEL: {{^}}sle: -; CHECK: SETGT_INT {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z -; CHECK-NEXT: LSHR +; CHECK: LSHR +; CHECK-NEXT: SETGT_INT {{\** *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z ; CHECK-NEXT: 6(8.407791e-45) define void @sle(i32 addrspace(1)* %out, i32 %in) { entry: @@ -76,8 +76,8 @@ entry: } ; CHECK-LABEL: {{^}}ule_i32: -; CHECK: SETGT_UINT {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z -; CHECK-NEXT: LSHR +; CHECK: LSHR +; CHECK-NEXT: SETGT_UINT {{\** *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z ; CHECK-NEXT: 6(8.407791e-45) define void @ule_i32(i32 addrspace(1)* %out, i32 %in) { entry: @@ -101,8 +101,8 @@ entry: } ; CHECK-LABEL: {{^}}ule_float_native: -; CHECK: SETGT T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x -; CHECK-NEXT: LSHR * +; CHECK: LSHR +; CHECK-NEXT: SETGT {{\*? *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, {{literal\.[xy]}} ; CHECK-NEXT: 1084227584(5.000000e+00) define void @ule_float_native(float addrspace(1)* %out, float %in) { entry: @@ -113,8 +113,8 @@ entry: } ; CHECK-LABEL: {{^}}ole: -; CHECK: SETGE T{{[0-9]\.[XYZW]}}, literal.x, KC0[2].Z -; CHECK-NEXT: LSHR * +; CHECK: LSHR +; CHECK-NEXT: SETGE {{\*? *}}T{{[0-9]\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z ; CHECK-NEXT:1084227584(5.000000e+00) define void @ole(float addrspace(1)* %out, float %in) { entry: diff --git a/test/CodeGen/AMDGPU/work-item-intrinsics.ll b/test/CodeGen/AMDGPU/work-item-intrinsics.ll index a704a23b0f92..f420ec9c7d23 100644 --- a/test/CodeGen/AMDGPU/work-item-intrinsics.ll +++ b/test/CodeGen/AMDGPU/work-item-intrinsics.ll @@ -7,7 +7,7 @@ ; FUNC-LABEL: {{^}}ngroups_x: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[0].X +; EG: MOV {{\*? 
*}}[[VAL]], KC0[0].X ; HSA: .amd_kernel_code_t @@ -38,7 +38,7 @@ entry: ; FUNC-LABEL: {{^}}ngroups_y: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[0].Y +; EG: MOV {{\*? *}}[[VAL]], KC0[0].Y ; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1 ; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4 @@ -53,7 +53,7 @@ entry: ; FUNC-LABEL: {{^}}ngroups_z: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[0].Z +; EG: MOV {{\*? *}}[[VAL]], KC0[0].Z ; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2 ; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8 @@ -68,7 +68,7 @@ entry: ; FUNC-LABEL: {{^}}global_size_x: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[0].W +; EG: MOV {{\*? *}}[[VAL]], KC0[0].W ; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x3 ; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xc @@ -83,7 +83,7 @@ entry: ; FUNC-LABEL: {{^}}global_size_y: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[1].X +; EG: MOV {{\*? *}}[[VAL]], KC0[1].X ; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4 ; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x10 @@ -98,7 +98,7 @@ entry: ; FUNC-LABEL: {{^}}global_size_z: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[1].Y +; EG: MOV {{\*? *}}[[VAL]], KC0[1].Y ; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x5 ; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x14 diff --git a/test/CodeGen/AMDGPU/xor.ll b/test/CodeGen/AMDGPU/xor.ll index ddb920af29d8..655655d92f08 100644 --- a/test/CodeGen/AMDGPU/xor.ll +++ b/test/CodeGen/AMDGPU/xor.ll @@ -38,7 +38,7 @@ define void @xor_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in } ; FUNC-LABEL: {{^}}xor_i1: -; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], PS}} +; EG: XOR_INT {{\** *}}{{T[0-9]+\.[XYZW]}}, {{PS|PV\.[XYZW]}}, {{PS|PV\.[XYZW]}} ; SI-DAG: v_cmp_le_f32_e32 [[CMP0:vcc]], 0, {{v[0-9]+}} ; SI-DAG: v_cmp_le_f32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], 1.0, {{v[0-9]+}} diff --git a/test/CodeGen/ARM/arm-interleaved-accesses.ll b/test/CodeGen/ARM/arm-interleaved-accesses.ll index c3aa2d6b4da2..002e71f6d9b8 100644 --- a/test/CodeGen/ARM/arm-interleaved-accesses.ll +++ b/test/CodeGen/ARM/arm-interleaved-accesses.ll @@ -264,3 +264,43 @@ define void @store_address_space(<2 x i32>* %A, <2 x i32>* %B, <4 x i32> addrspa store <4 x i32> %interleaved, <4 x i32> addrspace(1)* %C ret void } + +; Check that we do something sane with illegal types. + +; NEON-LABEL: load_illegal_factor2: +; NEON: BB#0: +; NEON-NEXT: vld1.64 {d16, d17}, [r0:128] +; NEON-NEXT: vuzp.32 q8, {{.*}} +; NEON-NEXT: vmov r0, r1, d16 +; NEON-NEXT: vmov r2, r3, {{.*}} +; NEON-NEXT: mov pc, lr +; NONEON-LABEL: load_illegal_factor2: +; NONEON: BB#0: +; NONEON-NEXT: ldr [[ELT0:r[0-9]+]], [r0] +; NONEON-NEXT: ldr r1, [r0, #8] +; NONEON-NEXT: mov r0, [[ELT0]] +; NONEON-NEXT: mov pc, lr +define <3 x float> @load_illegal_factor2(<3 x float>* %p) nounwind { + %tmp1 = load <3 x float>, <3 x float>* %p, align 16 + %tmp2 = shufflevector <3 x float> %tmp1, <3 x float> undef, <3 x i32> + ret <3 x float> %tmp2 +} + +; This lowering isn't great, but it's at least correct. 
+ +; NEON-LABEL: store_illegal_factor2: +; NEON: BB#0: +; NEON-NEXT: vldr d17, [sp] +; NEON-NEXT: vmov d16, r2, r3 +; NEON-NEXT: vuzp.32 q8, {{.*}} +; NEON-NEXT: vstr d16, [r0] +; NEON-NEXT: mov pc, lr +; NONEON-LABEL: store_illegal_factor2: +; NONEON: BB#0: +; NONEON-NEXT: stm r0, {r1, r3} +; NONEON-NEXT: mov pc, lr +define void @store_illegal_factor2(<3 x float>* %p, <3 x float> %v) nounwind { + %tmp1 = shufflevector <3 x float> %v, <3 x float> undef, <3 x i32> + store <3 x float> %tmp1, <3 x float>* %p, align 16 + ret void +} diff --git a/test/CodeGen/ARM/build-attributes-optimization-minsize.ll b/test/CodeGen/ARM/build-attributes-optimization-minsize.ll new file mode 100644 index 000000000000..4cfb6012f439 --- /dev/null +++ b/test/CodeGen/ARM/build-attributes-optimization-minsize.ll @@ -0,0 +1,18 @@ +; RUN: llc < %s -mtriple=arm-none-none-eabi -mcpu=cortex-a7 -O0 | FileCheck %s +; RUN: llc < %s -mtriple=arm-none-none-eabi -mcpu=cortex-a7 -O1 | FileCheck %s +; RUN: llc < %s -mtriple=arm-none-none-eabi -mcpu=cortex-a7 -O3 | FileCheck %s + +; RUN: llc < %s -mtriple=arm-none-none-eabi -mcpu=cortex-a7 -O0 -filetype obj -o - | llvm-readobj -arm-attributes - | FileCheck %s --check-prefix=CHECK-OBJ +; RUN: llc < %s -mtriple=arm-none-none-eabi -mcpu=cortex-a7 -O1 -filetype obj -o - | llvm-readobj -arm-attributes - | FileCheck %s --check-prefix=CHECK-OBJ +; RUN: llc < %s -mtriple=arm-none-none-eabi -mcpu=cortex-a7 -O3 -filetype obj -o - | llvm-readobj -arm-attributes - | FileCheck %s --check-prefix=CHECK-OBJ + +; CHECK: .eabi_attribute 30, 4 @ Tag_ABI_optimization_goals +; CHECK-OBJ: TagName: ABI_optimization_goals +; CHECK-OBJ-NEXT: Description: Aggressive Size + +define i32 @f(i64 %z) #0 { + ret i32 0 +} + +attributes #0 = { minsize optsize } + diff --git a/test/CodeGen/ARM/build-attributes-optimization-mixed.ll b/test/CodeGen/ARM/build-attributes-optimization-mixed.ll new file mode 100644 index 000000000000..8009fc6e28f8 --- /dev/null +++ b/test/CodeGen/ARM/build-attributes-optimization-mixed.ll @@ -0,0 +1,23 @@ +; RUN: llc < %s -mtriple=arm-none-none-eabi -mcpu=cortex-a7 -O0 | FileCheck %s +; RUN: llc < %s -mtriple=arm-none-none-eabi -mcpu=cortex-a7 -O1 | FileCheck %s +; RUN: llc < %s -mtriple=arm-none-none-eabi -mcpu=cortex-a7 -O3 | FileCheck %s + +; RUN: llc < %s -mtriple=arm-none-none-eabi -mcpu=cortex-a7 -O0 -filetype obj -o - | llvm-readobj -arm-attributes - | FileCheck %s +; RUN: llc < %s -mtriple=arm-none-none-eabi -mcpu=cortex-a7 -O1 -filetype obj -o - | llvm-readobj -arm-attributes - | FileCheck %s +; RUN: llc < %s -mtriple=arm-none-none-eabi -mcpu=cortex-a7 -O3 -filetype obj -o - | llvm-readobj -arm-attributes - | FileCheck %s + +; CHECK-NOT: .eabi_attribute 30 +; CHECK-NOT: Tag_ABI_optimization_goals + +define i32 @f(i64 %z) #0 { + ret i32 0 +} + +define i32 @g(i64 %z) #1 { + ret i32 1 +} + +attributes #0 = { noinline optnone } + +attributes #1 = { minsize optsize } + diff --git a/test/CodeGen/ARM/build-attributes-optimization-optnone.ll b/test/CodeGen/ARM/build-attributes-optimization-optnone.ll new file mode 100644 index 000000000000..cbdb915045c6 --- /dev/null +++ b/test/CodeGen/ARM/build-attributes-optimization-optnone.ll @@ -0,0 +1,18 @@ +; RUN: llc < %s -mtriple=arm-none-none-eabi -mcpu=cortex-a7 -O0 | FileCheck %s +; RUN: llc < %s -mtriple=arm-none-none-eabi -mcpu=cortex-a7 -O1 | FileCheck %s +; RUN: llc < %s -mtriple=arm-none-none-eabi -mcpu=cortex-a7 -O3 | FileCheck %s + +; RUN: llc < %s -mtriple=arm-none-none-eabi -mcpu=cortex-a7 -O0 -filetype obj -o - | llvm-readobj 
-arm-attributes - | FileCheck %s --check-prefix=CHECK-OBJ +; RUN: llc < %s -mtriple=arm-none-none-eabi -mcpu=cortex-a7 -O1 -filetype obj -o - | llvm-readobj -arm-attributes - | FileCheck %s --check-prefix=CHECK-OBJ +; RUN: llc < %s -mtriple=arm-none-none-eabi -mcpu=cortex-a7 -O3 -filetype obj -o - | llvm-readobj -arm-attributes - | FileCheck %s --check-prefix=CHECK-OBJ + +; CHECK: .eabi_attribute 30, 6 @ Tag_ABI_optimization_goals +; CHECK-OBJ: TagName: ABI_optimization_goals +; CHECK-OBJ-NEXT: Description: Best Debugging + +define i32 @f(i64 %z) #0 { + ret i32 0 +} + +attributes #0 = { noinline optnone } + diff --git a/test/CodeGen/ARM/build-attributes-optimization-optsize.ll b/test/CodeGen/ARM/build-attributes-optimization-optsize.ll new file mode 100644 index 000000000000..bab210aa8d01 --- /dev/null +++ b/test/CodeGen/ARM/build-attributes-optimization-optsize.ll @@ -0,0 +1,18 @@ +; RUN: llc < %s -mtriple=arm-none-none-eabi -mcpu=cortex-a7 -O0 | FileCheck %s +; RUN: llc < %s -mtriple=arm-none-none-eabi -mcpu=cortex-a7 -O1 | FileCheck %s +; RUN: llc < %s -mtriple=arm-none-none-eabi -mcpu=cortex-a7 -O3 | FileCheck %s + +; RUN: llc < %s -mtriple=arm-none-none-eabi -mcpu=cortex-a7 -O0 -filetype obj -o - | llvm-readobj -arm-attributes - | FileCheck %s --check-prefix=CHECK-OBJ +; RUN: llc < %s -mtriple=arm-none-none-eabi -mcpu=cortex-a7 -O1 -filetype obj -o - | llvm-readobj -arm-attributes - | FileCheck %s --check-prefix=CHECK-OBJ +; RUN: llc < %s -mtriple=arm-none-none-eabi -mcpu=cortex-a7 -O3 -filetype obj -o - | llvm-readobj -arm-attributes - | FileCheck %s --check-prefix=CHECK-OBJ + +; CHECK: .eabi_attribute 30, 3 @ Tag_ABI_optimization_goals +; CHECK-OBJ: TagName: ABI_optimization_goals +; CHECK-OBJ-NEXT: Description: Size + +define i32 @f(i64 %z) #0 { + ret i32 0 +} + +attributes #0 = { optsize } + diff --git a/test/CodeGen/ARM/build-attributes-optimization.ll b/test/CodeGen/ARM/build-attributes-optimization.ll new file mode 100644 index 000000000000..21b7b3c3ab0c --- /dev/null +++ b/test/CodeGen/ARM/build-attributes-optimization.ll @@ -0,0 +1,23 @@ +; RUN: llc < %s -mtriple=arm-none-none-eabi -mcpu=cortex-a7 -O0 | FileCheck %s --check-prefix=NONE +; RUN: llc < %s -mtriple=arm-none-none-eabi -mcpu=cortex-a7 -O1 | FileCheck %s --check-prefix=SPEED +; RUN: llc < %s -mtriple=arm-none-none-eabi -mcpu=cortex-a7 -O3 | FileCheck %s --check-prefix=MAXSPEED + +; RUN: llc < %s -mtriple=arm-none-none-eabi -mcpu=cortex-a7 -O0 -filetype obj -o - | llvm-readobj -arm-attributes - | FileCheck %s --check-prefix=NONE-OBJ +; RUN: llc < %s -mtriple=arm-none-none-eabi -mcpu=cortex-a7 -O1 -filetype obj -o - | llvm-readobj -arm-attributes - | FileCheck %s --check-prefix=SPEED-OBJ +; RUN: llc < %s -mtriple=arm-none-none-eabi -mcpu=cortex-a7 -O3 -filetype obj -o - | llvm-readobj -arm-attributes - | FileCheck %s --check-prefix=MAXSPEED-OBJ + +; NONE: .eabi_attribute 30, 5 @ Tag_ABI_optimization_goals +; SPEED: .eabi_attribute 30, 1 @ Tag_ABI_optimization_goals +; MAXSPEED: .eabi_attribute 30, 2 @ Tag_ABI_optimization_goals + +; NONE-OBJ: TagName: ABI_optimization_goals +; NONE-OBJ-NEXT: Description: Debugging +; SPEED-OBJ: TagName: ABI_optimization_goals +; SPEED-OBJ-NEXT: Description: Speed +; MAXSPEED-OBJ: TagName: ABI_optimization_goals +; MAXSPEED-OBJ-NEXT: Description: Aggressive Speed + +define i32 @f(i64 %z) { + ret i32 0 +} + diff --git a/test/CodeGen/ARM/build-attributes.ll b/test/CodeGen/ARM/build-attributes.ll index b80191d76012..bf502b3ae077 100644 --- a/test/CodeGen/ARM/build-attributes.ll +++ 
b/test/CodeGen/ARM/build-attributes.ll @@ -1105,7 +1105,7 @@ ; CORTEX-R7: .eabi_attribute 25, 1 ; CORTEX-R7: .eabi_attribute 27, 1 ; CORTEX-R7-NOT: .eabi_attribute 28 -; CORTEX-R7-NOT: .eabi_attribute 36 +; CORTEX-R7: .eabi_attribute 36, 1 ; CORTEX-R7: .eabi_attribute 38, 1 ; CORTEX-R7: .eabi_attribute 42, 1 ; CORTEX-R7: .eabi_attribute 44, 2 diff --git a/test/CodeGen/ARM/combine-vmovdrr.ll b/test/CodeGen/ARM/combine-vmovdrr.ll new file mode 100644 index 000000000000..358f7e3a983e --- /dev/null +++ b/test/CodeGen/ARM/combine-vmovdrr.ll @@ -0,0 +1,72 @@ +; RUN: llc %s -o - | FileCheck %s + +target triple = "thumbv7s-apple-ios" + +declare <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> %shuffle.i.i307, <8 x i8> %shuffle.i27.i308, <8 x i8> %vtbl2.i25.i) + +; Check that we get the motivating example: +; The bitcasts force the values to go through the GPRs, whereas +; they are defined on VPRs and used on VPRs. +; +; CHECK-LABEL: motivatingExample: +; CHECK: vldr [[ARG2_VAL:d[0-9]+]], [r1] +; CHECK-NEXT: vld1.32 {[[ARG1_VALlo:d[0-9]+]], [[ARG1_VALhi:d[0-9]+]]}, [r0] +; CHECK-NEXT: vtbl.8 [[RES:d[0-9]+]], {[[ARG1_VALlo]], [[ARG1_VALhi]]}, [[ARG2_VAL]] +; CHECK-NEXT: vstr [[RES]], [r1] +; CHECK-NEXT: bx lr +define void @motivatingExample(<2 x i64>* %addr, <8 x i8>* %addr2) { + %shuffle.i.bc.i309 = load <2 x i64>, <2 x i64>* %addr + %vtbl2.i25.i = load <8 x i8>, <8 x i8>* %addr2 + %shuffle.i.extract.i310 = extractelement <2 x i64> %shuffle.i.bc.i309, i32 0 + %shuffle.i27.extract.i311 = extractelement <2 x i64> %shuffle.i.bc.i309, i32 1 + %tmp45 = bitcast i64 %shuffle.i.extract.i310 to <8 x i8> + %tmp46 = bitcast i64 %shuffle.i27.extract.i311 to <8 x i8> + %vtbl2.i25.i313 = tail call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> %tmp45, <8 x i8> %tmp46, <8 x i8> %vtbl2.i25.i) + store <8 x i8> %vtbl2.i25.i313, <8 x i8>* %addr2 + ret void +} + +; Check that we do not perform the transformation for dynamic index. +; CHECK-LABEL: dynamicIndex: +; CHECK-NOT: mul +; CHECK: pop +define void @dynamicIndex(<2 x i64>* %addr, <8 x i8>* %addr2, i32 %index) { + %shuffle.i.bc.i309 = load <2 x i64>, <2 x i64>* %addr + %vtbl2.i25.i = load <8 x i8>, <8 x i8>* %addr2 + %shuffle.i.extract.i310 = extractelement <2 x i64> %shuffle.i.bc.i309, i32 %index + %shuffle.i27.extract.i311 = extractelement <2 x i64> %shuffle.i.bc.i309, i32 1 + %tmp45 = bitcast i64 %shuffle.i.extract.i310 to <8 x i8> + %tmp46 = bitcast i64 %shuffle.i27.extract.i311 to <8 x i8> + %vtbl2.i25.i313 = tail call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> %tmp45, <8 x i8> %tmp46, <8 x i8> %vtbl2.i25.i) + store <8 x i8> %vtbl2.i25.i313, <8 x i8>* %addr2 + ret void +} + +; Check that we do not perform the transformation when there are several uses +; of the result of the bitcast. +; CHECK-LABEL: severalUses: +; ARG1_VALlo is hard coded because we need to access the high part of d0, +; i.e., s1, and we can't express that with filecheck. +; CHECK: vld1.32 {[[ARG1_VALlo:d0]], [[ARG1_VALhi:d[0-9]+]]}, [r0] +; CHECK-NEXT: vldr [[ARG2_VAL:d[0-9]+]], [r1] +; s1 is actually 2 * ARG1_VALlo + 1, but we cannot express that with filecheck. +; CHECK-NEXT: vmov [[REThi:r[0-9]+]], s1 +; We build the return value here. s0 is 2 * ARG1_VALlo. +; CHECK-NEXT: vmov r0, s0 +; This copy is correct but actually useless. We should be able to clean it up. 
+; CHECK-NEXT: vmov [[ARG1_VALloCPY:d[0-9]+]], r0, [[REThi]] +; CHECK-NEXT: vtbl.8 [[RES:d[0-9]+]], {[[ARG1_VALloCPY]], [[ARG1_VALhi]]}, [[ARG2_VAL]] +; CHECK-NEXT: vstr [[RES]], [r1] +; CHECK-NEXT: mov r1, [[REThi]] +; CHECK-NEXT: bx lr +define i64 @severalUses(<2 x i64>* %addr, <8 x i8>* %addr2) { + %shuffle.i.bc.i309 = load <2 x i64>, <2 x i64>* %addr + %vtbl2.i25.i = load <8 x i8>, <8 x i8>* %addr2 + %shuffle.i.extract.i310 = extractelement <2 x i64> %shuffle.i.bc.i309, i32 0 + %shuffle.i27.extract.i311 = extractelement <2 x i64> %shuffle.i.bc.i309, i32 1 + %tmp45 = bitcast i64 %shuffle.i.extract.i310 to <8 x i8> + %tmp46 = bitcast i64 %shuffle.i27.extract.i311 to <8 x i8> + %vtbl2.i25.i313 = tail call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> %tmp45, <8 x i8> %tmp46, <8 x i8> %vtbl2.i25.i) + store <8 x i8> %vtbl2.i25.i313, <8 x i8>* %addr2 + ret i64 %shuffle.i.extract.i310 +} diff --git a/test/CodeGen/ARM/constants.ll b/test/CodeGen/ARM/constants.ll index 3baa103e3d5d..75a90bbf0caa 100644 --- a/test/CodeGen/ARM/constants.ll +++ b/test/CodeGen/ARM/constants.ll @@ -63,7 +63,7 @@ define i32 @f8() nounwind { float 3.000000e+00> }, align 16 ; CHECK: const1 ; CHECK: .zero 16 -; CHECK: float 1.0 -; CHECK: float 2.0 -; CHECK: float 3.0 +; CHECK: float 1 +; CHECK: float 2 +; CHECK: float 3 ; CHECK: .zero 4 diff --git a/test/CodeGen/ARM/debug-frame.ll b/test/CodeGen/ARM/debug-frame.ll index 33d389698995..4bd401b68496 100644 --- a/test/CodeGen/ARM/debug-frame.ll +++ b/test/CodeGen/ARM/debug-frame.ll @@ -30,11 +30,11 @@ ; RUN: -filetype=asm -o - %s \ ; RUN: | FileCheck %s --check-prefix=CHECK-V7-FP-ELIM -; RUN: llc -mtriple thumb-unknown-linux-gnueabi \ +; RUN: llc -mtriple thumbv5-unknown-linux-gnueabi \ ; RUN: -disable-fp-elim -filetype=asm -o - %s \ ; RUN: | FileCheck %s --check-prefix=CHECK-THUMB-FP -; RUN: llc -mtriple thumb-unknown-linux-gnueabi \ +; RUN: llc -mtriple thumbv5-unknown-linux-gnueabi \ ; RUN: -filetype=asm -o - %s \ ; RUN: | FileCheck %s --check-prefix=CHECK-THUMB-FP-ELIM diff --git a/test/CodeGen/ARM/shifter_operand.ll b/test/CodeGen/ARM/shifter_operand.ll index 11a8ee6ef7de..5d44eb0f11d1 100644 --- a/test/CodeGen/ARM/shifter_operand.ll +++ b/test/CodeGen/ARM/shifter_operand.ll @@ -224,3 +224,18 @@ entry: %conv = zext i8 %0 to i32 ret i32 %conv } + + +define void @test_well_formed_dag(i32 %in1, i32 %in2, i32* %addr) { +; CHECK-LABEL: test_well_formed_dag: +; CHECK-ARM: movw [[SMALL_CONST:r[0-9]+]], #675 +; CHECK-ARM: mul [[SMALL_PROD:r[0-9]+]], r0, [[SMALL_CONST]] +; CHECK-ARM: add {{r[0-9]+}}, r1, [[SMALL_PROD]], lsl #7 + + %mul.small = mul i32 %in1, 675 + store i32 %mul.small, i32* %addr + %mul.big = mul i32 %in1, 86400 + %add = add i32 %in2, %mul.big + store i32 %add, i32* %addr + ret void +} diff --git a/test/CodeGen/Hexagon/NVJumpCmp.ll b/test/CodeGen/Hexagon/NVJumpCmp.ll new file mode 100644 index 000000000000..6b160d962ebb --- /dev/null +++ b/test/CodeGen/Hexagon/NVJumpCmp.ll @@ -0,0 +1,89 @@ +; RUN: llc -march=hexagon -O2 -mcpu=hexagonv60 < %s | FileCheck %s + +; Look for an instruction, we really just do not want to see an abort. 
+; CHECK: trace_event +; REQUIRES: asserts + +target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-v64:64:64-v32:32:32-a:0-n16:32" +target triple = "hexagon-unknown--elf" + +; Function Attrs: nounwind +define void @_ZN6Halide7Runtime8Internal13default_traceEPvPK18halide_trace_event() #0 { +entry: + br i1 undef, label %if.then, label %if.else + +if.then: ; preds = %entry + br label %while.cond + +while.cond: ; preds = %while.cond, %if.then + br i1 undef, label %while.cond, label %while.end + +while.end: ; preds = %while.cond + %add = add i32 undef, 48 + br i1 undef, label %if.end, label %if.then17 + +if.then17: ; preds = %while.end + unreachable + +if.end: ; preds = %while.end + %arrayidx21 = getelementptr inbounds [4096 x i8], [4096 x i8]* undef, i32 0, i32 8 + store i8 undef, i8* %arrayidx21, align 4, !tbaa !1 + br i1 undef, label %for.body42.preheader6, label %min.iters.checked + +for.body42.preheader6: ; preds = %vector.body.preheader, %min.iters.checked, %if.end + unreachable + +min.iters.checked: ; preds = %if.end + br i1 undef, label %for.body42.preheader6, label %vector.body.preheader + +vector.body.preheader: ; preds = %min.iters.checked + br i1 undef, label %for.cond48.preheader, label %for.body42.preheader6 + +for.cond48.preheader: ; preds = %vector.body.preheader + br i1 undef, label %while.cond.i, label %for.body61.lr.ph + +for.body61.lr.ph: ; preds = %for.cond48.preheader + br i1 undef, label %for.body61, label %min.iters.checked595 + +min.iters.checked595: ; preds = %for.body61.lr.ph + br i1 undef, label %for.body61, label %vector.memcheck608 + +vector.memcheck608: ; preds = %min.iters.checked595 + %scevgep600 = getelementptr [4096 x i8], [4096 x i8]* undef, i32 0, i32 %add + %bound0604 = icmp ule i8* %scevgep600, undef + %memcheck.conflict607 = and i1 undef, %bound0604 + br i1 %memcheck.conflict607, label %for.body61, label %vector.body590 + +vector.body590: ; preds = %vector.body590, %vector.memcheck608 + br i1 undef, label %middle.block591, label %vector.body590, !llvm.loop !4 + +middle.block591: ; preds = %vector.body590 + %cmp.n613 = icmp eq i32 undef, 0 + br i1 %cmp.n613, label %while.cond.i, label %for.body61 + +while.cond.i: ; preds = %for.body61, %while.cond.i, %middle.block591, %for.cond48.preheader + br i1 undef, label %_ZN6Halide7Runtime8Internal14ScopedSpinLockC2EPVi.exit, label %while.cond.i + +_ZN6Halide7Runtime8Internal14ScopedSpinLockC2EPVi.exit: ; preds = %while.cond.i + unreachable + +for.body61: ; preds = %for.body61, %middle.block591, %vector.memcheck608, %min.iters.checked595, %for.body61.lr.ph + %cmp59 = icmp ult i32 undef, undef + br i1 %cmp59, label %for.body61, label %while.cond.i, !llvm.loop !7 + +if.else: ; preds = %entry + unreachable +} + +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!llvm.module.flags = !{!0} + +!0 = !{i32 2, !"halide_mattrs", !"+hvx"} +!1 = !{!2, !2, i64 0} +!2 = !{!"omnipotent char", !3, i64 0} +!3 = !{!"Simple C/C++ TBAA"} +!4 = distinct !{!4, !5, !6} +!5 = !{!"llvm.loop.vectorize.width", i32 1} +!6 = !{!"llvm.loop.interleave.count", i32 1} +!7 = distinct !{!7, !5, !6} diff --git a/test/CodeGen/Hexagon/v60Vasr.ll b/test/CodeGen/Hexagon/v60Vasr.ll new file mode 100644 index 000000000000..fb177f614f72 --- /dev/null +++ 
b/test/CodeGen/Hexagon/v60Vasr.ll @@ -0,0 +1,247 @@ +; RUN: llc -march=hexagon -O2 -mcpu=hexagonv60 < %s | FileCheck %s + +; CHECK: vasr(v{{[0-9]+}}.h,v{{[0-9]+}}.h,r{{[0-7]+}}):sat + +target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-v64:64:64-v32:32:32-a:0-n16:32" +target triple = "hexagon-unknown--elf" + +%struct.buffer_t = type { i64, i8*, [4 x i32], [4 x i32], [4 x i32], i32, i8, i8, [6 x i8] } + +; Function Attrs: norecurse nounwind +define i32 @__test_vasr(%struct.buffer_t* noalias nocapture %f.buffer, %struct.buffer_t* noalias nocapture %g.buffer, %struct.buffer_t* noalias nocapture %res.buffer) #0 { +entry: + %buf_host = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %f.buffer, i32 0, i32 1 + %f.host = load i8*, i8** %buf_host, align 4 + %buf_dev = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %f.buffer, i32 0, i32 0 + %f.dev = load i64, i64* %buf_dev, align 8 + %0 = icmp eq i8* %f.host, null + %1 = icmp eq i64 %f.dev, 0 + %f.host_and_dev_are_null = and i1 %0, %1 + %buf_min = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %f.buffer, i32 0, i32 4, i32 0 + %f.min.0 = load i32, i32* %buf_min, align 4 + %buf_host10 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %g.buffer, i32 0, i32 1 + %g.host = load i8*, i8** %buf_host10, align 4 + %buf_dev11 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %g.buffer, i32 0, i32 0 + %g.dev = load i64, i64* %buf_dev11, align 8 + %2 = icmp eq i8* %g.host, null + %3 = icmp eq i64 %g.dev, 0 + %g.host_and_dev_are_null = and i1 %2, %3 + %buf_min22 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %g.buffer, i32 0, i32 4, i32 0 + %g.min.0 = load i32, i32* %buf_min22, align 4 + %buf_host27 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %res.buffer, i32 0, i32 1 + %res.host = load i8*, i8** %buf_host27, align 4 + %buf_dev28 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %res.buffer, i32 0, i32 0 + %res.dev = load i64, i64* %buf_dev28, align 8 + %4 = icmp eq i8* %res.host, null + %5 = icmp eq i64 %res.dev, 0 + %res.host_and_dev_are_null = and i1 %4, %5 + %buf_extent31 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %res.buffer, i32 0, i32 2, i32 0 + %res.extent.0 = load i32, i32* %buf_extent31, align 4 + %buf_min39 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %res.buffer, i32 0, i32 4, i32 0 + %res.min.0 = load i32, i32* %buf_min39, align 4 + %6 = add nsw i32 %res.extent.0, -1 + %7 = and i32 %6, -64 + %8 = add i32 %res.min.0, 63 + %9 = add i32 %8, %7 + %10 = add nsw i32 %res.min.0, %res.extent.0 + %11 = add nsw i32 %10, -1 + %12 = icmp slt i32 %9, %11 + %13 = select i1 %12, i32 %9, i32 %11 + %14 = add nsw i32 %10, -64 + %15 = icmp slt i32 %res.min.0, %14 + %16 = select i1 %15, i32 %res.min.0, i32 %14 + %f.extent.0.required.s = sub nsw i32 %13, %16 + br i1 %f.host_and_dev_are_null, label %true_bb, label %after_bb + +true_bb: ; preds = %entry + %buf_elem_size44 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %f.buffer, i32 0, i32 5 + store i32 1, i32* %buf_elem_size44, align 4 + store i32 %16, i32* %buf_min, align 4 + %17 = add nsw i32 %f.extent.0.required.s, 1 + %buf_extent46 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %f.buffer, i32 0, i32 2, i32 0 + store i32 %17, i32* %buf_extent46, align 4 + %buf_stride47 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %f.buffer, i32 0, i32 3, i32 0 + store i32 1, i32* %buf_stride47, align 4 + %buf_min48 = 
getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %f.buffer, i32 0, i32 4, i32 1 + store i32 0, i32* %buf_min48, align 4 + %buf_extent49 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %f.buffer, i32 0, i32 2, i32 1 + store i32 0, i32* %buf_extent49, align 4 + %buf_stride50 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %f.buffer, i32 0, i32 3, i32 1 + store i32 0, i32* %buf_stride50, align 4 + %buf_min51 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %f.buffer, i32 0, i32 4, i32 2 + store i32 0, i32* %buf_min51, align 4 + %buf_extent52 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %f.buffer, i32 0, i32 2, i32 2 + store i32 0, i32* %buf_extent52, align 4 + %buf_stride53 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %f.buffer, i32 0, i32 3, i32 2 + store i32 0, i32* %buf_stride53, align 4 + %buf_min54 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %f.buffer, i32 0, i32 4, i32 3 + store i32 0, i32* %buf_min54, align 4 + %buf_extent55 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %f.buffer, i32 0, i32 2, i32 3 + store i32 0, i32* %buf_extent55, align 4 + %buf_stride56 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %f.buffer, i32 0, i32 3, i32 3 + store i32 0, i32* %buf_stride56, align 4 + br label %after_bb + +after_bb: ; preds = %true_bb, %entry + br i1 %g.host_and_dev_are_null, label %true_bb57, label %after_bb59 + +true_bb57: ; preds = %after_bb + %buf_elem_size60 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %g.buffer, i32 0, i32 5 + store i32 1, i32* %buf_elem_size60, align 4 + store i32 %16, i32* %buf_min22, align 4 + %18 = add nsw i32 %f.extent.0.required.s, 1 + %buf_extent62 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %g.buffer, i32 0, i32 2, i32 0 + store i32 %18, i32* %buf_extent62, align 4 + %buf_stride63 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %g.buffer, i32 0, i32 3, i32 0 + store i32 1, i32* %buf_stride63, align 4 + %buf_min64 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %g.buffer, i32 0, i32 4, i32 1 + store i32 0, i32* %buf_min64, align 4 + %buf_extent65 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %g.buffer, i32 0, i32 2, i32 1 + store i32 0, i32* %buf_extent65, align 4 + %buf_stride66 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %g.buffer, i32 0, i32 3, i32 1 + store i32 0, i32* %buf_stride66, align 4 + %buf_min67 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %g.buffer, i32 0, i32 4, i32 2 + store i32 0, i32* %buf_min67, align 4 + %buf_extent68 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %g.buffer, i32 0, i32 2, i32 2 + store i32 0, i32* %buf_extent68, align 4 + %buf_stride69 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %g.buffer, i32 0, i32 3, i32 2 + store i32 0, i32* %buf_stride69, align 4 + %buf_min70 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %g.buffer, i32 0, i32 4, i32 3 + store i32 0, i32* %buf_min70, align 4 + %buf_extent71 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %g.buffer, i32 0, i32 2, i32 3 + store i32 0, i32* %buf_extent71, align 4 + %buf_stride72 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %g.buffer, i32 0, i32 3, i32 3 + store i32 0, i32* %buf_stride72, align 4 + br label %after_bb59 + +after_bb59: ; preds = %true_bb57, %after_bb + br i1 %res.host_and_dev_are_null, label %after_bb75.thread, label %after_bb75 + +after_bb75.thread: ; preds 
= %after_bb59 + %buf_elem_size76 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %res.buffer, i32 0, i32 5 + store i32 1, i32* %buf_elem_size76, align 4 + store i32 %16, i32* %buf_min39, align 4 + %19 = add nsw i32 %f.extent.0.required.s, 1 + store i32 %19, i32* %buf_extent31, align 4 + %buf_stride79 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %res.buffer, i32 0, i32 3, i32 0 + store i32 1, i32* %buf_stride79, align 4 + %buf_min80 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %res.buffer, i32 0, i32 4, i32 1 + store i32 0, i32* %buf_min80, align 4 + %buf_extent81 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %res.buffer, i32 0, i32 2, i32 1 + store i32 0, i32* %buf_extent81, align 4 + %buf_stride82 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %res.buffer, i32 0, i32 3, i32 1 + store i32 0, i32* %buf_stride82, align 4 + %buf_min83 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %res.buffer, i32 0, i32 4, i32 2 + store i32 0, i32* %buf_min83, align 4 + %buf_extent84 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %res.buffer, i32 0, i32 2, i32 2 + store i32 0, i32* %buf_extent84, align 4 + %buf_stride85 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %res.buffer, i32 0, i32 3, i32 2 + store i32 0, i32* %buf_stride85, align 4 + %buf_min86 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %res.buffer, i32 0, i32 4, i32 3 + store i32 0, i32* %buf_min86, align 4 + %buf_extent87 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %res.buffer, i32 0, i32 2, i32 3 + store i32 0, i32* %buf_extent87, align 4 + %buf_stride88 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %res.buffer, i32 0, i32 3, i32 3 + store i32 0, i32* %buf_stride88, align 4 + br label %destructor_block + +after_bb75: ; preds = %after_bb59 + %20 = or i1 %f.host_and_dev_are_null, %g.host_and_dev_are_null + br i1 %20, label %destructor_block, label %"produce res" + +"produce res": ; preds = %after_bb75 + %21 = ashr i32 %res.extent.0, 6 + %22 = icmp sgt i32 %21, 0 + br i1 %22, label %"for res.s0.x.x", label %"end for res.s0.x.x", !prof !4 + +"for res.s0.x.x": ; preds = %"for res.s0.x.x", %"produce res" + %res.s0.x.x = phi i32 [ %41, %"for res.s0.x.x" ], [ 0, %"produce res" ] + %23 = shl nsw i32 %res.s0.x.x, 6 + %24 = add nsw i32 %23, %res.min.0 + %25 = sub nsw i32 %24, %f.min.0 + %26 = getelementptr inbounds i8, i8* %f.host, i32 %25 + %27 = bitcast i8* %26 to <16 x i32>* + %28 = load <16 x i32>, <16 x i32>* %27, align 1, !tbaa !5 + %29 = tail call <32 x i32> @llvm.hexagon.V6.vzb(<16 x i32> %28) + %30 = sub nsw i32 %24, %g.min.0 + %31 = getelementptr inbounds i8, i8* %g.host, i32 %30 + %32 = bitcast i8* %31 to <16 x i32>* + %33 = load <16 x i32>, <16 x i32>* %32, align 1, !tbaa !8 + %34 = tail call <32 x i32> @llvm.hexagon.V6.vzb(<16 x i32> %33) + %35 = tail call <32 x i32> @llvm.hexagon.V6.vaddh.dv(<32 x i32> %29, <32 x i32> %34) + %36 = tail call <16 x i32> @llvm.hexagon.V6.hi(<32 x i32> %35) + %37 = tail call <16 x i32> @llvm.hexagon.V6.lo(<32 x i32> %35) + %38 = tail call <16 x i32> @llvm.hexagon.V6.vasrhubsat(<16 x i32> %36, <16 x i32> %37, i32 4) + %39 = getelementptr inbounds i8, i8* %res.host, i32 %23 + %40 = bitcast i8* %39 to <16 x i32>* + store <16 x i32> %38, <16 x i32>* %40, align 1, !tbaa !10 + %41 = add nuw nsw i32 %res.s0.x.x, 1 + %42 = icmp eq i32 %41, %21 + br i1 %42, label %"end for res.s0.x.x", label %"for res.s0.x.x" + +"end for res.s0.x.x": ; preds = %"for 
res.s0.x.x", %"produce res" + %43 = add nsw i32 %res.extent.0, 63 + %44 = ashr i32 %43, 6 + %45 = icmp sgt i32 %44, %21 + br i1 %45, label %"for res.s0.x.x92.preheader", label %destructor_block, !prof !4 + +"for res.s0.x.x92.preheader": ; preds = %"end for res.s0.x.x" + %46 = sub i32 -64, %f.min.0 + %47 = add i32 %46, %10 + %48 = getelementptr inbounds i8, i8* %f.host, i32 %47 + %49 = bitcast i8* %48 to <16 x i32>* + %50 = load <16 x i32>, <16 x i32>* %49, align 1 + %51 = tail call <32 x i32> @llvm.hexagon.V6.vzb(<16 x i32> %50) + %52 = sub i32 -64, %g.min.0 + %53 = add i32 %52, %10 + %54 = getelementptr inbounds i8, i8* %g.host, i32 %53 + %55 = bitcast i8* %54 to <16 x i32>* + %56 = load <16 x i32>, <16 x i32>* %55, align 1 + %57 = tail call <32 x i32> @llvm.hexagon.V6.vzb(<16 x i32> %56) + %58 = tail call <32 x i32> @llvm.hexagon.V6.vaddh.dv(<32 x i32> %51, <32 x i32> %57) + %59 = tail call <16 x i32> @llvm.hexagon.V6.lo(<32 x i32> %58) + %60 = add nsw i32 %res.extent.0, -64 + %61 = getelementptr inbounds i8, i8* %res.host, i32 %60 + %62 = tail call <16 x i32> @llvm.hexagon.V6.hi(<32 x i32> %58) + %63 = tail call <16 x i32> @llvm.hexagon.V6.vasrhubsat(<16 x i32> %62, <16 x i32> %59, i32 4) + %64 = bitcast i8* %61 to <16 x i32>* + store <16 x i32> %63, <16 x i32>* %64, align 1, !tbaa !10 + br label %destructor_block + +destructor_block: ; preds = %"for res.s0.x.x92.preheader", %"end for res.s0.x.x", %after_bb75, %after_bb75.thread + ret i32 0 +} + +; Function Attrs: nounwind readnone +declare <32 x i32> @llvm.hexagon.V6.vaddh.dv(<32 x i32>, <32 x i32>) #1 + +; Function Attrs: nounwind readnone +declare <32 x i32> @llvm.hexagon.V6.vzb(<16 x i32>) #1 + +; Function Attrs: nounwind readnone +declare <16 x i32> @llvm.hexagon.V6.hi(<32 x i32>) #1 + +; Function Attrs: nounwind readnone +declare <16 x i32> @llvm.hexagon.V6.lo(<32 x i32>) #1 + +; Function Attrs: nounwind readnone +declare <16 x i32> @llvm.hexagon.V6.vasrhubsat(<16 x i32>, <16 x i32>, i32) #1 + +attributes #0 = { norecurse nounwind } +attributes #1 = { nounwind readnone } + +!llvm.ident = !{!0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0} +!llvm.module.flags = !{!1, !2, !3} + +!0 = !{!"Clang $LLVM_VERSION_MAJOR.$LLVM_VERSION_MINOR (based on LLVM 3.8.0)"} +!1 = !{i32 2, !"halide_use_soft_float_abi", i32 0} +!2 = !{i32 2, !"halide_mcpu", !"hexagonv60"} +!3 = !{i32 2, !"halide_mattrs", !"+hvx"} +!4 = !{!"branch_weights", i32 1073741824, i32 0} +!5 = !{!6, !6, i64 0} +!6 = !{!"f", !7} +!7 = !{!"Halide buffer"} +!8 = !{!9, !9, i64 0} +!9 = !{!"g", !7} +!10 = !{!11, !11, i64 0} +!11 = !{!"res", !7} diff --git a/test/CodeGen/Mips/sitofp-selectcc-opt.ll b/test/CodeGen/Mips/sitofp-selectcc-opt.ll index c60fceb1a04c..751fba46d72f 100644 --- a/test/CodeGen/Mips/sitofp-selectcc-opt.ll +++ b/test/CodeGen/Mips/sitofp-selectcc-opt.ll @@ -7,7 +7,7 @@ entry: ; check that this transformation doesn't happen: ; (sint_to_fp (setcc x, y, cc)) -> (select_cc x, y, -1.0, 0.0,, cc) ; -; CHECK-NOT: # double -1.000000e+00 +; CHECK-NOT: # double -1 %tobool1 = icmp ne i32 %a, 0 %not.tobool = icmp ne i64 %b, 0 @@ -19,4 +19,3 @@ entry: store double %add, double* @foo12.d4, align 8 ret double %add } - diff --git a/test/CodeGen/PowerPC/BoolRetToIntTest.ll b/test/CodeGen/PowerPC/BoolRetToIntTest.ll new file mode 100644 index 000000000000..a7b79789b4ca --- /dev/null +++ b/test/CodeGen/PowerPC/BoolRetToIntTest.ll @@ -0,0 +1,203 @@ +; RUN: opt -bool-ret-to-int -S -o - < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-n32:64" +target triple = 
"powerpc64le-unknown-linux-gnu" + +; CHECK-LABEL: notBoolRet +define signext i32 @notBoolRet() { +entry: +; CHECK: ret i32 1 + ret i32 1 +} + +; CHECK-LABEL: find +define zeroext i1 @find(i8** readonly %begin, i8** readnone %end, i1 (i8*)* nocapture %hasProp) { +entry: + %cmp.4 = icmp eq i8** %begin, %end + br i1 %cmp.4, label %cleanup, label %for.body.preheader + +for.body.preheader: ; preds = %entry + br label %for.body + +for.cond: ; preds = %for.body + %cmp = icmp eq i8** %incdec.ptr, %end + br i1 %cmp, label %cleanup.loopexit, label %for.body + +for.body: ; preds = %for.body.preheader, %for.cond + %curr.05 = phi i8** [ %incdec.ptr, %for.cond ], [ %begin, %for.body.preheader ] + %0 = load i8*, i8** %curr.05, align 8 + %call = tail call zeroext i1 %hasProp(i8* %0) + %incdec.ptr = getelementptr inbounds i8*, i8** %curr.05, i64 1 + br i1 %call, label %cleanup.loopexit, label %for.cond + +cleanup.loopexit: ; preds = %for.body, %for.cond +; CHECK: [[PHI:%.+]] = phi i32 [ 1, %for.body ], [ 0, %for.cond ] + %cleanup.dest.slot.0.ph = phi i1 [ true, %for.body ], [ false, %for.cond ] + br label %cleanup + +cleanup: ; preds = %cleanup.loopexit, %entry +; CHECK: = phi i32 [ 0, %entry ], [ [[PHI]], %cleanup.loopexit ] + %cleanup.dest.slot.0 = phi i1 [ false, %entry ], [ %cleanup.dest.slot.0.ph, %cleanup.loopexit ] +; CHECK: [[REG:%.+]] = trunc i32 {{%.+}} to i1 +; CHECK: ret i1 [[REG]] + ret i1 %cleanup.dest.slot.0 +} + +; CHECK-LABEL: retFalse +define zeroext i1 @retFalse() { +entry: +; CHECK: ret i1 false + ret i1 false +} + +; CHECK-LABEL: retCvtFalse +define zeroext i1 @retCvtFalse() { +entry: +; CHECK: ret i1 false + ret i1 trunc(i32 0 to i1) +} + +; CHECK-LABEL: find_cont +define void @find_cont(i8** readonly %begin, i8** readnone %end, i1 (i8*)* nocapture %hasProp, void (i1)* nocapture %cont) { +entry: + %cmp.4 = icmp eq i8** %begin, %end + br i1 %cmp.4, label %cleanup, label %for.body.preheader + +for.body.preheader: ; preds = %entry + br label %for.body + +for.cond: ; preds = %for.body + %cmp = icmp eq i8** %incdec.ptr, %end + br i1 %cmp, label %cleanup.loopexit, label %for.body + +for.body: ; preds = %for.body.preheader, %for.cond + %curr.05 = phi i8** [ %incdec.ptr, %for.cond ], [ %begin, %for.body.preheader ] + %0 = load i8*, i8** %curr.05, align 8 + %call = tail call zeroext i1 %hasProp(i8* %0) + %incdec.ptr = getelementptr inbounds i8*, i8** %curr.05, i64 1 + br i1 %call, label %cleanup.loopexit, label %for.cond + +cleanup.loopexit: ; preds = %for.body, %for.cond +; CHECK: [[PHI:%.+]] = phi i32 [ 1, %for.body ], [ 0, %for.cond ] + %cleanup.dest.slot.0.ph = phi i1 [ true, %for.body ], [ false, %for.cond ] + br label %cleanup + +cleanup: ; preds = %cleanup.loopexit, %entry +; CHECK: = phi i32 [ 0, %entry ], [ [[PHI]], %cleanup.loopexit ] + %cleanup.dest.slot.0 = phi i1 [ false, %entry ], [ %cleanup.dest.slot.0.ph, %cleanup.loopexit ] +; CHECK: [[REG:%.+]] = trunc i32 {{%.+}} to i1 +; CHECK: call void %cont(i1 [[REG]] + tail call void %cont(i1 %cleanup.dest.slot.0) + ret void +} + +; CHECK-LABEL: find_cont_ret +define zeroext i1 @find_cont_ret(i8** readonly %begin, i8** readnone %end, i1 (i8*)* nocapture %hasProp, void (i1)* nocapture %cont) { +entry: + %cmp.4 = icmp eq i8** %begin, %end + br i1 %cmp.4, label %cleanup, label %for.body.preheader + +for.body.preheader: ; preds = %entry + br label %for.body + +for.cond: ; preds = %for.body + %cmp = icmp eq i8** %incdec.ptr, %end + br i1 %cmp, label %cleanup.loopexit, label %for.body + +for.body: ; preds = %for.body.preheader, %for.cond + 
%curr.05 = phi i8** [ %incdec.ptr, %for.cond ], [ %begin, %for.body.preheader ] + %0 = load i8*, i8** %curr.05, align 8 + %call = tail call zeroext i1 %hasProp(i8* %0) + %incdec.ptr = getelementptr inbounds i8*, i8** %curr.05, i64 1 + br i1 %call, label %cleanup.loopexit, label %for.cond + +cleanup.loopexit: ; preds = %for.body, %for.cond +; CHECK: [[PHI:%.+]] = phi i32 [ 1, %for.body ], [ 0, %for.cond ] + %cleanup.dest.slot.0.ph = phi i1 [ true, %for.body ], [ false, %for.cond ] + br label %cleanup + +cleanup: ; preds = %cleanup.loopexit, %entry +; CHECK: = phi i32 [ 0, %entry ], [ [[PHI]], %cleanup.loopexit ] + %cleanup.dest.slot.0 = phi i1 [ false, %entry ], [ %cleanup.dest.slot.0.ph, %cleanup.loopexit ] +; CHECK: [[REG:%.+]] = trunc i32 {{%.+}} to i1 +; CHECK: call void %cont(i1 [[REG]] + tail call void %cont(i1 %cleanup.dest.slot.0) +; CHECK: [[REG:%.+]] = trunc i32 {{%.+}} to i1 +; CHECK: ret i1 [[REG]] + ret i1 %cleanup.dest.slot.0 +} + +; CHECK-LABEL: arg_operand +define zeroext i1 @arg_operand(i1 %operand) { +entry: + br i1 %operand, label %foo, label %cleanup + +foo: + br label %cleanup + +cleanup: +; CHECK: [[REG:%.+]] = trunc i32 {{%.+}} to i1 +; CHECK: ret i1 [[REG]] + %result = phi i1 [ false, %foo ], [ %operand, %entry ] + ret i1 %result +} + +; CHECK-LABEL: bad_use +define zeroext i1 @bad_use(i1 %operand) { +entry: + br i1 %operand, label %foo, label %cleanup + +foo: + br label %cleanup + +cleanup: +; CHECK: [[REG:%.+]] = phi i1 +; CHECK: ret i1 [[REG]] + %result = phi i1 [ false, %foo], [ true, %entry ] + %0 = icmp eq i1 %result, %operand + ret i1 %result +} + +; CHECK-LABEL: bad_use_closure +define zeroext i1 @bad_use_closure(i1 %operand) { +entry: + br i1 %operand, label %foo, label %cleanup + +foo: + %bar = phi i1 [ false, %entry ] + %0 = icmp eq i1 %bar, %operand + br label %cleanup + +cleanup: +; CHECK: [[REG:%.+]] = phi i1 [ true +; CHECK: ret i1 [[REG]] + %result = phi i1 [ true, %entry ], [ %bar, %foo] + ret i1 %result +} + +; CHECK-LABEL: arg_test +define zeroext i1 @arg_test(i1 %operand) { +entry: + br i1 %operand, label %foo, label %cleanup + +foo: + %bar = phi i1 [ false, %entry ] + br label %cleanup + +; CHECK-LABEL: cleanup +cleanup: +; CHECK: [[REG:%.+]] = trunc i32 {{%.+}} to i1 +; CHECK: ret i1 [[REG]] + %result = phi i1 [ %bar, %foo], [ %operand, %entry ] + ret i1 %result +} + +declare zeroext i1 @return_i1() + +; CHECK-LABEL: call_test +define zeroext i1 @call_test() { +; CHECK: [[REG:%.+]] = call i1 + %result = call i1 @return_i1() +; CHECK: ret i1 [[REG]] + ret i1 %result +} \ No newline at end of file diff --git a/test/CodeGen/PowerPC/fma-mutate-register-constraint.ll b/test/CodeGen/PowerPC/fma-mutate-register-constraint.ll new file mode 100644 index 000000000000..fd2ba4ec635e --- /dev/null +++ b/test/CodeGen/PowerPC/fma-mutate-register-constraint.ll @@ -0,0 +1,89 @@ +; RUN: llc -enable-unsafe-fp-math < %s | FileCheck %s +target datalayout = "e-m:e-i64:64-n32:64" +target triple = "powerpc64le-unknown-linux-gnu" + +; CHECK-NOT: {{vmrg[hl]w.*(3[23456789]|[456][0-9])}} +define void @__f0() { +entry: + %0 = shufflevector <8 x float> zeroinitializer, <8 x float> undef, <16 x i32> + %1 = shufflevector <16 x float> %0, <16 x float> undef, <32 x i32> + %2 = shufflevector <8 x float> zeroinitializer, <8 x float> undef, <16 x i32> + %3 = shufflevector <16 x float> %2, <16 x float> undef, <32 x i32> + %4 = fmul <32 x float> %1, %3 + %5 = load <4 x float>, <4 x float>* undef, align 128 + %6 = load <4 x float>, <4 x float>* undef, align 128 + %7 = shufflevector <4 
x float> undef, <4 x float> %5, <8 x i32> + %8 = shufflevector <4 x float> undef, <4 x float> %6, <8 x i32> + %9 = shufflevector <8 x float> %7, <8 x float> %8, <16 x i32> + %10 = shufflevector <16 x float> undef, <16 x float> %9, <32 x i32> + %11 = load <4 x float>, <4 x float>* null, align 128 + %12 = load <4 x float>, <4 x float>* undef, align 128 + %13 = shufflevector <4 x float> undef, <4 x float> %11, <8 x i32> + %14 = shufflevector <4 x float> undef, <4 x float> %12, <8 x i32> + %15 = shufflevector <8 x float> %13, <8 x float> %14, <16 x i32> + %16 = shufflevector <16 x float> undef, <16 x float> %15, <32 x i32> + %17 = fmul <32 x float> %10, %16 + %18 = fsub <32 x float> %4, %17 + %19 = shufflevector <32 x float> %18, <32 x float> undef, <64 x i32> + %20 = bitcast <64 x float> %19 to <32 x double> + %21 = shufflevector <32 x double> undef, <32 x double> %20, <64 x i32> + %22 = bitcast <64 x double> %21 to <128 x float> + %23 = shufflevector <128 x float> undef, <128 x float> %22, <256 x i32> + %24 = shufflevector <256 x float> undef, <256 x float> %23, <512 x i32> + %25 = shufflevector <512 x float> %24, <512 x float> undef, <1024 x i32> + %26 = shufflevector <1024 x float> %25, <1024 x float> undef, <4 x i32> + store <4 x float> %26, <4 x float>* undef, align 128 + %27 = shufflevector <1024 x float> %25, <1024 x float> undef, <4 x i32> + store <4 x float> %27, <4 x float>* undef, align 128 + %28 = shufflevector <1024 x float> %25, <1024 x float> undef, <4 x i32> + store <4 x float> %28, <4 x float>* undef, align 128 + %29 = shufflevector <1024 x float> %25, <1024 x float> undef, <4 x i32> + store <4 x float> %29, <4 x float>* undef, align 128 + %30 = shufflevector <1024 x float> %25, <1024 x float> undef, <4 x i32> + store <4 x float> %30, <4 x float>* undef, align 128 + %31 = shufflevector <1024 x float> %25, <1024 x float> undef, <4 x i32> + store <4 x float> %31, <4 x float>* undef, align 128 + %32 = shufflevector <1024 x float> %25, <1024 x float> undef, <4 x i32> + store <4 x float> %32, <4 x float>* undef, align 128 + %33 = shufflevector <1024 x float> %25, <1024 x float> undef, <4 x i32> + store <4 x float> %33, <4 x float>* undef, align 128 + %34 = shufflevector <1024 x float> %25, <1024 x float> undef, <4 x i32> + store <4 x float> %34, <4 x float>* undef, align 128 + %35 = shufflevector <1024 x float> %25, <1024 x float> undef, <4 x i32> + store <4 x float> %35, <4 x float>* undef, align 128 + %36 = shufflevector <1024 x float> %25, <1024 x float> undef, <4 x i32> + store <4 x float> %36, <4 x float>* undef, align 128 + %37 = shufflevector <1024 x float> %25, <1024 x float> undef, <4 x i32> + store <4 x float> %37, <4 x float>* undef, align 128 + %38 = shufflevector <1024 x float> %25, <1024 x float> undef, <4 x i32> + store <4 x float> %38, <4 x float>* undef, align 128 + %39 = shufflevector <1024 x float> %25, <1024 x float> undef, <4 x i32> + store <4 x float> %39, <4 x float>* undef, align 128 + %40 = shufflevector <1024 x float> %25, <1024 x float> undef, <4 x i32> + store <4 x float> %40, <4 x float>* undef, align 128 + %41 = shufflevector <1024 x float> %25, <1024 x float> undef, <4 x i32> + store <4 x float> %41, <4 x float>* undef, align 128 + %42 = shufflevector <1024 x float> %25, <1024 x float> undef, <4 x i32> + store <4 x float> %42, <4 x float>* undef, align 128 + %43 = shufflevector <1024 x float> %25, <1024 x float> undef, <4 x i32> + store <4 x float> %43, <4 x float>* undef, align 128 + %44 = shufflevector <1024 x float> %25, <1024 x float> undef, <4 x 
i32> + store <4 x float> %44, <4 x float>* undef, align 128 + %45 = shufflevector <1024 x float> %25, <1024 x float> undef, <4 x i32> + store <4 x float> %45, <4 x float>* undef, align 128 + %46 = shufflevector <1024 x float> %25, <1024 x float> undef, <4 x i32> + store <4 x float> %46, <4 x float>* undef, align 128 + %47 = shufflevector <1024 x float> undef, <1024 x float> undef, <4 x i32> + store <4 x float> %47, <4 x float>* undef, align 128 + %48 = shufflevector <1024 x float> undef, <1024 x float> undef, <4 x i32> + store <4 x float> %48, <4 x float>* undef, align 128 + %49 = shufflevector <1024 x float> undef, <1024 x float> undef, <4 x i32> + store <4 x float> %49, <4 x float>* undef, align 128 + %50 = shufflevector <1024 x float> undef, <1024 x float> undef, <4 x i32> + store <4 x float> %50, <4 x float>* undef, align 128 + %51 = shufflevector <1024 x float> undef, <1024 x float> undef, <4 x i32> + store <4 x float> %51, <4 x float>* undef, align 128 + %52 = shufflevector <1024 x float> undef, <1024 x float> undef, <4 x i32> + store <4 x float> %52, <4 x float>* undef, align 128 + ret void +} diff --git a/test/CodeGen/PowerPC/fp128-bitcast-after-operation.ll b/test/CodeGen/PowerPC/fp128-bitcast-after-operation.ll new file mode 100644 index 000000000000..5f3c9278f8d5 --- /dev/null +++ b/test/CodeGen/PowerPC/fp128-bitcast-after-operation.ll @@ -0,0 +1,103 @@ +; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 < %s | FileCheck %s -check-prefix=PPC64 +; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr7 < %s | FileCheck %s -check-prefix=PPC64 +; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr8 < %s | FileCheck %s -check-prefix=PPC64 +; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 < %s | FileCheck %s -check-prefix=PPC64 +; RUN: llc -mtriple=powerpc-unknown-linux-gnu < %s | FileCheck %s -check-prefix=PPC32 + +define i128 @test_abs(ppc_fp128 %x) nounwind { +entry: +; PPC64-LABEL: test_abs: +; PPC64-DAG: stxsdx 2, 0, [[ADDR_HI:[0-9]+]] +; PPC64-DAG: stxsdx 1, 0, [[ADDR_LO:[0-9]+]] +; PPC64-DAG: addi [[ADDR_HI]], [[SP:[0-9]+]], [[OFFSET_HI:-?[0-9]+]] +; PPC64-DAG: addi [[ADDR_LO]], [[SP]], [[OFFSET_LO:-?[0-9]+]] +; PPC64-DAG: li [[MASK_REG:[0-9]+]], 1 +; PPC64: sldi [[MASK_REG]], [[MASK_REG]], 63 +; PPC64-DAG: ld [[HI:[0-9]+]], [[OFFSET_LO]]([[SP]]) +; PPC64-DAG: ld [[LO:[0-9]+]], [[OFFSET_HI]]([[SP]]) +; PPC64: and [[FLIP_BIT:[0-9]+]], [[HI]], [[MASK_REG]] +; PPC64-DAG: xor 3, [[HI]], [[FLIP_BIT]] +; PPC64-DAG: xor 4, [[LO]], [[FLIP_BIT]] +; PPC64: blr + +; PPC32-DAG: stfd 1, 24(1) +; PPC32-DAG: stfd 2, 16(1) +; PPC32: nop +; PPC32-DAG: lwz [[HI0:[0-9]+]], 24(1) +; PPC32-DAG: lwz [[LO0:[0-9]+]], 16(1) +; PPC32-DAG: lwz [[HI1:[0-9]+]], 28(1) +; PPC32-DAG: lwz [[LO1:[0-9]+]], 20(1) +; PPC32: rlwinm [[FLIP_BIT:[0-9]+]], [[HI0]], 0, 0, 0 +; PPC32-DAG: xor [[HI0]], [[HI0]], [[FLIP_BIT]] +; PPC32-DAG: xor [[LO0]], [[LO0]], [[FLIP_BIT]] +; PPC32: blr + %0 = tail call ppc_fp128 @llvm.fabs.ppcf128(ppc_fp128 %x) + %1 = bitcast ppc_fp128 %0 to i128 + ret i128 %1 +} + +define i128 @test_neg(ppc_fp128 %x) nounwind { +entry: +; PPC64-LABEL: test_neg: +; PPC64-DAG: stxsdx 2, 0, [[ADDR_HI:[0-9]+]] +; PPC64-DAG: stxsdx 1, 0, [[ADDR_LO:[0-9]+]] +; PPC64-DAG: addi [[ADDR_HI]], [[SP:[0-9]+]], [[OFFSET_HI:-?[0-9]+]] +; PPC64-DAG: addi [[ADDR_LO]], [[SP]], [[OFFSET_LO:-?[0-9]+]] +; PPC64-DAG: li [[FLIP_BIT:[0-9]+]], 1 +; PPC64-DAG: sldi [[FLIP_BIT]], [[FLIP_BIT]], 63 +; PPC64-DAG: ld [[HI:[0-9]+]], [[OFFSET_LO]]([[SP]]) +; PPC64-DAG: ld [[LO:[0-9]+]], [[OFFSET_HI]]([[SP]]) 
+; PPC64-NOT: BARRIER +; PPC64-DAG: xor 3, [[HI]], [[FLIP_BIT]] +; PPC64-DAG: xor 4, [[LO]], [[FLIP_BIT]] +; PPC64: blr + +; PPC32-DAG: stfd 1, 24(1) +; PPC32-DAG: stfd 2, 16(1) +; PPC32: nop +; PPC32-DAG: lwz [[HI0:[0-9]+]], 24(1) +; PPC32-DAG: lwz [[LO0:[0-9]+]], 16(1) +; PPC32-DAG: lwz [[HI1:[0-9]+]], 28(1) +; PPC32-DAG: lwz [[LO1:[0-9]+]], 20(1) +; PPC32-NOT: BARRIER +; PPC32-DAG: xoris [[HI0]], [[HI0]], 32768 +; PPC32-DAG: xoris [[LO0]], [[LO0]], 32768 +; PPC32: blr + %0 = fsub ppc_fp128 0xM80000000000000000000000000000000, %x + %1 = bitcast ppc_fp128 %0 to i128 + ret i128 %1 +} + +define i128 @test_copysign(ppc_fp128 %x) nounwind { +entry: +; PPC64-LABEL: test_copysign: +; PPC64-DAG: stxsdx 1, 0, [[ADDR_REG:[0-9]+]] +; PPC64-DAG: addi [[ADDR_REG]], 1, [[OFFSET:-?[0-9]+]] +; PPC64-DAG: li [[SIGN:[0-9]+]], 1 +; PPC64-DAG: sldi [[SIGN]], [[SIGN]], 63 +; PPC64-DAG: li [[HI_TMP:[0-9]+]], 16399 +; PPC64-DAG: sldi [[CST_HI:[0-9]+]], [[HI_TMP]], 48 +; PPC64-DAG: li [[LO_TMP:[0-9]+]], 3019 +; PPC64-DAG: sldi [[CST_LO:[0-9]+]], [[LO_TMP]], 52 +; PPC64-NOT: BARRIER +; PPC64-DAG: ld [[X_HI:[0-9]+]], [[OFFSET]](1) +; PPC64-DAG: and [[NEW_HI_TMP:[0-9]+]], [[X_HI]], [[SIGN]] +; PPC64-DAG: or 3, [[NEW_HI_TMP]], [[CST_HI]] +; PPC64-DAG: xor 4, [[SIGN]], [[CST_LO]] +; PPC64: blr + +; PPC32: stfd 1, [[STACK:[0-9]+]](1) +; PPC32: nop +; PPC32: lwz [[HI:[0-9]+]], [[STACK]](1) +; PPC32: rlwinm [[FLIP_BIT:[0-9]+]], [[HI]], 0, 0, 0 +; PPC32-NOT: BARRIER +; PPC32-DAG: oris {{[0-9]+}}, [[FLIP_BIT]], 16399 +; PPC32-DAG: xoris {{[0-9]+}}, [[FLIP_BIT]], 48304 +; PPC32: blr + %0 = tail call ppc_fp128 @llvm.copysign.ppcf128(ppc_fp128 0xMBCB0000000000000400F000000000000, ppc_fp128 %x) + %1 = bitcast ppc_fp128 %0 to i128 + ret i128 %1 +} + +declare ppc_fp128 @llvm.fabs.ppcf128(ppc_fp128) +declare ppc_fp128 @llvm.copysign.ppcf128(ppc_fp128, ppc_fp128) diff --git a/test/CodeGen/PowerPC/p8-scalar_vector_conversions.ll b/test/CodeGen/PowerPC/p8-scalar_vector_conversions.ll index 7e8991647aee..8da8df58a85c 100644 --- a/test/CodeGen/PowerPC/p8-scalar_vector_conversions.ll +++ b/test/CodeGen/PowerPC/p8-scalar_vector_conversions.ll @@ -1036,7 +1036,7 @@ entry: ; CHECK-DAG: mfvsrd [[MOV:[0-9]+]], ; CHECK-DAG: li [[IMM3:[0-9]+]], 3 ; CHECK-DAG: andc [[ANDC:[0-9]+]], [[IMM3]] -; CHECK-DAG: rldicr [[SHL:[0-9]+]], [[ANDC]], 4, 60 +; CHECK-DAG: sldi [[SHL:[0-9]+]], [[ANDC]], 4 ; CHECK-DAG: srd 3, [[MOV]], [[SHL]] ; CHECK-DAG: extsh 3, 3 ; CHECK-LE-LABEL: @getvelss @@ -1072,7 +1072,7 @@ entry: ; CHECK-DAG: mfvsrd [[MOV:[0-9]+]], ; CHECK-DAG: li [[IMM3:[0-9]+]], 3 ; CHECK-DAG: andc [[ANDC:[0-9]+]], [[IMM3]] -; CHECK-DAG: rldicr [[SHL:[0-9]+]], [[ANDC]], 4, 60 +; CHECK-DAG: sldi [[SHL:[0-9]+]], [[ANDC]], 4 ; CHECK-DAG: srd 3, [[MOV]], [[SHL]] ; CHECK-DAG: clrldi 3, 3, 48 ; CHECK-LE-LABEL: @getvelus diff --git a/test/CodeGen/PowerPC/peephole-align.ll b/test/CodeGen/PowerPC/peephole-align.ll new file mode 100644 index 000000000000..c8c2fe4d32ce --- /dev/null +++ b/test/CodeGen/PowerPC/peephole-align.ll @@ -0,0 +1,335 @@ +; RUN: llc -mcpu=pwr7 -O1 -code-model=medium <%s | FileCheck -check-prefix=POWER7 -check-prefix=CHECK %s +; RUN: llc -mcpu=pwr8 -O1 -code-model=medium <%s | FileCheck -check-prefix=POWER8 -check-prefix=CHECK %s + +; Test peephole optimization for medium code model (32-bit TOC offsets) +; for loading and storing small offsets within aligned values. +; For power8, verify that the optimization doesn't fire, as it prevents fusion +; opportunities. 
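+;
+; As a rough sketch (the symbol, offset, and register numbers below are only
+; illustrative, not checked output), the peephole folds the low TOC displacement
+; into the memory access, turning the medium-code-model sequence
+;   addis 3, 2, sym@toc@ha
+;   addi  3, 3, sym@toc@l
+;   lbz   4, 1(3)
+; into
+;   addis 3, 2, sym@toc@ha
+;   lbz   4, sym@toc@l+1(3)
+; when the symbol's alignment guarantees the folded offset stays valid.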
+ +target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" +target triple = "powerpc64-unknown-linux-gnu" + +%struct.b4 = type<{ i8, i8, i8, i8 }> +%struct.h2 = type<{ i16, i16 }> + +%struct.b8 = type<{ i8, i8, i8, i8, i8, i8, i8, i8 }> +%struct.h4 = type<{ i16, i16, i16, i16 }> +%struct.w2 = type<{ i32, i32 }> + +%struct.d2 = type<{ i64, i64 }> +%struct.misalign = type<{ i8, i64 }> + +@b4v = global %struct.b4 <{ i8 1, i8 2, i8 3, i8 4 }>, align 4 +@h2v = global %struct.h2 <{ i16 1, i16 2 }>, align 4 + +@b8v = global %struct.b8 <{ i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8 }>, align 8 +@h4v = global %struct.h4 <{ i16 1, i16 2, i16 3, i16 4 }>, align 8 +@w2v = global %struct.w2 <{ i32 1, i32 2 }>, align 8 + +@d2v = global %struct.d2 <{ i64 1, i64 2 }>, align 16 +@misalign_v = global %struct.misalign <{ i8 1, i64 2 }>, align 16 + +; CHECK-LABEL: test_b4: +; POWER7: addis [[REGSTRUCT:[0-9]+]], 2, b4v@toc@ha +; POWER7-DAG: lbz [[REG0_0:[0-9]+]], b4v@toc@l([[REGSTRUCT]]) +; POWER7-DAG: lbz [[REG1_0:[0-9]+]], b4v@toc@l+1([[REGSTRUCT]]) +; POWER7-DAG: lbz [[REG2_0:[0-9]+]], b4v@toc@l+2([[REGSTRUCT]]) +; POWER7-DAG: lbz [[REG3_0:[0-9]+]], b4v@toc@l+3([[REGSTRUCT]]) +; POWER7-DAG: addi [[REG0_1:[0-9]+]], [[REG0_0]], 1 +; POWER7-DAG: addi [[REG1_1:[0-9]+]], [[REG1_0]], 2 +; POWER7-DAG: addi [[REG2_1:[0-9]+]], [[REG2_0]], 3 +; POWER7-DAG: addi [[REG3_1:[0-9]+]], [[REG3_0]], 4 +; POWER7-DAG: stb [[REG0_1]], b4v@toc@l([[REGSTRUCT]]) +; POWER7-DAG: stb [[REG1_1]], b4v@toc@l+1([[REGSTRUCT]]) +; POWER7-DAG: stb [[REG2_1]], b4v@toc@l+2([[REGSTRUCT]]) +; POWER7-DAG: stb [[REG3_1]], b4v@toc@l+3([[REGSTRUCT]]) + +; POWER8: addis [[REGSTRUCT:[0-9]+]], 2, b4v@toc@ha +; POWER8-NEXT: addi [[REGSTRUCT]], [[REGSTRUCT]], b4v@toc@l +; POWER8-DAG: lbz [[REG0_0:[0-9]+]], 0([[REGSTRUCT]]) +; POWER8-DAG: lbz [[REG1_0:[0-9]+]], 1([[REGSTRUCT]]) +; POWER8-DAG: lbz [[REG2_0:[0-9]+]], 2([[REGSTRUCT]]) +; POWER8-DAG: lbz [[REG3_0:[0-9]+]], 3([[REGSTRUCT]]) +; POWER8-DAG: addi [[REG0_1:[0-9]+]], [[REG0_0]], 1 +; POWER8-DAG: addi [[REG1_1:[0-9]+]], [[REG1_0]], 2 +; POWER8-DAG: addi [[REG2_1:[0-9]+]], [[REG2_0]], 3 +; POWER8-DAG: addi [[REG3_1:[0-9]+]], [[REG3_0]], 4 +; POWER8-DAG: stb [[REG0_1]], 0([[REGSTRUCT]]) +; POWER8-DAG: stb [[REG1_1]], 1([[REGSTRUCT]]) +; POWER8-DAG: stb [[REG2_1]], 2([[REGSTRUCT]]) +; POWER8-DAG: stb [[REG3_1]], 3([[REGSTRUCT]]) +define void @test_b4() nounwind { +entry: + %0 = load i8, i8* getelementptr inbounds (%struct.b4, %struct.b4* @b4v, i32 0, i32 0), align 1 + %inc0 = add nsw i8 %0, 1 + store i8 %inc0, i8* getelementptr inbounds (%struct.b4, %struct.b4* @b4v, i32 0, i32 0), align 1 + %1 = load i8, i8* getelementptr inbounds (%struct.b4, %struct.b4* @b4v, i32 0, i32 1), align 1 + %inc1 = add nsw i8 %1, 2 + store i8 %inc1, i8* getelementptr inbounds (%struct.b4, %struct.b4* @b4v, i32 0, i32 1), align 1 + %2 = load i8, i8* getelementptr inbounds (%struct.b4, %struct.b4* @b4v, i32 0, i32 2), align 1 + %inc2 = add nsw i8 %2, 3 + store i8 %inc2, i8* getelementptr inbounds (%struct.b4, %struct.b4* @b4v, i32 0, i32 2), align 1 + %3 = load i8, i8* getelementptr inbounds (%struct.b4, %struct.b4* @b4v, i32 0, i32 3), align 1 + %inc3 = add nsw i8 %3, 4 + store i8 %inc3, i8* getelementptr inbounds (%struct.b4, %struct.b4* @b4v, i32 0, i32 3), align 1 + ret void +} + +; CHECK-LABEL: test_h2: +; POWER7: addis [[REGSTRUCT:[0-9]+]], 2, h2v@toc@ha +; POWER7-DAG: lhz [[REG0_0:[0-9]+]], h2v@toc@l([[REGSTRUCT]]) +; POWER7-DAG: lhz 
[[REG1_0:[0-9]+]], h2v@toc@l+2([[REGSTRUCT]]) +; POWER7-DAG: addi [[REG0_1:[0-9]+]], [[REG0_0]], 1 +; POWER7-DAG: addi [[REG1_1:[0-9]+]], [[REG1_0]], 2 +; POWER7-DAG: sth [[REG0_1]], h2v@toc@l([[REGSTRUCT]]) +; POWER7-DAG: sth [[REG1_1]], h2v@toc@l+2([[REGSTRUCT]]) + +; POWER8: addis [[REGSTRUCT:[0-9]+]], 2, h2v@toc@ha +; POWER8-NEXT: addi [[REGSTRUCT]], [[REGSTRUCT]], h2v@toc@l +; POWER8-DAG: lhz [[REG0_0:[0-9]+]], 0([[REGSTRUCT]]) +; POWER8-DAG: lhz [[REG1_0:[0-9]+]], 2([[REGSTRUCT]]) +; POWER8-DAG: addi [[REG0_1:[0-9]+]], [[REG0_0]], 1 +; POWER8-DAG: addi [[REG1_1:[0-9]+]], [[REG1_0]], 2 +; POWER8-DAG: sth [[REG0_1]], 0([[REGSTRUCT]]) +; POWER8-DAG: sth [[REG1_1]], 2([[REGSTRUCT]]) +define void @test_h2() nounwind { +entry: + %0 = load i16, i16* getelementptr inbounds (%struct.h2, %struct.h2* @h2v, i32 0, i32 0), align 2 + %inc0 = add nsw i16 %0, 1 + store i16 %inc0, i16* getelementptr inbounds (%struct.h2, %struct.h2* @h2v, i32 0, i32 0), align 2 + %1 = load i16, i16* getelementptr inbounds (%struct.h2, %struct.h2* @h2v, i32 0, i32 1), align 2 + %inc1 = add nsw i16 %1, 2 + store i16 %inc1, i16* getelementptr inbounds (%struct.h2, %struct.h2* @h2v, i32 0, i32 1), align 2 + ret void +} + +; CHECK-LABEL: test_h2_optsize: +; CHECK: addis [[REGSTRUCT:[0-9]+]], 2, h2v@toc@ha +; CHECK-DAG: lhz [[REG0_0:[0-9]+]], h2v@toc@l([[REGSTRUCT]]) +; CHECK-DAG: lhz [[REG1_0:[0-9]+]], h2v@toc@l+2([[REGSTRUCT]]) +; CHECK-DAG: addi [[REG0_1:[0-9]+]], [[REG0_0]], 1 +; CHECK-DAG: addi [[REG1_1:[0-9]+]], [[REG1_0]], 2 +; CHECK-DAG: sth [[REG0_1]], h2v@toc@l([[REGSTRUCT]]) +; CHECK-DAG: sth [[REG1_1]], h2v@toc@l+2([[REGSTRUCT]]) +define void @test_h2_optsize() optsize nounwind { +entry: + %0 = load i16, i16* getelementptr inbounds (%struct.h2, %struct.h2* @h2v, i32 0, i32 0), align 2 + %inc0 = add nsw i16 %0, 1 + store i16 %inc0, i16* getelementptr inbounds (%struct.h2, %struct.h2* @h2v, i32 0, i32 0), align 2 + %1 = load i16, i16* getelementptr inbounds (%struct.h2, %struct.h2* @h2v, i32 0, i32 1), align 2 + %inc1 = add nsw i16 %1, 2 + store i16 %inc1, i16* getelementptr inbounds (%struct.h2, %struct.h2* @h2v, i32 0, i32 1), align 2 + ret void +} + +; CHECK-LABEL: test_b8: +; POWER7: addis [[REGSTRUCT:[0-9]+]], 2, b8v@toc@ha +; POWER7-DAG: lbz [[REG0_0:[0-9]+]], b8v@toc@l([[REGSTRUCT]]) +; POWER7-DAG: lbz [[REG1_0:[0-9]+]], b8v@toc@l+1([[REGSTRUCT]]) +; POWER7-DAG: lbz [[REG2_0:[0-9]+]], b8v@toc@l+2([[REGSTRUCT]]) +; POWER7-DAG: lbz [[REG3_0:[0-9]+]], b8v@toc@l+3([[REGSTRUCT]]) +; POWER7-DAG: lbz [[REG4_0:[0-9]+]], b8v@toc@l+4([[REGSTRUCT]]) +; POWER7-DAG: lbz [[REG5_0:[0-9]+]], b8v@toc@l+5([[REGSTRUCT]]) +; POWER7-DAG: lbz [[REG6_0:[0-9]+]], b8v@toc@l+6([[REGSTRUCT]]) +; POWER7-DAG: lbz [[REG7_0:[0-9]+]], b8v@toc@l+7([[REGSTRUCT]]) +; POWER7-DAG: addi [[REG0_1:[0-9]+]], [[REG0_0]], 1 +; POWER7-DAG: addi [[REG1_1:[0-9]+]], [[REG1_0]], 2 +; POWER7-DAG: addi [[REG2_1:[0-9]+]], [[REG2_0]], 3 +; POWER7-DAG: addi [[REG3_1:[0-9]+]], [[REG3_0]], 4 +; POWER7-DAG: addi [[REG4_1:[0-9]+]], [[REG4_0]], 5 +; POWER7-DAG: addi [[REG5_1:[0-9]+]], [[REG5_0]], 6 +; POWER7-DAG: addi [[REG6_1:[0-9]+]], [[REG6_0]], 7 +; POWER7-DAG: addi [[REG7_1:[0-9]+]], [[REG7_0]], 8 +; POWER7-DAG: stb [[REG0_1]], b8v@toc@l([[REGSTRUCT]]) +; POWER7-DAG: stb [[REG1_1]], b8v@toc@l+1([[REGSTRUCT]]) +; POWER7-DAG: stb [[REG2_1]], b8v@toc@l+2([[REGSTRUCT]]) +; POWER7-DAG: stb [[REG3_1]], b8v@toc@l+3([[REGSTRUCT]]) +; POWER7-DAG: stb [[REG4_1]], b8v@toc@l+4([[REGSTRUCT]]) +; POWER7-DAG: stb [[REG5_1]], b8v@toc@l+5([[REGSTRUCT]]) +; POWER7-DAG: stb 
[[REG6_1]], b8v@toc@l+6([[REGSTRUCT]]) +; POWER7-DAG: stb [[REG7_1]], b8v@toc@l+7([[REGSTRUCT]]) + +; POWER8: addis [[REGSTRUCT:[0-9]+]], 2, b8v@toc@ha +; POWER8-NEXT: addi [[REGSTRUCT]], [[REGSTRUCT]], b8v@toc@l +; POWER8-DAG: lbz [[REG0_0:[0-9]+]], 0([[REGSTRUCT]]) +; POWER8-DAG: lbz [[REG1_0:[0-9]+]], 1([[REGSTRUCT]]) +; POWER8-DAG: lbz [[REG2_0:[0-9]+]], 2([[REGSTRUCT]]) +; POWER8-DAG: lbz [[REG3_0:[0-9]+]], 3([[REGSTRUCT]]) +; POWER8-DAG: lbz [[REG4_0:[0-9]+]], 4([[REGSTRUCT]]) +; POWER8-DAG: lbz [[REG5_0:[0-9]+]], 5([[REGSTRUCT]]) +; POWER8-DAG: lbz [[REG6_0:[0-9]+]], 6([[REGSTRUCT]]) +; POWER8-DAG: lbz [[REG7_0:[0-9]+]], 7([[REGSTRUCT]]) +; POWER8-DAG: addi [[REG0_1:[0-9]+]], [[REG0_0]], 1 +; POWER8-DAG: addi [[REG1_1:[0-9]+]], [[REG1_0]], 2 +; POWER8-DAG: addi [[REG2_1:[0-9]+]], [[REG2_0]], 3 +; POWER8-DAG: addi [[REG3_1:[0-9]+]], [[REG3_0]], 4 +; POWER8-DAG: addi [[REG4_1:[0-9]+]], [[REG4_0]], 5 +; POWER8-DAG: addi [[REG5_1:[0-9]+]], [[REG5_0]], 6 +; POWER8-DAG: addi [[REG6_1:[0-9]+]], [[REG6_0]], 7 +; POWER8-DAG: addi [[REG7_1:[0-9]+]], [[REG7_0]], 8 +; POWER8-DAG: stb [[REG0_1]], 0([[REGSTRUCT]]) +; POWER8-DAG: stb [[REG1_1]], 1([[REGSTRUCT]]) +; POWER8-DAG: stb [[REG2_1]], 2([[REGSTRUCT]]) +; POWER8-DAG: stb [[REG3_1]], 3([[REGSTRUCT]]) +; POWER8-DAG: stb [[REG4_1]], 4([[REGSTRUCT]]) +; POWER8-DAG: stb [[REG5_1]], 5([[REGSTRUCT]]) +; POWER8-DAG: stb [[REG6_1]], 6([[REGSTRUCT]]) +; POWER8-DAG: stb [[REG7_1]], 7([[REGSTRUCT]]) +define void @test_b8() nounwind { +entry: + %0 = load i8, i8* getelementptr inbounds (%struct.b8, %struct.b8* @b8v, i32 0, i32 0), align 1 + %inc0 = add nsw i8 %0, 1 + store i8 %inc0, i8* getelementptr inbounds (%struct.b8, %struct.b8* @b8v, i32 0, i32 0), align 1 + %1 = load i8, i8* getelementptr inbounds (%struct.b8, %struct.b8* @b8v, i32 0, i32 1), align 1 + %inc1 = add nsw i8 %1, 2 + store i8 %inc1, i8* getelementptr inbounds (%struct.b8, %struct.b8* @b8v, i32 0, i32 1), align 1 + %2 = load i8, i8* getelementptr inbounds (%struct.b8, %struct.b8* @b8v, i32 0, i32 2), align 1 + %inc2 = add nsw i8 %2, 3 + store i8 %inc2, i8* getelementptr inbounds (%struct.b8, %struct.b8* @b8v, i32 0, i32 2), align 1 + %3 = load i8, i8* getelementptr inbounds (%struct.b8, %struct.b8* @b8v, i32 0, i32 3), align 1 + %inc3 = add nsw i8 %3, 4 + store i8 %inc3, i8* getelementptr inbounds (%struct.b8, %struct.b8* @b8v, i32 0, i32 3), align 1 + %4 = load i8, i8* getelementptr inbounds (%struct.b8, %struct.b8* @b8v, i32 0, i32 4), align 1 + %inc4 = add nsw i8 %4, 5 + store i8 %inc4, i8* getelementptr inbounds (%struct.b8, %struct.b8* @b8v, i32 0, i32 4), align 1 + %5 = load i8, i8* getelementptr inbounds (%struct.b8, %struct.b8* @b8v, i32 0, i32 5), align 1 + %inc5 = add nsw i8 %5, 6 + store i8 %inc5, i8* getelementptr inbounds (%struct.b8, %struct.b8* @b8v, i32 0, i32 5), align 1 + %6 = load i8, i8* getelementptr inbounds (%struct.b8, %struct.b8* @b8v, i32 0, i32 6), align 1 + %inc6 = add nsw i8 %6, 7 + store i8 %inc6, i8* getelementptr inbounds (%struct.b8, %struct.b8* @b8v, i32 0, i32 6), align 1 + %7 = load i8, i8* getelementptr inbounds (%struct.b8, %struct.b8* @b8v, i32 0, i32 7), align 1 + %inc7 = add nsw i8 %7, 8 + store i8 %inc7, i8* getelementptr inbounds (%struct.b8, %struct.b8* @b8v, i32 0, i32 7), align 1 + ret void +} + +; CHECK-LABEL: test_h4: +; POWER7: addis [[REGSTRUCT:[0-9]+]], 2, h4v@toc@ha +; POWER7-DAG: lhz [[REG0_0:[0-9]+]], h4v@toc@l([[REGSTRUCT]]) +; POWER7-DAG: lhz [[REG1_0:[0-9]+]], h4v@toc@l+2([[REGSTRUCT]]) +; POWER7-DAG: lhz [[REG2_0:[0-9]+]], 
h4v@toc@l+4([[REGSTRUCT]]) +; POWER7-DAG: lhz [[REG3_0:[0-9]+]], h4v@toc@l+6([[REGSTRUCT]]) +; POWER7-DAG: addi [[REG0_1:[0-9]+]], [[REG0_0]], 1 +; POWER7-DAG: addi [[REG1_1:[0-9]+]], [[REG1_0]], 2 +; POWER7-DAG: addi [[REG2_1:[0-9]+]], [[REG2_0]], 3 +; POWER7-DAG: addi [[REG3_1:[0-9]+]], [[REG3_0]], 4 +; POWER7-DAG: sth [[REG0_1]], h4v@toc@l([[REGSTRUCT]]) +; POWER7-DAG: sth [[REG1_1]], h4v@toc@l+2([[REGSTRUCT]]) +; POWER7-DAG: sth [[REG2_1]], h4v@toc@l+4([[REGSTRUCT]]) +; POWER7-DAG: sth [[REG3_1]], h4v@toc@l+6([[REGSTRUCT]]) + +; POWER8: addis [[REGSTRUCT:[0-9]+]], 2, h4v@toc@ha +; POWER8-NEXT: addi [[REGSTRUCT]], [[REGSTRUCT]], h4v@toc@l +; POWER8-DAG: lhz [[REG0_0:[0-9]+]], 0([[REGSTRUCT]]) +; POWER8-DAG: lhz [[REG1_0:[0-9]+]], 2([[REGSTRUCT]]) +; POWER8-DAG: lhz [[REG2_0:[0-9]+]], 4([[REGSTRUCT]]) +; POWER8-DAG: lhz [[REG3_0:[0-9]+]], 6([[REGSTRUCT]]) +; POWER8-DAG: addi [[REG0_1:[0-9]+]], [[REG0_0]], 1 +; POWER8-DAG: addi [[REG1_1:[0-9]+]], [[REG1_0]], 2 +; POWER8-DAG: addi [[REG2_1:[0-9]+]], [[REG2_0]], 3 +; POWER8-DAG: addi [[REG3_1:[0-9]+]], [[REG3_0]], 4 +; POWER8-DAG: sth [[REG0_1]], 0([[REGSTRUCT]]) +; POWER8-DAG: sth [[REG1_1]], 2([[REGSTRUCT]]) +; POWER8-DAG: sth [[REG2_1]], 4([[REGSTRUCT]]) +; POWER8-DAG: sth [[REG3_1]], 6([[REGSTRUCT]]) +define void @test_h4() nounwind { +entry: + %0 = load i16, i16* getelementptr inbounds (%struct.h4, %struct.h4* @h4v, i32 0, i32 0), align 2 + %inc0 = add nsw i16 %0, 1 + store i16 %inc0, i16* getelementptr inbounds (%struct.h4, %struct.h4* @h4v, i32 0, i32 0), align 2 + %1 = load i16, i16* getelementptr inbounds (%struct.h4, %struct.h4* @h4v, i32 0, i32 1), align 2 + %inc1 = add nsw i16 %1, 2 + store i16 %inc1, i16* getelementptr inbounds (%struct.h4, %struct.h4* @h4v, i32 0, i32 1), align 2 + %2 = load i16, i16* getelementptr inbounds (%struct.h4, %struct.h4* @h4v, i32 0, i32 2), align 2 + %inc2 = add nsw i16 %2, 3 + store i16 %inc2, i16* getelementptr inbounds (%struct.h4, %struct.h4* @h4v, i32 0, i32 2), align 2 + %3 = load i16, i16* getelementptr inbounds (%struct.h4, %struct.h4* @h4v, i32 0, i32 3), align 2 + %inc3 = add nsw i16 %3, 4 + store i16 %inc3, i16* getelementptr inbounds (%struct.h4, %struct.h4* @h4v, i32 0, i32 3), align 2 + ret void +} + +; CHECK-LABEL: test_w2: +; POWER7: addis [[REGSTRUCT:[0-9]+]], 2, w2v@toc@ha +; POWER7-DAG: lwz [[REG0_0:[0-9]+]], w2v@toc@l([[REGSTRUCT]]) +; POWER7-DAG: lwz [[REG1_0:[0-9]+]], w2v@toc@l+4([[REGSTRUCT]]) +; POWER7-DAG: addi [[REG0_1:[0-9]+]], [[REG0_0]], 1 +; POWER7-DAG: addi [[REG1_1:[0-9]+]], [[REG1_0]], 2 +; POWER7-DAG: stw [[REG0_1]], w2v@toc@l([[REGSTRUCT]]) +; POWER7-DAG: stw [[REG1_1]], w2v@toc@l+4([[REGSTRUCT]]) + +; POWER8: addis [[REGSTRUCT:[0-9]+]], 2, w2v@toc@ha +; POWER8-NEXT: addi [[REGSTRUCT]], [[REGSTRUCT]], w2v@toc@l +; POWER8-DAG: lwz [[REG0_0:[0-9]+]], 0([[REGSTRUCT]]) +; POWER8-DAG: lwz [[REG1_0:[0-9]+]], 4([[REGSTRUCT]]) +; POWER8-DAG: addi [[REG0_1:[0-9]+]], [[REG0_0]], 1 +; POWER8-DAG: addi [[REG1_1:[0-9]+]], [[REG1_0]], 2 +; POWER8-DAG: stw [[REG0_1]], 0([[REGSTRUCT]]) +; POWER8-DAG: stw [[REG1_1]], 4([[REGSTRUCT]]) +define void @test_w2() nounwind { +entry: + %0 = load i32, i32* getelementptr inbounds (%struct.w2, %struct.w2* @w2v, i32 0, i32 0), align 4 + %inc0 = add nsw i32 %0, 1 + store i32 %inc0, i32* getelementptr inbounds (%struct.w2, %struct.w2* @w2v, i32 0, i32 0), align 4 + %1 = load i32, i32* getelementptr inbounds (%struct.w2, %struct.w2* @w2v, i32 0, i32 1), align 4 + %inc1 = add nsw i32 %1, 2 + store i32 %inc1, i32* getelementptr inbounds 
(%struct.w2, %struct.w2* @w2v, i32 0, i32 1), align 4 + ret void +} + +; CHECK-LABEL: test_d2: +; POWER7: addis [[REGSTRUCT:[0-9]+]], 2, d2v@toc@ha +; POWER7-DAG: ld [[REG0_0:[0-9]+]], d2v@toc@l([[REGSTRUCT]]) +; POWER7-DAG: ld [[REG1_0:[0-9]+]], d2v@toc@l+8([[REGSTRUCT]]) +; POWER7-DAG: addi [[REG0_1:[0-9]+]], [[REG0_0]], 1 +; POWER7-DAG: addi [[REG1_1:[0-9]+]], [[REG1_0]], 2 +; POWER7-DAG: std [[REG0_1]], d2v@toc@l([[REGSTRUCT]]) +; POWER7-DAG: std [[REG1_1]], d2v@toc@l+8([[REGSTRUCT]]) + +; POWER8: addis [[REGSTRUCT:[0-9]+]], 2, d2v@toc@ha +; POWER8-NEXT: addi [[REGSTRUCT]], [[REGSTRUCT]], d2v@toc@l +; POWER8-DAG: ld [[REG0_0:[0-9]+]], 0([[REGSTRUCT]]) +; POWER8-DAG: ld [[REG1_0:[0-9]+]], 8([[REGSTRUCT]]) +; POWER8-DAG: addi [[REG0_1:[0-9]+]], [[REG0_0]], 1 +; POWER8-DAG: addi [[REG1_1:[0-9]+]], [[REG1_0]], 2 +; POWER8-DAG: std [[REG0_1]], 0([[REGSTRUCT]]) +; POWER8-DAG: std [[REG1_1]], 8([[REGSTRUCT]]) +define void @test_d2() nounwind { +entry: + %0 = load i64, i64* getelementptr inbounds (%struct.d2, %struct.d2* @d2v, i32 0, i32 0), align 8 + %inc0 = add nsw i64 %0, 1 + store i64 %inc0, i64* getelementptr inbounds (%struct.d2, %struct.d2* @d2v, i32 0, i32 0), align 8 + %1 = load i64, i64* getelementptr inbounds (%struct.d2, %struct.d2* @d2v, i32 0, i32 1), align 8 + %inc1 = add nsw i64 %1, 2 + store i64 %inc1, i64* getelementptr inbounds (%struct.d2, %struct.d2* @d2v, i32 0, i32 1), align 8 + ret void +} + +; Make sure the optimization fires on power8 if there is a single use resulting +; in a better fusion opportunity. +; register 3 is the return value, so it should be chosen +; CHECK-LABEL: test_singleuse: +; CHECK: addis 3, 2, d2v@toc@ha +; CHECK: ld 3, d2v@toc@l+8(3) +define i64 @test_singleuse() nounwind { +entry: + %0 = load i64, i64* getelementptr inbounds (%struct.d2, %struct.d2* @d2v, i32 0, i32 1), align 8 + ret i64 %0 +} + +; Make sure the optimization fails to fire if the symbol is aligned, but the offset is not. 
+; CHECK-LABEL: test_misalign:
+; POWER7: addis [[REGSTRUCT_0:[0-9]+]], 2, misalign_v@toc@ha
+; POWER7: addi [[REGSTRUCT:[0-9]+]], [[REGSTRUCT_0]], misalign_v@toc@l
+; POWER7: li [[OFFSET_REG:[0-9]+]], 1
+; POWER7: ldx [[REG0_0:[0-9]+]], [[REGSTRUCT]], [[OFFSET_REG]]
+; POWER7: addi [[REG0_1:[0-9]+]], [[REG0_0]], 1
+; POWER7: stdx [[REG0_1]], [[REGSTRUCT]], [[OFFSET_REG]]
+define void @test_misalign() nounwind {
+entry:
+  %0 = load i64, i64* getelementptr inbounds (%struct.misalign, %struct.misalign* @misalign_v, i32 0, i32 1), align 1
+  %inc0 = add nsw i64 %0, 1
+  store i64 %inc0, i64* getelementptr inbounds (%struct.misalign, %struct.misalign* @misalign_v, i32 0, i32 1), align 1
+  ret void
+}
diff --git a/test/CodeGen/PowerPC/variable_elem_vec_extracts.ll b/test/CodeGen/PowerPC/variable_elem_vec_extracts.ll
new file mode 100644
index 000000000000..3d4789360f55
--- /dev/null
+++ b/test/CodeGen/PowerPC/variable_elem_vec_extracts.ll
@@ -0,0 +1,114 @@
+; RUN: llc -mcpu=pwr8 -mtriple=powerpc64le-unknown-unknown < %s | FileCheck %s
+; RUN: llc -mcpu=pwr8 -mtriple=powerpc64-unknown-unknown < %s | FileCheck %s \
+; RUN:   --check-prefix=CHECK-BE
+; RUN: llc -mcpu=pwr7 -mtriple=powerpc64-unknown-unknown < %s | FileCheck %s \
+; RUN:   --check-prefix=CHECK-P7
+
+; Function Attrs: norecurse nounwind readnone
+define signext i32 @geti(<4 x i32> %a, i32 signext %b) {
+entry:
+  %vecext = extractelement <4 x i32> %a, i32 %b
+  ret i32 %vecext
+; CHECK-LABEL: @geti
+; CHECK-P7-LABEL: @geti
+; CHECK-BE-LABEL: @geti
+; CHECK-DAG: li [[TRUNCREG:[0-9]+]], 2
+; CHECK-DAG: andc [[MASKREG:[0-9]+]], [[TRUNCREG]], 5
+; CHECK-DAG: sldi [[SHIFTREG:[0-9]+]], [[MASKREG]], 2
+; CHECK-DAG: lvsl [[SHMSKREG:[0-9]+]], 0, [[SHIFTREG]]
+; CHECK-DAG: vperm [[PERMVEC:[0-9]+]], 2, 2, [[SHMSKREG]]
+; CHECK-DAG: li [[ONEREG:[0-9]+]], 1
+; CHECK-DAG: and [[ELEMSREG:[0-9]+]], [[ONEREG]], 5
+; CHECK-DAG: sldi [[SHAMREG:[0-9]+]], [[ELEMSREG]], 5
+; CHECK: mfvsrd [[TOGPR:[0-9]+]],
+; CHECK: srd [[RSHREG:[0-9]+]], [[TOGPR]], [[SHAMREG]]
+; CHECK: extsw 3, [[RSHREG]]
+; CHECK-P7-DAG: sldi [[ELEMOFFREG:[0-9]+]], 5, 2
+; CHECK-P7-DAG: stxvw4x 34,
+; CHECK-P7: lwax 3, [[ELEMOFFREG]],
+; CHECK-BE-DAG: andi. [[ANDREG:[0-9]+]], 5, 2
+; CHECK-BE-DAG: sldi [[SLREG:[0-9]+]], [[ANDREG]], 2
+; CHECK-BE-DAG: lvsl [[SHMSKREG:[0-9]+]], 0, [[SLREG]]
+; CHECK-BE-DAG: vperm {{[0-9]+}}, 2, 2, [[SHMSKREG]]
+; CHECK-BE-DAG: li [[IMMREG:[0-9]+]], 1
+; CHECK-BE-DAG: andc [[ANDCREG:[0-9]+]], [[IMMREG]], 5
+; CHECK-BE-DAG: sldi [[SHAMREG:[0-9]+]], [[ANDCREG]], 5
+; CHECK-BE: mfvsrd [[TOGPR:[0-9]+]],
+; CHECK-BE: srd [[RSHREG:[0-9]+]], [[TOGPR]], [[SHAMREG]]
+; CHECK-BE: extsw 3, [[RSHREG]]
+}
+
+; Function Attrs: norecurse nounwind readnone
+define i64 @getl(<2 x i64> %a, i32 signext %b) {
+entry:
+  %vecext = extractelement <2 x i64> %a, i32 %b
+  ret i64 %vecext
+; CHECK-LABEL: @getl
+; CHECK-P7-LABEL: @getl
+; CHECK-BE-LABEL: @getl
+; CHECK-DAG: li [[TRUNCREG:[0-9]+]], 1
+; CHECK-DAG: andc [[MASKREG:[0-9]+]], [[TRUNCREG]], 5
+; CHECK-DAG: sldi [[SHIFTREG:[0-9]+]], [[MASKREG]], 3
+; CHECK-DAG: lvsl [[SHMSKREG:[0-9]+]], 0, [[SHIFTREG]]
+; CHECK-DAG: vperm [[PERMVEC:[0-9]+]], 2, 2, [[SHMSKREG]]
+; CHECK: mfvsrd 3,
+; CHECK-P7-DAG: sldi [[ELEMOFFREG:[0-9]+]], 5, 3
+; CHECK-P7-DAG: stxvd2x 34,
+; CHECK-P7: ldx 3, [[ELEMOFFREG]],
+; CHECK-BE-DAG: andi.
[[ANDREG:[0-9]+]], 5, 1 +; CHECK-BE-DAG: sldi [[SLREG:[0-9]+]], [[ANDREG]], 3 +; CHECK-BE-DAG: lvsl [[SHMSKREG:[0-9]+]], 0, [[SLREG]] +; CHECK-BE-DAG: vperm {{[0-9]+}}, 2, 2, [[SHMSKREG]] +; CHECK-BE: mfvsrd 3, +} + +; Function Attrs: norecurse nounwind readnone +define float @getf(<4 x float> %a, i32 signext %b) { +entry: + %vecext = extractelement <4 x float> %a, i32 %b + ret float %vecext +; CHECK-LABEL: @getf +; CHECK-P7-LABEL: @getf +; CHECK-BE-LABEL: @getf +; CHECK: li [[IMMREG:[0-9]+]], 3 +; CHECK: xor [[TRUNCREG:[0-9]+]], [[IMMREG]], 5 +; CHECK: lvsl [[SHMSKREG:[0-9]+]], 0, [[TRUNCREG]] +; CHECK: vperm {{[0-9]+}}, 2, 2, [[SHMSKREG]] +; CHECK: xscvspdpn 1, +; CHECK-P7-DAG: sldi [[ELEMOFFREG:[0-9]+]], 5, 2 +; CHECK-P7-DAG: stxvw4x 34, +; CHECK-P7: lfsx 1, [[ELEMOFFREG]], +; CHECK-BE: sldi [[ELNOREG:[0-9]+]], 5, 2 +; CHECK-BE: lvsl [[SHMSKREG:[0-9]+]], 0, [[ELNOREG]] +; CHECK-BE: vperm {{[0-9]+}}, 2, 2, [[SHMSKREG]] +; CHECK-BE: xscvspdpn 1, +} + +; Function Attrs: norecurse nounwind readnone +define double @getd(<2 x double> %a, i32 signext %b) { +entry: + %vecext = extractelement <2 x double> %a, i32 %b + ret double %vecext +; CHECK-LABEL: @getd +; CHECK-P7-LABEL: @getd +; CHECK-BE-LABEL: @getd +; CHECK: li [[TRUNCREG:[0-9]+]], 1 +; CHECK: andc [[MASKREG:[0-9]+]], [[TRUNCREG]], 5 +; CHECK: sldi [[SHIFTREG:[0-9]+]], [[MASKREG]], 3 +; CHECK: lvsl [[SHMSKREG:[0-9]+]], 0, [[SHIFTREG]] +; CHECK: vperm {{[0-9]+}}, 2, 2, [[SHMSKREG]] +; FIXME: the instruction below is a redundant regclass copy, to be removed +; CHECK: xxlor 1, +; CHECK-P7-DAG: andi. [[ANDREG:[0-9]+]], 5, 1 +; CHECK-P7-DAG: sldi [[SLREG:[0-9]+]], [[ANDREG]], 3 +; CHECK-P7-DAG: lvsl [[SHMSKREG:[0-9]+]], 0, [[SLREG]] +; CHECK-P7-DAG: vperm {{[0-9]+}}, 2, 2, [[SHMSKREG]] +; FIXME: the instruction below is a redundant regclass copy, to be removed +; CHECK-P7: xxlor 1, +; CHECK-BE-DAG: andi. 
[[ANDREG:[0-9]+]], 5, 1 +; CHECK-BE-DAG: sldi [[SLREG:[0-9]+]], [[ANDREG]], 3 +; CHECK-BE-DAG: lvsl [[SHMSKREG:[0-9]+]], 0, [[SLREG]] +; CHECK-BE-DAG: vperm {{[0-9]+}}, 2, 2, [[SHMSKREG]] +; FIXME: the instruction below is a redundant regclass copy, to be removed +; CHECK-BE: xxlor 1, +} diff --git a/test/CodeGen/Thumb/large-stack.ll b/test/CodeGen/Thumb/large-stack.ll index 0d534589ae0a..c5d1044e9d69 100644 --- a/test/CodeGen/Thumb/large-stack.ll +++ b/test/CodeGen/Thumb/large-stack.ll @@ -32,10 +32,10 @@ define void @test100() { ; Smallest stack for which we use a constant pool define void @test2() { ; CHECK-LABEL: test2: -; CHECK: ldr r0, -; CHECK: add sp, r0 -; EABI: ldr r0, -; EABI: add sp, r0 +; CHECK: ldr [[TEMP:r[0-7]]], +; CHECK: add sp, [[TEMP]] +; EABI: ldr [[TEMP:r[0-7]]], +; EABI: add sp, [[TEMP]] ; IOS: subs r4, r7, #4 ; IOS: mov sp, r4 %tmp = alloca [ 1528 x i8 ] , align 4 @@ -44,12 +44,12 @@ define void @test2() { define i32 @test3() { ; CHECK-LABEL: test3: -; CHECK: ldr r1, -; CHECK: add sp, r1 -; CHECK: ldr r1, -; CHECK: add r1, sp -; EABI: ldr r1, -; EABI: add sp, r1 +; CHECK: ldr [[TEMP:r[0-7]]], +; CHECK: add sp, [[TEMP]] +; CHECK: ldr [[TEMP]], +; CHECK: add [[TEMP]], sp +; EABI: ldr [[TEMP:r[0-7]]], +; EABI: add sp, [[TEMP]] ; IOS: subs r4, r7, #4 ; IOS: mov sp, r4 %retval = alloca i32, align 4 diff --git a/test/CodeGen/Thumb/pop-special-fixup.ll b/test/CodeGen/Thumb/pop-special-fixup.ll deleted file mode 100644 index 9ba589d6cec3..000000000000 --- a/test/CodeGen/Thumb/pop-special-fixup.ll +++ /dev/null @@ -1,60 +0,0 @@ -; RUN: llc %s -enable-shrink-wrap=true -o - | FileCheck %s - -target triple = "thumbv6m-none-none-eabi" - -@retval = global i32 0, align 4 - -define i32 @test(i32 %i, i32 %argc, i8** nocapture readonly %argv) { - %1 = icmp sgt i32 %argc, %i - br i1 %1, label %2, label %19 - - %3 = getelementptr inbounds i8*, i8** %argv, i32 %i - %4 = load i8*, i8** %3, align 4 - %5 = load i8, i8* %4, align 1 - %6 = icmp eq i8 %5, 45 - %7 = getelementptr inbounds i8, i8* %4, i32 1 - %. = select i1 %6, i8* %7, i8* %4 - %.1 = select i1 %6, i32 -1, i32 1 - %8 = load i8, i8* %., align 1 - %.off2 = add i8 %8, -48 - %9 = icmp ult i8 %.off2, 10 - %.pre = load i32, i32* @retval, align 4 - br i1 %9, label %.lr.ph.preheader, label %.critedge - -.lr.ph.preheader: ; preds = %2 - br label %.lr.ph - -.lr.ph: ; preds = %.lr.ph.preheader, %.lr.ph - %10 = phi i32 [ %14, %.lr.ph ], [ %.pre, %.lr.ph.preheader ] - %11 = phi i8 [ %15, %.lr.ph ], [ %8, %.lr.ph.preheader ] - %valstring.03 = phi i8* [ %13, %.lr.ph ], [ %., %.lr.ph.preheader ] - %12 = zext i8 %11 to i32 - %13 = getelementptr inbounds i8, i8* %valstring.03, i32 1 - %14 = add nsw i32 %10, %12 - store i32 %14, i32* @retval, align 4 - %15 = load i8, i8* %13, align 1 - %.off = add i8 %15, -48 - %16 = icmp ult i8 %.off, 10 - br i1 %16, label %.lr.ph, label %.critedge.loopexit - -.critedge.loopexit: ; preds = %.lr.ph - %.lcssa = phi i32 [ %14, %.lr.ph ] - br label %.critedge - -.critedge: ; preds = %.critedge.loopexit, %2 - %17 = phi i32 [ %.pre, %2 ], [ %.lcssa, %.critedge.loopexit ] - %18 = mul nsw i32 %17, %.1 - store i32 %18, i32* @retval, align 4 - br label %19 - -;