diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4028096d030a..80ca91289752 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -377,6 +377,31 @@ option(LLVM_ENABLE_THREADS "Use threads if available." ON)
 
 option(LLVM_ENABLE_ZLIB "Use zlib for compression/decompression if available." ON)
 
+set(LLVM_Z3_INSTALL_DIR "" CACHE STRING "Install directory of the Z3 solver.")
+
+find_package(Z3 4.7.1)
+
+if (LLVM_Z3_INSTALL_DIR)
+  if (NOT Z3_FOUND)
+    message(FATAL_ERROR "Z3 >= 4.7.1 has not been found in LLVM_Z3_INSTALL_DIR: ${LLVM_Z3_INSTALL_DIR}.")
+  endif()
+endif()
+
+set(LLVM_ENABLE_Z3_SOLVER_DEFAULT "${Z3_FOUND}")
+
+option(LLVM_ENABLE_Z3_SOLVER
+  "Enable support for the Z3 constraint solver in LLVM."
+  ${LLVM_ENABLE_Z3_SOLVER_DEFAULT}
+)
+
+if (LLVM_ENABLE_Z3_SOLVER)
+  if (NOT Z3_FOUND)
+    message(FATAL_ERROR "LLVM_ENABLE_Z3_SOLVER cannot be enabled when Z3 is not available.")
+  endif()
+
+  set(LLVM_WITH_Z3 1)
+endif()
+
 if( LLVM_TARGETS_TO_BUILD STREQUAL "all" )
   set( LLVM_TARGETS_TO_BUILD ${LLVM_ALL_TARGETS} )
 endif()
diff --git a/cmake/modules/FindZ3.cmake b/cmake/modules/FindZ3.cmake
new file mode 100644
index 000000000000..04294275535b
--- /dev/null
+++ b/cmake/modules/FindZ3.cmake
@@ -0,0 +1,110 @@
+INCLUDE(CheckCXXSourceRuns)
+
+# Function to check Z3's version
+function(check_z3_version z3_include z3_lib)
+  # The program that will be executed to print Z3's version.
+  file(WRITE ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/testz3.c
+       "#include <stdio.h>
+       #include <z3.h>
+       int main() {
+         unsigned int major, minor, build, rev;
+         Z3_get_version(&major, &minor, &build, &rev);
+         printf(\"%u.%u.%u\", major, minor, build);
+         return 0;
+       }")
+
+  # Get lib path
+  get_filename_component(z3_lib_path ${z3_lib} PATH)
+
+  try_run(
+    Z3_RETURNCODE
+    Z3_COMPILED
+    ${CMAKE_BINARY_DIR}
+    ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/testz3.c
+    COMPILE_DEFINITIONS -I"${z3_include}"
+    LINK_LIBRARIES -L${z3_lib_path} -lz3
+    RUN_OUTPUT_VARIABLE SRC_OUTPUT
+  )
+
+  if(Z3_COMPILED)
+    string(REGEX REPLACE "([0-9]*\\.[0-9]*\\.[0-9]*\\.[0-9]*)" "\\1"
+           z3_version "${SRC_OUTPUT}")
+    set(Z3_VERSION_STRING ${z3_version} PARENT_SCOPE)
+  endif()
+endfunction(check_z3_version)
+
+# Looking for Z3 in LLVM_Z3_INSTALL_DIR
+find_path(Z3_INCLUDE_DIR NAMES z3.h
+  NO_DEFAULT_PATH
+  PATHS ${LLVM_Z3_INSTALL_DIR}/include
+  PATH_SUFFIXES libz3 z3
+  )
+
+find_library(Z3_LIBRARIES NAMES z3 libz3
+  NO_DEFAULT_PATH
+  PATHS ${LLVM_Z3_INSTALL_DIR}
+  PATH_SUFFIXES lib bin
+  )
+
+# If Z3 has not been found in LLVM_Z3_INSTALL_DIR look in the default directories
+find_path(Z3_INCLUDE_DIR NAMES z3.h
+  PATH_SUFFIXES libz3 z3
+  )
+
+find_library(Z3_LIBRARIES NAMES z3 libz3
+  PATH_SUFFIXES lib bin
+  )
+
+# Searching for the version of the Z3 library is a best-effort task
+unset(Z3_VERSION_STRING)
+
+# First, try to check it dynamically, by compiling a small program that
+# prints Z3's version
+if(Z3_INCLUDE_DIR AND Z3_LIBRARIES)
+  # We do not have the Z3 binary to query for a version. Try to use
+  # a small C program to detect it via the Z3_get_version() API call.
+  check_z3_version(${Z3_INCLUDE_DIR} ${Z3_LIBRARIES})
+endif()
+
+# If the dynamic check fails, we might be cross compiling: if that's the case,
+# check the version in the headers, otherwise, fail with a message
+if(NOT Z3_VERSION_STRING AND (CMAKE_CROSSCOMPILING AND
+                              Z3_INCLUDE_DIR AND
+                              EXISTS "${Z3_INCLUDE_DIR}/z3_version.h"))
+  # TODO: print message warning that we couldn't find a compatible lib?
+
+  # Z3 4.8.1+ has the version in a public header.
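+  # For reference, z3_version.h in these releases defines the components that
+  # the regexes below extract, e.g.:
+  #   #define Z3_MAJOR_VERSION 4
+  #   #define Z3_MINOR_VERSION 8
+  #   #define Z3_BUILD_NUMBER  1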
+  file(STRINGS "${Z3_INCLUDE_DIR}/z3_version.h"
+       z3_version_str REGEX "^#define[\t ]+Z3_MAJOR_VERSION[\t ]+.*")
+  string(REGEX REPLACE "^.*Z3_MAJOR_VERSION[\t ]+([0-9]).*$" "\\1"
+         Z3_MAJOR "${z3_version_str}")
+
+  file(STRINGS "${Z3_INCLUDE_DIR}/z3_version.h"
+       z3_version_str REGEX "^#define[\t ]+Z3_MINOR_VERSION[\t ]+.*")
+  string(REGEX REPLACE "^.*Z3_MINOR_VERSION[\t ]+([0-9]).*$" "\\1"
+         Z3_MINOR "${z3_version_str}")
+
+  file(STRINGS "${Z3_INCLUDE_DIR}/z3_version.h"
+       z3_version_str REGEX "^#define[\t ]+Z3_BUILD_NUMBER[\t ]+.*")
+  string(REGEX REPLACE "^.*Z3_BUILD_NUMBER[\t ]+([0-9]).*$" "\\1"
+         Z3_BUILD "${z3_version_str}")
+
+  set(Z3_VERSION_STRING ${Z3_MAJOR}.${Z3_MINOR}.${Z3_BUILD})
+  unset(z3_version_str)
+endif()
+
+if(NOT Z3_VERSION_STRING)
+  # Give up: we are unable to obtain a version of the Z3 library. Be
+  # conservative and force the found version to 0.0.0 to make version
+  # checks always fail.
+  set(Z3_VERSION_STRING "0.0.0")
+endif()
+
+# handle the QUIETLY and REQUIRED arguments and set Z3_FOUND to TRUE if
+# all listed variables are TRUE
+include(FindPackageHandleStandardArgs)
+FIND_PACKAGE_HANDLE_STANDARD_ARGS(Z3
+                                  REQUIRED_VARS Z3_LIBRARIES Z3_INCLUDE_DIR
+                                  VERSION_VAR Z3_VERSION_STRING)
+
+mark_as_advanced(Z3_INCLUDE_DIR Z3_LIBRARIES)
diff --git a/cmake/modules/LLVMConfig.cmake.in b/cmake/modules/LLVMConfig.cmake.in
index c3c3af37f620..df97c723efaa 100644
--- a/cmake/modules/LLVMConfig.cmake.in
+++ b/cmake/modules/LLVMConfig.cmake.in
@@ -44,6 +44,8 @@ set(LLVM_ENABLE_ZLIB @LLVM_ENABLE_ZLIB@)
 
 set(LLVM_LIBXML2_ENABLED @LLVM_LIBXML2_ENABLED@)
 
+set(LLVM_WITH_Z3 @LLVM_WITH_Z3@)
+
 set(LLVM_ENABLE_DIA_SDK @LLVM_ENABLE_DIA_SDK@)
 
 set(LLVM_NATIVE_ARCH @LLVM_NATIVE_ARCH@)
diff --git a/docs/AMDGPUUsage.rst b/docs/AMDGPUUsage.rst
index 630e147af8b3..420f050debc8 100644
--- a/docs/AMDGPUUsage.rst
+++ b/docs/AMDGPUUsage.rst
@@ -323,62 +323,80 @@ is conservatively correct for OpenCL.
   .. table:: AMDHSA LLVM Sync Scopes
      :name: amdgpu-amdhsa-llvm-sync-scopes-table
 
-     ================ ==========================================================
-     LLVM Sync Scope  Description
-     ================ ==========================================================
-     *none*           The default: ``system``.
-
-                      Synchronizes with, and participates in modification and
-                      seq_cst total orderings with, other operations (except
-                      image operations) for all address spaces (except private,
-                      or generic that accesses private) provided the other
-                      operation's sync scope is:
-
-                      - ``system``.
-                      - ``agent`` and executed by a thread on the same agent.
-                      - ``workgroup`` and executed by a thread in the same
-                        workgroup.
-                      - ``wavefront`` and executed by a thread in the same
-                        wavefront.
-
-     ``agent``        Synchronizes with, and participates in modification and
-                      seq_cst total orderings with, other operations (except
-                      image operations) for all address spaces (except private,
-                      or generic that accesses private) provided the other
-                      operation's sync scope is:
-
-                      - ``system`` or ``agent`` and executed by a thread on the
-                        same agent.
-                      - ``workgroup`` and executed by a thread in the same
-                        workgroup.
-                      - ``wavefront`` and executed by a thread in the same
-                        wavefront.
-
-     ``workgroup``    Synchronizes with, and participates in modification and
-                      seq_cst total orderings with, other operations (except
-                      image operations) for all address spaces (except private,
-                      or generic that accesses private) provided the other
-                      operation's sync scope is:
-
-                      - ``system``, ``agent`` or ``workgroup`` and executed by a
-                        thread in the same workgroup.
-                      - ``wavefront`` and executed by a thread in the same
-                        wavefront.
-
-     ``wavefront``    Synchronizes with, and participates in modification and
-                      seq_cst total orderings with, other operations (except
-                      image operations) for all address spaces (except private,
-                      or generic that accesses private) provided the other
-                      operation's sync scope is:
-
-                      - ``system``, ``agent``, ``workgroup`` or ``wavefront``
-                        and executed by a thread in the same wavefront.
-
-     ``singlethread`` Only synchronizes with, and participates in modification
-                      and seq_cst total orderings with, other operations (except
-                      image operations) running in the same thread for all
-                      address spaces (for example, in signal handlers).
-     ================ ==========================================================
+     ======================= ===================================================
+     LLVM Sync Scope         Description
+     ======================= ===================================================
+     *none*                  The default: ``system``.
+
+                             Synchronizes with, and participates in modification
+                             and seq_cst total orderings with, other operations
+                             (except image operations) for all address spaces
+                             (except private, or generic that accesses private)
+                             provided the other operation's sync scope is:
+
+                             - ``system``.
+                             - ``agent`` and executed by a thread on the same
+                               agent.
+                             - ``workgroup`` and executed by a thread in the
+                               same workgroup.
+                             - ``wavefront`` and executed by a thread in the
+                               same wavefront.
+
+     ``agent``               Synchronizes with, and participates in modification
+                             and seq_cst total orderings with, other operations
+                             (except image operations) for all address spaces
+                             (except private, or generic that accesses private)
+                             provided the other operation's sync scope is:
+
+                             - ``system`` or ``agent`` and executed by a thread
+                               on the same agent.
+                             - ``workgroup`` and executed by a thread in the
+                               same workgroup.
+                             - ``wavefront`` and executed by a thread in the
+                               same wavefront.
+
+     ``workgroup``           Synchronizes with, and participates in modification
+                             and seq_cst total orderings with, other operations
+                             (except image operations) for all address spaces
+                             (except private, or generic that accesses private)
+                             provided the other operation's sync scope is:
+
+                             - ``system``, ``agent`` or ``workgroup`` and
+                               executed by a thread in the same workgroup.
+                             - ``wavefront`` and executed by a thread in the
+                               same wavefront.
+
+     ``wavefront``           Synchronizes with, and participates in modification
+                             and seq_cst total orderings with, other operations
+                             (except image operations) for all address spaces
+                             (except private, or generic that accesses private)
+                             provided the other operation's sync scope is:
+
+                             - ``system``, ``agent``, ``workgroup`` or
+                               ``wavefront`` and executed by a thread in the
+                               same wavefront.
+
+     ``singlethread``        Only synchronizes with, and participates in
+                             modification and seq_cst total orderings with,
+                             other operations (except image operations) running
+                             in the same thread for all address spaces (for
+                             example, in signal handlers).
+
+     ``one-as``              Same as ``system`` but only synchronizes with other
+                             operations within the same address space.
+
+     ``agent-one-as``        Same as ``agent`` but only synchronizes with other
+                             operations within the same address space.
+
+     ``workgroup-one-as``    Same as ``workgroup`` but only synchronizes with
+                             other operations within the same address space.
+
+     ``wavefront-one-as``    Same as ``wavefront`` but only synchronizes with
+                             other operations within the same address space.
+
+     ``singlethread-one-as`` Same as ``singlethread`` but only synchronizes with
+                             other operations within the same address space.
+     ======================= ===================================================
 
 AMDGPU Intrinsics
 -----------------
diff --git a/include/llvm-c/Core.h b/include/llvm-c/Core.h
index 9c521899c95b..393250f7f8c6 100644
--- a/include/llvm-c/Core.h
+++ b/include/llvm-c/Core.h
@@ -2402,6 +2402,13 @@ LLVMValueRef LLVMGetPersonalityFn(LLVMValueRef Fn);
  */
 void LLVMSetPersonalityFn(LLVMValueRef Fn, LLVMValueRef PersonalityFn);
 
+/**
+ * Obtain the intrinsic ID number which matches the given function name.
+ *
+ * @see llvm::Function::lookupIntrinsicID()
+ */
+unsigned LLVMLookupIntrinsicID(const char *Name, size_t NameLen);
+
 /**
  * Obtain the ID number from a function instance.
  *
diff --git a/include/llvm/Config/config.h.cmake b/include/llvm/Config/config.h.cmake
index e30cb8a8ac23..1a38bc15ab9d 100644
--- a/include/llvm/Config/config.h.cmake
+++ b/include/llvm/Config/config.h.cmake
@@ -344,6 +344,9 @@
 /* Whether GlobalISel rule coverage is being collected */
 #cmakedefine01 LLVM_GISEL_COV_ENABLED
 
+/* Define if we have z3 and want to use it */
+#cmakedefine LLVM_WITH_Z3 ${LLVM_WITH_Z3}
+
 /* Define to the default GlobalISel coverage file prefix */
 #cmakedefine LLVM_GISEL_COV_PREFIX "${LLVM_GISEL_COV_PREFIX}"
 
diff --git a/include/llvm/Support/SMTAPI.h b/include/llvm/Support/SMTAPI.h
new file mode 100644
index 000000000000..418c251d5ac4
--- /dev/null
+++ b/include/llvm/Support/SMTAPI.h
@@ -0,0 +1,405 @@
+//===- SMTAPI.h -------------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a generic SMT solver API, which will be the base class
+// for every SMT solver specific class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_SMTAPI_H
+#define LLVM_SUPPORT_SMTAPI_H
+
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APSInt.h"
+#include "llvm/ADT/FoldingSet.h"
+#include "llvm/Support/raw_ostream.h"
+#include <memory>
+
+namespace llvm {
+
+/// Generic base class for SMT sorts
+class SMTSort {
+public:
+  SMTSort() = default;
+  virtual ~SMTSort() = default;
+
+  /// Returns true if the sort is a bitvector, calls isBitvectorSortImpl().
+  virtual bool isBitvectorSort() const { return isBitvectorSortImpl(); }
+
+  /// Returns true if the sort is a floating-point, calls isFloatSortImpl().
+  virtual bool isFloatSort() const { return isFloatSortImpl(); }
+
+  /// Returns true if the sort is a boolean, calls isBooleanSortImpl().
+  virtual bool isBooleanSort() const { return isBooleanSortImpl(); }
+
+  /// Returns the bitvector size, fails if the sort is not a bitvector
+  /// Calls getBitvectorSortSizeImpl().
+  virtual unsigned getBitvectorSortSize() const {
+    assert(isBitvectorSort() && "Not a bitvector sort!");
+    unsigned Size = getBitvectorSortSizeImpl();
+    assert(Size && "Size is zero!");
+    return Size;
+  };
+
+  /// Returns the floating-point size, fails if the sort is not a floating-point
+  /// Calls getFloatSortSizeImpl().
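+  /// (For example, a sort created by getFloat32Sort() reports size 32.)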
+  virtual unsigned getFloatSortSize() const {
+    assert(isFloatSort() && "Not a floating-point sort!");
+    unsigned Size = getFloatSortSizeImpl();
+    assert(Size && "Size is zero!");
+    return Size;
+  };
+
+  virtual void Profile(llvm::FoldingSetNodeID &ID) const = 0;
+
+  bool operator<(const SMTSort &Other) const {
+    llvm::FoldingSetNodeID ID1, ID2;
+    Profile(ID1);
+    Other.Profile(ID2);
+    return ID1 < ID2;
+  }
+
+  friend bool operator==(SMTSort const &LHS, SMTSort const &RHS) {
+    return LHS.equal_to(RHS);
+  }
+
+  virtual void print(raw_ostream &OS) const = 0;
+
+  LLVM_DUMP_METHOD void dump() const { print(llvm::errs()); }
+
+protected:
+  /// Query the SMT solver and returns true if two sorts are equal (same kind
+  /// and bit width). This does not check if the two sorts are the same objects.
+  virtual bool equal_to(SMTSort const &other) const = 0;
+
+  /// Query the SMT solver and checks if a sort is bitvector.
+  virtual bool isBitvectorSortImpl() const = 0;
+
+  /// Query the SMT solver and checks if a sort is floating-point.
+  virtual bool isFloatSortImpl() const = 0;
+
+  /// Query the SMT solver and checks if a sort is boolean.
+  virtual bool isBooleanSortImpl() const = 0;
+
+  /// Query the SMT solver and returns the sort bit width.
+  virtual unsigned getBitvectorSortSizeImpl() const = 0;
+
+  /// Query the SMT solver and returns the sort bit width.
+  virtual unsigned getFloatSortSizeImpl() const = 0;
+};
+
+/// Shared pointer for SMTSorts, used by SMTSolver API.
+using SMTSortRef = const SMTSort *;
+
+/// Generic base class for SMT exprs
+class SMTExpr {
+public:
+  SMTExpr() = default;
+  virtual ~SMTExpr() = default;
+
+  bool operator<(const SMTExpr &Other) const {
+    llvm::FoldingSetNodeID ID1, ID2;
+    Profile(ID1);
+    Other.Profile(ID2);
+    return ID1 < ID2;
+  }
+
+  virtual void Profile(llvm::FoldingSetNodeID &ID) const = 0;
+
+  friend bool operator==(SMTExpr const &LHS, SMTExpr const &RHS) {
+    return LHS.equal_to(RHS);
+  }
+
+  virtual void print(raw_ostream &OS) const = 0;
+
+  LLVM_DUMP_METHOD void dump() const { print(llvm::errs()); }
+
+protected:
+  /// Query the SMT solver and returns true if two exprs are equal (same kind
+  /// and bit width). This does not check if the two exprs are the same objects.
+  virtual bool equal_to(SMTExpr const &other) const = 0;
+};
+
+/// Shared pointer for SMTExprs, used by SMTSolver API.
+using SMTExprRef = const SMTExpr *;
+
+/// Generic base class for SMT Solvers
+///
+/// This class is responsible for wrapping all sorts and expression generation,
+/// through the mk* methods. It also provides methods to create SMT expressions
+/// straight from clang's AST, through the from* methods.
+class SMTSolver {
+public:
+  SMTSolver() = default;
+  virtual ~SMTSolver() = default;
+
+  LLVM_DUMP_METHOD void dump() const { print(llvm::errs()); }
+
+  // Returns an appropriate floating-point sort for the given bitwidth.
+  SMTSortRef getFloatSort(unsigned BitWidth) {
+    switch (BitWidth) {
+    case 16:
+      return getFloat16Sort();
+    case 32:
+      return getFloat32Sort();
+    case 64:
+      return getFloat64Sort();
+    case 128:
+      return getFloat128Sort();
+    default:;
+    }
+    llvm_unreachable("Unsupported floating-point bitwidth!");
+  }
+
+  // Returns a boolean sort.
+  virtual SMTSortRef getBoolSort() = 0;
+
+  // Returns an appropriate bitvector sort for the given bitwidth.
+  virtual SMTSortRef getBitvectorSort(const unsigned BitWidth) = 0;
+
+  // Returns a floating-point sort of width 16
+  virtual SMTSortRef getFloat16Sort() = 0;
+
+  // Returns a floating-point sort of width 32
+  virtual SMTSortRef getFloat32Sort() = 0;
+
+  // Returns a floating-point sort of width 64
+  virtual SMTSortRef getFloat64Sort() = 0;
+
+  // Returns a floating-point sort of width 128
+  virtual SMTSortRef getFloat128Sort() = 0;
+
+  // Returns an appropriate sort for the given AST.
+  virtual SMTSortRef getSort(const SMTExprRef &AST) = 0;
+
+  /// Given a constraint, adds it to the solver
+  virtual void addConstraint(const SMTExprRef &Exp) const = 0;
+
+  /// Creates a bitvector addition operation
+  virtual SMTExprRef mkBVAdd(const SMTExprRef &LHS, const SMTExprRef &RHS) = 0;
+
+  /// Creates a bitvector subtraction operation
+  virtual SMTExprRef mkBVSub(const SMTExprRef &LHS, const SMTExprRef &RHS) = 0;
+
+  /// Creates a bitvector multiplication operation
+  virtual SMTExprRef mkBVMul(const SMTExprRef &LHS, const SMTExprRef &RHS) = 0;
+
+  /// Creates a bitvector signed modulus operation
+  virtual SMTExprRef mkBVSRem(const SMTExprRef &LHS, const SMTExprRef &RHS) = 0;
+
+  /// Creates a bitvector unsigned modulus operation
+  virtual SMTExprRef mkBVURem(const SMTExprRef &LHS, const SMTExprRef &RHS) = 0;
+
+  /// Creates a bitvector signed division operation
+  virtual SMTExprRef mkBVSDiv(const SMTExprRef &LHS, const SMTExprRef &RHS) = 0;
+
+  /// Creates a bitvector unsigned division operation
+  virtual SMTExprRef mkBVUDiv(const SMTExprRef &LHS, const SMTExprRef &RHS) = 0;
+
+  /// Creates a bitvector logical shift left operation
+  virtual SMTExprRef mkBVShl(const SMTExprRef &LHS, const SMTExprRef &RHS) = 0;
+
+  /// Creates a bitvector arithmetic shift right operation
+  virtual SMTExprRef mkBVAshr(const SMTExprRef &LHS, const SMTExprRef &RHS) = 0;
+
+  /// Creates a bitvector logical shift right operation
+  virtual SMTExprRef mkBVLshr(const SMTExprRef &LHS, const SMTExprRef &RHS) = 0;
+
+  /// Creates a bitvector negation operation
+  virtual SMTExprRef mkBVNeg(const SMTExprRef &Exp) = 0;
+
+  /// Creates a bitvector not operation
+  virtual SMTExprRef mkBVNot(const SMTExprRef &Exp) = 0;
+
+  /// Creates a bitvector xor operation
+  virtual SMTExprRef mkBVXor(const SMTExprRef &LHS, const SMTExprRef &RHS) = 0;
+
+  /// Creates a bitvector or operation
+  virtual SMTExprRef mkBVOr(const SMTExprRef &LHS, const SMTExprRef &RHS) = 0;
+
+  /// Creates a bitvector and operation
+  virtual SMTExprRef mkBVAnd(const SMTExprRef &LHS, const SMTExprRef &RHS) = 0;
+
+  /// Creates a bitvector unsigned less-than operation
+  virtual SMTExprRef mkBVUlt(const SMTExprRef &LHS, const SMTExprRef &RHS) = 0;
+
+  /// Creates a bitvector signed less-than operation
+  virtual SMTExprRef mkBVSlt(const SMTExprRef &LHS, const SMTExprRef &RHS) = 0;
+
+  /// Creates a bitvector unsigned greater-than operation
+  virtual SMTExprRef mkBVUgt(const SMTExprRef &LHS, const SMTExprRef &RHS) = 0;
+
+  /// Creates a bitvector signed greater-than operation
+  virtual SMTExprRef mkBVSgt(const SMTExprRef &LHS, const SMTExprRef &RHS) = 0;
+
+  /// Creates a bitvector unsigned less-equal-than operation
+  virtual SMTExprRef mkBVUle(const SMTExprRef &LHS, const SMTExprRef &RHS) = 0;
+
+  /// Creates a bitvector signed less-equal-than operation
+  virtual SMTExprRef mkBVSle(const SMTExprRef &LHS, const SMTExprRef &RHS) = 0;
+
+  /// Creates a bitvector unsigned greater-equal-than operation
+  virtual SMTExprRef mkBVUge(const SMTExprRef &LHS, const SMTExprRef &RHS) = 0;
+
+  /// Creates a bitvector signed greater-equal-than operation
+  virtual SMTExprRef mkBVSge(const SMTExprRef &LHS, const SMTExprRef &RHS) = 0;
+
+  /// Creates a boolean not operation
+  virtual SMTExprRef mkNot(const SMTExprRef &Exp) = 0;
+
+  /// Creates a boolean equality operation
+  virtual SMTExprRef mkEqual(const SMTExprRef &LHS, const SMTExprRef &RHS) = 0;
+
+  /// Creates a boolean and operation
+  virtual SMTExprRef mkAnd(const SMTExprRef &LHS, const SMTExprRef &RHS) = 0;
+
+  /// Creates a boolean or operation
+  virtual SMTExprRef mkOr(const SMTExprRef &LHS, const SMTExprRef &RHS) = 0;
+
+  /// Creates a boolean ite operation
+  virtual SMTExprRef mkIte(const SMTExprRef &Cond, const SMTExprRef &T,
+                           const SMTExprRef &F) = 0;
+
+  /// Creates a bitvector sign extension operation
+  virtual SMTExprRef mkBVSignExt(unsigned i, const SMTExprRef &Exp) = 0;
+
+  /// Creates a bitvector zero extension operation
+  virtual SMTExprRef mkBVZeroExt(unsigned i, const SMTExprRef &Exp) = 0;
+
+  /// Creates a bitvector extract operation
+  virtual SMTExprRef mkBVExtract(unsigned High, unsigned Low,
+                                 const SMTExprRef &Exp) = 0;
+
+  /// Creates a bitvector concat operation
+  virtual SMTExprRef mkBVConcat(const SMTExprRef &LHS,
+                                const SMTExprRef &RHS) = 0;
+
+  /// Creates a floating-point negation operation
+  virtual SMTExprRef mkFPNeg(const SMTExprRef &Exp) = 0;
+
+  /// Creates a floating-point isInfinite operation
+  virtual SMTExprRef mkFPIsInfinite(const SMTExprRef &Exp) = 0;
+
+  /// Creates a floating-point isNaN operation
+  virtual SMTExprRef mkFPIsNaN(const SMTExprRef &Exp) = 0;
+
+  /// Creates a floating-point isNormal operation
+  virtual SMTExprRef mkFPIsNormal(const SMTExprRef &Exp) = 0;
+
+  /// Creates a floating-point isZero operation
+  virtual SMTExprRef mkFPIsZero(const SMTExprRef &Exp) = 0;
+
+  /// Creates a floating-point multiplication operation
+  virtual SMTExprRef mkFPMul(const SMTExprRef &LHS, const SMTExprRef &RHS) = 0;
+
+  /// Creates a floating-point division operation
+  virtual SMTExprRef mkFPDiv(const SMTExprRef &LHS, const SMTExprRef &RHS) = 0;
+
+  /// Creates a floating-point remainder operation
+  virtual SMTExprRef mkFPRem(const SMTExprRef &LHS, const SMTExprRef &RHS) = 0;
+
+  /// Creates a floating-point addition operation
+  virtual SMTExprRef mkFPAdd(const SMTExprRef &LHS, const SMTExprRef &RHS) = 0;
+
+  /// Creates a floating-point subtraction operation
+  virtual SMTExprRef mkFPSub(const SMTExprRef &LHS, const SMTExprRef &RHS) = 0;
+
+  /// Creates a floating-point less-than operation
+  virtual SMTExprRef mkFPLt(const SMTExprRef &LHS, const SMTExprRef &RHS) = 0;
+
+  /// Creates a floating-point greater-than operation
+  virtual SMTExprRef mkFPGt(const SMTExprRef &LHS, const SMTExprRef &RHS) = 0;
+
+  /// Creates a floating-point less-than-or-equal operation
+  virtual SMTExprRef mkFPLe(const SMTExprRef &LHS, const SMTExprRef &RHS) = 0;
+
+  /// Creates a floating-point greater-than-or-equal operation
+  virtual SMTExprRef mkFPGe(const SMTExprRef &LHS, const SMTExprRef &RHS) = 0;
+
+  /// Creates a floating-point equality operation
+  virtual SMTExprRef mkFPEqual(const SMTExprRef &LHS,
+                               const SMTExprRef &RHS) = 0;
+
+  /// Creates a floating-point conversion from floating-point to floating-point
+  /// operation
+  virtual SMTExprRef mkFPtoFP(const SMTExprRef &From, const SMTSortRef &To) = 0;
+
+  /// Creates a floating-point conversion from signed bitvector to
+  /// floating-point operation
+  virtual SMTExprRef mkSBVtoFP(const SMTExprRef &From,
+                               const SMTSortRef &To) = 0;
+
+  /// Creates a floating-point conversion from unsigned bitvector to
+  /// floating-point operation
+  virtual SMTExprRef mkUBVtoFP(const SMTExprRef &From,
+                               const SMTSortRef &To) = 0;
+
+  /// Creates a floating-point conversion from floating-point to signed
+  /// bitvector operation
+  virtual SMTExprRef mkFPtoSBV(const SMTExprRef &From, unsigned ToWidth) = 0;
+
+  /// Creates a floating-point conversion from floating-point to unsigned
+  /// bitvector operation
+  virtual SMTExprRef mkFPtoUBV(const SMTExprRef &From, unsigned ToWidth) = 0;
+
+  /// Creates a new symbol, given a name and a sort
+  virtual SMTExprRef mkSymbol(const char *Name, SMTSortRef Sort) = 0;
+
+  // Returns an appropriate floating-point rounding mode.
+  virtual SMTExprRef getFloatRoundingMode() = 0;
+
+  // If a model is available, returns the value of a given bitvector symbol
+  virtual llvm::APSInt getBitvector(const SMTExprRef &Exp, unsigned BitWidth,
+                                    bool isUnsigned) = 0;
+
+  // If a model is available, returns the value of a given boolean symbol
+  virtual bool getBoolean(const SMTExprRef &Exp) = 0;
+
+  /// Constructs an SMTExprRef from a boolean.
+  virtual SMTExprRef mkBoolean(const bool b) = 0;
+
+  /// Constructs an SMTExprRef from a finite APFloat.
+  virtual SMTExprRef mkFloat(const llvm::APFloat Float) = 0;
+
+  /// Constructs an SMTExprRef from an APSInt and its bit width
+  virtual SMTExprRef mkBitvector(const llvm::APSInt Int, unsigned BitWidth) = 0;
+
+  /// Given an expression, extract the value of this operand in the model.
+  virtual bool getInterpretation(const SMTExprRef &Exp, llvm::APSInt &Int) = 0;
+
+  /// Given an expression, extract the value of this operand in the model.
+  virtual bool getInterpretation(const SMTExprRef &Exp,
+                                 llvm::APFloat &Float) = 0;
+
+  /// Check if the constraints are satisfiable
+  virtual Optional<bool> check() const = 0;
+
+  /// Push the current solver state
+  virtual void push() = 0;
+
+  /// Pop the previous solver state
+  virtual void pop(unsigned NumStates = 1) = 0;
+
+  /// Reset the solver and remove all constraints.
+  virtual void reset() = 0;
+
+  /// Checks if the solver supports floating-points.
+  virtual bool isFPSupported() = 0;
+
+  virtual void print(raw_ostream &OS) const = 0;
+};
+
+/// Shared pointer for SMTSolvers.
+using SMTSolverRef = std::shared_ptr<SMTSolver>;
+
+/// Convenience method to create a Z3Solver object
+SMTSolverRef CreateZ3Solver();
+
+} // namespace llvm
+
+#endif
diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp
index 0768d1175d7f..9d642ba245c9 100644
--- a/lib/CodeGen/CodeGenPrepare.cpp
+++ b/lib/CodeGen/CodeGenPrepare.cpp
@@ -297,6 +297,10 @@ class TypePromotionTransaction;
     /// DataLayout for the Function being processed.
     const DataLayout *DL = nullptr;
 
+    /// Building the dominator tree can be expensive, so we only build it
+    /// lazily and update it when required.
+    std::unique_ptr<DominatorTree> DT;
+
   public:
     static char ID; // Pass identification, replacement for typeid
 
@@ -335,6 +339,13 @@ class TypePromotionTransaction;
       }
     }
 
+    // Get the DominatorTree, building if necessary.
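+    // The tree is discarded by DT.reset() at the top of each outer iteration
+    // of runOnFunction, so after a CFG-changing transform restarts the scan a
+    // fresh tree is built on the next dominance query.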
+    DominatorTree &getDT(Function &F) {
+      if (!DT)
+        DT = llvm::make_unique<DominatorTree>(F);
+      return *DT;
+    }
+
     bool eliminateFallThrough(Function &F);
     bool eliminateMostlyEmptyBlocks(Function &F);
     BasicBlock *findDestBlockOfMergeableEmptyBlock(BasicBlock *BB);
@@ -342,8 +353,8 @@ class TypePromotionTransaction;
     void eliminateMostlyEmptyBlock(BasicBlock *BB);
     bool isMergingEmptyBlockProfitable(BasicBlock *BB, BasicBlock *DestBB,
                                        bool isPreheader);
-    bool optimizeBlock(BasicBlock &BB, DominatorTree &DT, bool &ModifiedDT);
-    bool optimizeInst(Instruction *I, DominatorTree &DT, bool &ModifiedDT);
+    bool optimizeBlock(BasicBlock &BB, bool &ModifiedDT);
+    bool optimizeInst(Instruction *I, bool &ModifiedDT);
     bool optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
                             Type *AccessTy, unsigned AddrSpace);
     bool optimizeInlineAsmInst(CallInst *CS);
@@ -363,7 +374,7 @@ class TypePromotionTransaction;
         const SmallVectorImpl<Instruction *> &Exts,
         SmallVectorImpl<Instruction *> &ProfitablyMovedExts,
         unsigned CreatedInstsCost = 0);
-    bool mergeSExts(Function &F, DominatorTree &DT);
+    bool mergeSExts(Function &F);
     bool splitLargeGEPOffsets();
     bool performAddressTypePromotion(
         Instruction *&Inst,
@@ -375,12 +386,10 @@ class TypePromotionTransaction;
     bool tryToSinkFreeOperands(Instruction *I);
     bool replaceMathCmpWithIntrinsic(BinaryOperator *BO, CmpInst *Cmp,
-                                     Intrinsic::ID IID, DominatorTree &DT);
-    bool optimizeCmp(CmpInst *Cmp, DominatorTree &DT, bool &ModifiedDT);
-    bool combineToUSubWithOverflow(CmpInst *Cmp, DominatorTree &DT,
-                                   bool &ModifiedDT);
-    bool combineToUAddWithOverflow(CmpInst *Cmp, DominatorTree &DT,
-                                   bool &ModifiedDT);
+                                     Intrinsic::ID IID);
+    bool optimizeCmp(CmpInst *Cmp, bool &ModifiedDT);
+    bool combineToUSubWithOverflow(CmpInst *Cmp, bool &ModifiedDT);
+    bool combineToUAddWithOverflow(CmpInst *Cmp, bool &ModifiedDT);
   };
 
 } // end anonymous namespace
@@ -459,18 +468,18 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
   bool MadeChange = true;
   while (MadeChange) {
     MadeChange = false;
-    DominatorTree DT(F);
+    DT.reset();
     for (Function::iterator I = F.begin(); I != F.end(); ) {
       BasicBlock *BB = &*I++;
       bool ModifiedDTOnIteration = false;
-      MadeChange |= optimizeBlock(*BB, DT, ModifiedDTOnIteration);
+      MadeChange |= optimizeBlock(*BB, ModifiedDTOnIteration);
 
       // Restart BB iteration if the dominator tree of the Function was changed
       if (ModifiedDTOnIteration)
         break;
     }
     if (EnableTypePromotionMerge && !ValToSExtendedUses.empty())
-      MadeChange |= mergeSExts(F, DT);
+      MadeChange |= mergeSExts(F);
     if (!LargeOffsetGEPMap.empty())
       MadeChange |= splitLargeGEPOffsets();
 
@@ -1166,8 +1175,7 @@ static bool OptimizeNoopCopyExpression(CastInst *CI, const TargetLowering &TLI,
 
 bool CodeGenPrepare::replaceMathCmpWithIntrinsic(BinaryOperator *BO,
                                                  CmpInst *Cmp,
-                                                 Intrinsic::ID IID,
-                                                 DominatorTree &DT) {
+                                                 Intrinsic::ID IID) {
   // We allow matching the canonical IR (add X, C) back to (usubo X, -C).
   Value *Arg0 = BO->getOperand(0);
   Value *Arg1 = BO->getOperand(1);
@@ -1186,8 +1194,8 @@ bool CodeGenPrepare::replaceMathCmpWithIntrinsic(BinaryOperator *BO,
   } else {
     // The math and compare may be independent instructions. Check dominance to
     // determine the insertion point for the intrinsic.
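+    // When neither instruction dominates the other, no single insertion point
+    // is guaranteed to execute after both operands are available, so we
+    // conservatively bail out below.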
-    bool MathDominates = DT.dominates(BO, Cmp);
-    if (!MathDominates && !DT.dominates(Cmp, BO))
+    bool MathDominates = getDT(*BO->getFunction()).dominates(BO, Cmp);
+    if (!MathDominates && !getDT(*BO->getFunction()).dominates(Cmp, BO))
       return false;
 
     BasicBlock *MathBB = BO->getParent(), *CmpBB = Cmp->getParent();
@@ -1251,7 +1259,7 @@ static bool matchUAddWithOverflowConstantEdgeCases(CmpInst *Cmp,
 
 /// Try to combine the compare into a call to the llvm.uadd.with.overflow
 /// intrinsic. Return true if any changes were made.
-bool CodeGenPrepare::combineToUAddWithOverflow(CmpInst *Cmp, DominatorTree &DT,
+bool CodeGenPrepare::combineToUAddWithOverflow(CmpInst *Cmp,
                                                bool &ModifiedDT) {
   Value *A, *B;
   BinaryOperator *Add;
@@ -1269,7 +1277,7 @@ bool CodeGenPrepare::combineToUAddWithOverflow(CmpInst *Cmp, DominatorTree &DT,
   if (Add->getParent() != Cmp->getParent() && !Add->hasOneUse())
     return false;
 
-  if (!replaceMathCmpWithIntrinsic(Add, Cmp, Intrinsic::uadd_with_overflow, DT))
+  if (!replaceMathCmpWithIntrinsic(Add, Cmp, Intrinsic::uadd_with_overflow))
     return false;
 
   // Reset callers - do not crash by iterating over a dead instruction.
@@ -1277,7 +1285,7 @@ bool CodeGenPrepare::combineToUAddWithOverflow(CmpInst *Cmp, DominatorTree &DT,
   return true;
 }
 
-bool CodeGenPrepare::combineToUSubWithOverflow(CmpInst *Cmp, DominatorTree &DT,
+bool CodeGenPrepare::combineToUSubWithOverflow(CmpInst *Cmp,
                                                bool &ModifiedDT) {
   // We are not expecting non-canonical/degenerate code. Just bail out.
   Value *A = Cmp->getOperand(0), *B = Cmp->getOperand(1);
@@ -1330,7 +1338,7 @@ bool CodeGenPrepare::combineToUSubWithOverflow(CmpInst *Cmp, DominatorTree &DT,
                                TLI->getValueType(*DL, Sub->getType())))
     return false;
 
-  if (!replaceMathCmpWithIntrinsic(Sub, Cmp, Intrinsic::usub_with_overflow, DT))
+  if (!replaceMathCmpWithIntrinsic(Sub, Cmp, Intrinsic::usub_with_overflow))
     return false;
 
   // Reset callers - do not crash by iterating over a dead instruction.
@@ -1404,15 +1412,14 @@ static bool sinkCmpExpression(CmpInst *Cmp, const TargetLowering &TLI) {
   return MadeChange;
 }
 
-bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, DominatorTree &DT,
-                                 bool &ModifiedDT) {
+bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, bool &ModifiedDT) {
   if (sinkCmpExpression(Cmp, *TLI))
     return true;
 
-  if (combineToUAddWithOverflow(Cmp, DT, ModifiedDT))
+  if (combineToUAddWithOverflow(Cmp, ModifiedDT))
     return true;
 
-  if (combineToUSubWithOverflow(Cmp, DT, ModifiedDT))
+  if (combineToUSubWithOverflow(Cmp, ModifiedDT))
     return true;
 
   return false;
@@ -5223,7 +5230,7 @@ bool CodeGenPrepare::tryToPromoteExts(
 }
 
 /// Merging redundant sexts when one is dominating the other.
-bool CodeGenPrepare::mergeSExts(Function &F, DominatorTree &DT) {
+bool CodeGenPrepare::mergeSExts(Function &F) {
   bool Changed = false;
   for (auto &Entry : ValToSExtendedUses) {
     SExts &Insts = Entry.second;
@@ -5234,7 +5241,7 @@ bool CodeGenPrepare::mergeSExts(Function &F, DominatorTree &DT) {
       continue;
     bool inserted = false;
     for (auto &Pt : CurPts) {
-      if (DT.dominates(Inst, Pt)) {
+      if (getDT(F).dominates(Inst, Pt)) {
         Pt->replaceAllUsesWith(Inst);
         RemovedInsts.insert(Pt);
         Pt->removeFromParent();
@@ -5243,7 +5250,7 @@ bool CodeGenPrepare::mergeSExts(Function &F, DominatorTree &DT) {
         Changed = true;
         break;
       }
-      if (!DT.dominates(Pt, Inst))
+      if (!getDT(F).dominates(Pt, Inst))
         // Give up if we need to merge in a common dominator as the
        // experiments show it is not profitable.
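+        // (Merging at a common dominator would mean hoisting the sext into a
+        // block that dominates both candidate points.)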
        continue;
@@ -6880,8 +6887,7 @@ static bool tryUnmergingGEPsAcrossIndirectBr(GetElementPtrInst *GEPI,
   return true;
 }
 
-bool CodeGenPrepare::optimizeInst(Instruction *I, DominatorTree &DT,
-                                  bool &ModifiedDT) {
+bool CodeGenPrepare::optimizeInst(Instruction *I, bool &ModifiedDT) {
   // Bail out if we inserted the instruction to prevent optimizations from
   // stepping on each other's toes.
   if (InsertedInsts.count(I))
@@ -6932,7 +6938,7 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, DominatorTree &DT,
   }
 
   if (auto *Cmp = dyn_cast<CmpInst>(I))
-    if (TLI && optimizeCmp(Cmp, DT, ModifiedDT))
+    if (TLI && optimizeCmp(Cmp, ModifiedDT))
       return true;
 
   if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
@@ -6994,7 +7000,7 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, DominatorTree &DT,
       GEPI->replaceAllUsesWith(NC);
       GEPI->eraseFromParent();
       ++NumGEPsElim;
-      optimizeInst(NC, DT, ModifiedDT);
+      optimizeInst(NC, ModifiedDT);
       return true;
     }
     if (tryUnmergingGEPsAcrossIndirectBr(GEPI, TTI)) {
@@ -7043,14 +7049,13 @@ static bool makeBitReverse(Instruction &I, const DataLayout &DL,
 // In this pass we look for GEP and cast instructions that are used
 // across basic blocks and rewrite them to improve basic-block-at-a-time
 // selection.
-bool CodeGenPrepare::optimizeBlock(BasicBlock &BB, DominatorTree &DT,
-                                   bool &ModifiedDT) {
+bool CodeGenPrepare::optimizeBlock(BasicBlock &BB, bool &ModifiedDT) {
   SunkAddrs.clear();
   bool MadeChange = false;
 
   CurInstIterator = BB.begin();
   while (CurInstIterator != BB.end()) {
-    MadeChange |= optimizeInst(&*CurInstIterator++, DT, ModifiedDT);
+    MadeChange |= optimizeInst(&*CurInstIterator++, ModifiedDT);
     if (ModifiedDT)
       return true;
   }
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 6e807556d373..5dfc759b4c0f 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -1974,9 +1974,25 @@ SDValue SelectionDAG::FoldSetCC(EVT VT, SDValue N1, SDValue N2,
     break;
   }
 
-  // We can always fold X == X for integer setcc's.
-  if (N1 == N2 && OpVT.isInteger())
-    return getBoolConstant(ISD::isTrueWhenEqual(Cond), dl, VT, OpVT);
+  if (OpVT.isInteger()) {
+    // For EQ and NE, we can always pick a value for the undef to make the
+    // predicate pass or fail, so we can return undef.
+    // Matches behavior in llvm::ConstantFoldCompareInstruction.
+    // icmp eq/ne X, undef -> undef.
+    if ((N1.isUndef() || N2.isUndef()) &&
+        (Cond == ISD::SETEQ || Cond == ISD::SETNE))
+      return getUNDEF(VT);
+
+    // If both operands are undef, we can return undef for int comparison.
+    // icmp undef, undef -> undef.
+    if (N1.isUndef() && N2.isUndef())
+      return getUNDEF(VT);
+
+    // icmp X, X -> true/false
+    // icmp X, undef -> true/false because undef could be X.
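+    // For example, (setcc X, X, setult) folds to false while
+    // (setcc X, X, setule) folds to true; with one undef operand the fold
+    // is still sound because undef may be chosen to equal X.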
+    if (N1 == N2)
+      return getBoolConstant(ISD::isTrueWhenEqual(Cond), dl, VT, OpVT);
+  }
 
   if (ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N2)) {
     const APInt &C2 = N2C->getAPIntValue();
diff --git a/lib/IR/Core.cpp b/lib/IR/Core.cpp
index 1b8ad4823f92..aa6bc542b4e9 100644
--- a/lib/IR/Core.cpp
+++ b/lib/IR/Core.cpp
@@ -2329,6 +2329,10 @@ const char *LLVMIntrinsicCopyOverloadedName(unsigned ID,
   return strdup(Str.c_str());
 }
 
+unsigned LLVMLookupIntrinsicID(const char *Name, size_t NameLen) {
+  return Function::lookupIntrinsicID({Name, NameLen});
+}
+
 LLVMBool LLVMIntrinsicIsOverloaded(unsigned ID) {
   auto IID = llvm_map_to_intrinsic_id(ID);
   return llvm::Intrinsic::isOverloaded(IID);
diff --git a/lib/Support/CMakeLists.txt b/lib/Support/CMakeLists.txt
index 234db82a0c2a..b5246b27ca1d 100644
--- a/lib/Support/CMakeLists.txt
+++ b/lib/Support/CMakeLists.txt
@@ -44,6 +44,13 @@ if (MSVC)
   set (delayload_flags delayimp -delayload:shell32.dll -delayload:ole32.dll)
 endif()
 
+# Link Z3 if the user wants to use it.
+if(LLVM_WITH_Z3)
+  set(Z3_LINK_FILES ${Z3_LIBRARIES})
+else()
+  set(Z3_LINK_FILES "")
+endif()
+
 add_llvm_library(LLVMSupport
   AArch64TargetParser.cpp
   ARMTargetParser.cpp
@@ -152,6 +159,7 @@ add_llvm_library(LLVMSupport
   regfree.c
   regstrlcpy.c
   xxhash.cpp
+  Z3Solver.cpp
 
 # System
   Atomic.cpp
@@ -177,7 +185,14 @@ add_llvm_library(LLVMSupport
   ${LLVM_MAIN_INCLUDE_DIR}/llvm/ADT
   ${LLVM_MAIN_INCLUDE_DIR}/llvm/Support
   ${Backtrace_INCLUDE_DIRS}
-  LINK_LIBS ${system_libs} ${delayload_flags}
+  LINK_LIBS ${system_libs} ${delayload_flags} ${Z3_LINK_FILES}
   )
 
 set_property(TARGET LLVMSupport PROPERTY LLVM_SYSTEM_LIBS "${system_libs}")
+
+if(LLVM_WITH_Z3)
+  target_include_directories(LLVMSupport SYSTEM
+    PRIVATE
+    ${Z3_INCLUDE_DIR}
+    )
+endif()
diff --git a/lib/Support/Z3Solver.cpp b/lib/Support/Z3Solver.cpp
new file mode 100644
index 000000000000..a8374522bda8
--- /dev/null
+++ b/lib/Support/Z3Solver.cpp
@@ -0,0 +1,826 @@
+//== Z3Solver.cpp -----------------------------------------------*- C++ -*--==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/Twine.h"
+#include "llvm/Config/config.h"
+#include "llvm/Support/SMTAPI.h"
+#include <set>
+
+using namespace llvm;
+
+#if LLVM_WITH_Z3
+
+#include <z3.h>
+
+namespace {
+
+/// Configuration class for Z3
+class Z3Config {
+  friend class Z3Context;
+
+  Z3_config Config;
+
+public:
+  Z3Config() : Config(Z3_mk_config()) {
+    // Enable model finding
+    Z3_set_param_value(Config, "model", "true");
+    // Disable proof generation
+    Z3_set_param_value(Config, "proof", "false");
+    // Set timeout to 15000ms = 15s
+    Z3_set_param_value(Config, "timeout", "15000");
+  }
+
+  ~Z3Config() { Z3_del_config(Config); }
+}; // end class Z3Config
+
+// Function used to report errors
+void Z3ErrorHandler(Z3_context Context, Z3_error_code Error) {
+  llvm::report_fatal_error("Z3 error: " +
+                           llvm::Twine(Z3_get_error_msg(Context, Error)));
+}
+
+/// Wrapper for Z3 context
+class Z3Context {
+public:
+  Z3_context Context;
+
+  Z3Context() {
+    Context = Z3_mk_context_rc(Z3Config().Config);
+    // The error function is set here because the context is the first object
+    // created by the backend
+    Z3_set_error_handler(Context, Z3ErrorHandler);
+  }
+
+  virtual ~Z3Context() {
+    Z3_del_context(Context);
+    Context = nullptr;
+  }
+}; // end class Z3Context
+
+/// Wrapper for Z3 Sort
+class Z3Sort : public SMTSort {
+  friend class Z3Solver;
+
+  Z3Context &Context;
+
+  Z3_sort Sort;
+
+public:
+  /// Default constructor, mainly used by make_shared
+  Z3Sort(Z3Context &C, Z3_sort ZS) : Context(C), Sort(ZS) {
+    Z3_inc_ref(Context.Context, reinterpret_cast<Z3_ast>(Sort));
+  }
+
+  /// Override implicit copy constructor for correct reference counting.
+  Z3Sort(const Z3Sort &Other) : Context(Other.Context), Sort(Other.Sort) {
+    Z3_inc_ref(Context.Context, reinterpret_cast<Z3_ast>(Sort));
+  }
+
+  /// Override implicit copy assignment constructor for correct reference
+  /// counting.
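+  /// Note the order: the incoming handle is retained before the current one
+  /// is released, so self-assignment cannot drop the last reference.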
+  Z3Sort &operator=(const Z3Sort &Other) {
+    Z3_inc_ref(Context.Context, reinterpret_cast<Z3_ast>(Other.Sort));
+    Z3_dec_ref(Context.Context, reinterpret_cast<Z3_ast>(Sort));
+    Sort = Other.Sort;
+    return *this;
+  }
+
+  Z3Sort(Z3Sort &&Other) = delete;
+  Z3Sort &operator=(Z3Sort &&Other) = delete;
+
+  ~Z3Sort() {
+    if (Sort)
+      Z3_dec_ref(Context.Context, reinterpret_cast<Z3_ast>(Sort));
+  }
+
+  void Profile(llvm::FoldingSetNodeID &ID) const override {
+    ID.AddInteger(
+        Z3_get_ast_id(Context.Context, reinterpret_cast<Z3_ast>(Sort)));
+  }
+
+  bool isBitvectorSortImpl() const override {
+    return (Z3_get_sort_kind(Context.Context, Sort) == Z3_BV_SORT);
+  }
+
+  bool isFloatSortImpl() const override {
+    return (Z3_get_sort_kind(Context.Context, Sort) == Z3_FLOATING_POINT_SORT);
+  }
+
+  bool isBooleanSortImpl() const override {
+    return (Z3_get_sort_kind(Context.Context, Sort) == Z3_BOOL_SORT);
+  }
+
+  unsigned getBitvectorSortSizeImpl() const override {
+    return Z3_get_bv_sort_size(Context.Context, Sort);
+  }
+
+  unsigned getFloatSortSizeImpl() const override {
+    return Z3_fpa_get_ebits(Context.Context, Sort) +
+           Z3_fpa_get_sbits(Context.Context, Sort);
+  }
+
+  bool equal_to(SMTSort const &Other) const override {
+    return Z3_is_eq_sort(Context.Context, Sort,
+                         static_cast<const Z3Sort &>(Other).Sort);
+  }
+
+  void print(raw_ostream &OS) const override {
+    OS << Z3_sort_to_string(Context.Context, Sort);
+  }
+}; // end class Z3Sort
+
+static const Z3Sort &toZ3Sort(const SMTSort &S) {
+  return static_cast<const Z3Sort &>(S);
+}
+
+class Z3Expr : public SMTExpr {
+  friend class Z3Solver;
+
+  Z3Context &Context;
+
+  Z3_ast AST;
+
+public:
+  Z3Expr(Z3Context &C, Z3_ast ZA) : SMTExpr(), Context(C), AST(ZA) {
+    Z3_inc_ref(Context.Context, AST);
+  }
+
+  /// Override implicit copy constructor for correct reference counting.
+  Z3Expr(const Z3Expr &Copy) : SMTExpr(), Context(Copy.Context), AST(Copy.AST) {
+    Z3_inc_ref(Context.Context, AST);
+  }
+
+  /// Override implicit copy assignment constructor for correct reference
+  /// counting.
+  Z3Expr &operator=(const Z3Expr &Other) {
+    Z3_inc_ref(Context.Context, Other.AST);
+    Z3_dec_ref(Context.Context, AST);
+    AST = Other.AST;
+    return *this;
+  }
+
+  Z3Expr(Z3Expr &&Other) = delete;
+  Z3Expr &operator=(Z3Expr &&Other) = delete;
+
+  ~Z3Expr() {
+    if (AST)
+      Z3_dec_ref(Context.Context, AST);
+  }
+
+  void Profile(llvm::FoldingSetNodeID &ID) const override {
+    ID.AddInteger(Z3_get_ast_id(Context.Context, AST));
+  }
+
+  /// Comparison of AST equality, not model equivalence.
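+  /// That is, this answers "are these the same Z3 AST node?" via
+  /// Z3_is_eq_ast; deciding semantic equivalence would require a solver query.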
+ bool equal_to(SMTExpr const &Other) const override { + assert(Z3_is_eq_sort(Context.Context, Z3_get_sort(Context.Context, AST), + Z3_get_sort(Context.Context, + static_cast(Other).AST)) && + "AST's must have the same sort"); + return Z3_is_eq_ast(Context.Context, AST, + static_cast(Other).AST); + } + + void print(raw_ostream &OS) const override { + OS << Z3_ast_to_string(Context.Context, AST); + } +}; // end class Z3Expr + +static const Z3Expr &toZ3Expr(const SMTExpr &E) { + return static_cast(E); +} + +class Z3Model { + friend class Z3Solver; + + Z3Context &Context; + + Z3_model Model; + +public: + Z3Model(Z3Context &C, Z3_model ZM) : Context(C), Model(ZM) { + Z3_model_inc_ref(Context.Context, Model); + } + + Z3Model(const Z3Model &Other) = delete; + Z3Model(Z3Model &&Other) = delete; + Z3Model &operator=(Z3Model &Other) = delete; + Z3Model &operator=(Z3Model &&Other) = delete; + + ~Z3Model() { + if (Model) + Z3_model_dec_ref(Context.Context, Model); + } + + void print(raw_ostream &OS) const { + OS << Z3_model_to_string(Context.Context, Model); + } + + LLVM_DUMP_METHOD void dump() const { print(llvm::errs()); } +}; // end class Z3Model + +/// Get the corresponding IEEE floating-point type for a given bitwidth. +static const llvm::fltSemantics &getFloatSemantics(unsigned BitWidth) { + switch (BitWidth) { + default: + llvm_unreachable("Unsupported floating-point semantics!"); + break; + case 16: + return llvm::APFloat::IEEEhalf(); + case 32: + return llvm::APFloat::IEEEsingle(); + case 64: + return llvm::APFloat::IEEEdouble(); + case 128: + return llvm::APFloat::IEEEquad(); + } +} + +// Determine whether two float semantics are equivalent +static bool areEquivalent(const llvm::fltSemantics &LHS, + const llvm::fltSemantics &RHS) { + return (llvm::APFloat::semanticsPrecision(LHS) == + llvm::APFloat::semanticsPrecision(RHS)) && + (llvm::APFloat::semanticsMinExponent(LHS) == + llvm::APFloat::semanticsMinExponent(RHS)) && + (llvm::APFloat::semanticsMaxExponent(LHS) == + llvm::APFloat::semanticsMaxExponent(RHS)) && + (llvm::APFloat::semanticsSizeInBits(LHS) == + llvm::APFloat::semanticsSizeInBits(RHS)); +} + +class Z3Solver : public SMTSolver { + friend class Z3ConstraintManager; + + Z3Context Context; + + Z3_solver Solver; + + // Cache Sorts + std::set CachedSorts; + + // Cache Exprs + std::set CachedExprs; + +public: + Z3Solver() : Solver(Z3_mk_simple_solver(Context.Context)) { + Z3_solver_inc_ref(Context.Context, Solver); + } + + Z3Solver(const Z3Solver &Other) = delete; + Z3Solver(Z3Solver &&Other) = delete; + Z3Solver &operator=(Z3Solver &Other) = delete; + Z3Solver &operator=(Z3Solver &&Other) = delete; + + ~Z3Solver() { + if (Solver) + Z3_solver_dec_ref(Context.Context, Solver); + } + + void addConstraint(const SMTExprRef &Exp) const override { + Z3_solver_assert(Context.Context, Solver, toZ3Expr(*Exp).AST); + } + + // Given an SMTSort, adds/retrives it from the cache and returns + // an SMTSortRef to the SMTSort in the cache + SMTSortRef newSortRef(const SMTSort &Sort) { + auto It = CachedSorts.insert(toZ3Sort(Sort)); + return &(*It.first); + } + + // Given an SMTExpr, adds/retrives it from the cache and returns + // an SMTExprRef to the SMTExpr in the cache + SMTExprRef newExprRef(const SMTExpr &Exp) { + auto It = CachedExprs.insert(toZ3Expr(Exp)); + return &(*It.first); + } + + SMTSortRef getBoolSort() override { + return newSortRef(Z3Sort(Context, Z3_mk_bool_sort(Context.Context))); + } + + SMTSortRef getBitvectorSort(unsigned BitWidth) override { + return newSortRef( + 
Z3Sort(Context, Z3_mk_bv_sort(Context.Context, BitWidth))); + } + + SMTSortRef getSort(const SMTExprRef &Exp) override { + return newSortRef( + Z3Sort(Context, Z3_get_sort(Context.Context, toZ3Expr(*Exp).AST))); + } + + SMTSortRef getFloat16Sort() override { + return newSortRef(Z3Sort(Context, Z3_mk_fpa_sort_16(Context.Context))); + } + + SMTSortRef getFloat32Sort() override { + return newSortRef(Z3Sort(Context, Z3_mk_fpa_sort_32(Context.Context))); + } + + SMTSortRef getFloat64Sort() override { + return newSortRef(Z3Sort(Context, Z3_mk_fpa_sort_64(Context.Context))); + } + + SMTSortRef getFloat128Sort() override { + return newSortRef(Z3Sort(Context, Z3_mk_fpa_sort_128(Context.Context))); + } + + SMTExprRef mkBVNeg(const SMTExprRef &Exp) override { + return newExprRef( + Z3Expr(Context, Z3_mk_bvneg(Context.Context, toZ3Expr(*Exp).AST))); + } + + SMTExprRef mkBVNot(const SMTExprRef &Exp) override { + return newExprRef( + Z3Expr(Context, Z3_mk_bvnot(Context.Context, toZ3Expr(*Exp).AST))); + } + + SMTExprRef mkNot(const SMTExprRef &Exp) override { + return newExprRef( + Z3Expr(Context, Z3_mk_not(Context.Context, toZ3Expr(*Exp).AST))); + } + + SMTExprRef mkBVAdd(const SMTExprRef &LHS, const SMTExprRef &RHS) override { + return newExprRef( + Z3Expr(Context, Z3_mk_bvadd(Context.Context, toZ3Expr(*LHS).AST, + toZ3Expr(*RHS).AST))); + } + + SMTExprRef mkBVSub(const SMTExprRef &LHS, const SMTExprRef &RHS) override { + return newExprRef( + Z3Expr(Context, Z3_mk_bvsub(Context.Context, toZ3Expr(*LHS).AST, + toZ3Expr(*RHS).AST))); + } + + SMTExprRef mkBVMul(const SMTExprRef &LHS, const SMTExprRef &RHS) override { + return newExprRef( + Z3Expr(Context, Z3_mk_bvmul(Context.Context, toZ3Expr(*LHS).AST, + toZ3Expr(*RHS).AST))); + } + + SMTExprRef mkBVSRem(const SMTExprRef &LHS, const SMTExprRef &RHS) override { + return newExprRef( + Z3Expr(Context, Z3_mk_bvsrem(Context.Context, toZ3Expr(*LHS).AST, + toZ3Expr(*RHS).AST))); + } + + SMTExprRef mkBVURem(const SMTExprRef &LHS, const SMTExprRef &RHS) override { + return newExprRef( + Z3Expr(Context, Z3_mk_bvurem(Context.Context, toZ3Expr(*LHS).AST, + toZ3Expr(*RHS).AST))); + } + + SMTExprRef mkBVSDiv(const SMTExprRef &LHS, const SMTExprRef &RHS) override { + return newExprRef( + Z3Expr(Context, Z3_mk_bvsdiv(Context.Context, toZ3Expr(*LHS).AST, + toZ3Expr(*RHS).AST))); + } + + SMTExprRef mkBVUDiv(const SMTExprRef &LHS, const SMTExprRef &RHS) override { + return newExprRef( + Z3Expr(Context, Z3_mk_bvudiv(Context.Context, toZ3Expr(*LHS).AST, + toZ3Expr(*RHS).AST))); + } + + SMTExprRef mkBVShl(const SMTExprRef &LHS, const SMTExprRef &RHS) override { + return newExprRef( + Z3Expr(Context, Z3_mk_bvshl(Context.Context, toZ3Expr(*LHS).AST, + toZ3Expr(*RHS).AST))); + } + + SMTExprRef mkBVAshr(const SMTExprRef &LHS, const SMTExprRef &RHS) override { + return newExprRef( + Z3Expr(Context, Z3_mk_bvashr(Context.Context, toZ3Expr(*LHS).AST, + toZ3Expr(*RHS).AST))); + } + + SMTExprRef mkBVLshr(const SMTExprRef &LHS, const SMTExprRef &RHS) override { + return newExprRef( + Z3Expr(Context, Z3_mk_bvlshr(Context.Context, toZ3Expr(*LHS).AST, + toZ3Expr(*RHS).AST))); + } + + SMTExprRef mkBVXor(const SMTExprRef &LHS, const SMTExprRef &RHS) override { + return newExprRef( + Z3Expr(Context, Z3_mk_bvxor(Context.Context, toZ3Expr(*LHS).AST, + toZ3Expr(*RHS).AST))); + } + + SMTExprRef mkBVOr(const SMTExprRef &LHS, const SMTExprRef &RHS) override { + return newExprRef( + Z3Expr(Context, Z3_mk_bvor(Context.Context, toZ3Expr(*LHS).AST, + toZ3Expr(*RHS).AST))); + } + + SMTExprRef 
mkBVAnd(const SMTExprRef &LHS, const SMTExprRef &RHS) override { + return newExprRef( + Z3Expr(Context, Z3_mk_bvand(Context.Context, toZ3Expr(*LHS).AST, + toZ3Expr(*RHS).AST))); + } + + SMTExprRef mkBVUlt(const SMTExprRef &LHS, const SMTExprRef &RHS) override { + return newExprRef( + Z3Expr(Context, Z3_mk_bvult(Context.Context, toZ3Expr(*LHS).AST, + toZ3Expr(*RHS).AST))); + } + + SMTExprRef mkBVSlt(const SMTExprRef &LHS, const SMTExprRef &RHS) override { + return newExprRef( + Z3Expr(Context, Z3_mk_bvslt(Context.Context, toZ3Expr(*LHS).AST, + toZ3Expr(*RHS).AST))); + } + + SMTExprRef mkBVUgt(const SMTExprRef &LHS, const SMTExprRef &RHS) override { + return newExprRef( + Z3Expr(Context, Z3_mk_bvugt(Context.Context, toZ3Expr(*LHS).AST, + toZ3Expr(*RHS).AST))); + } + + SMTExprRef mkBVSgt(const SMTExprRef &LHS, const SMTExprRef &RHS) override { + return newExprRef( + Z3Expr(Context, Z3_mk_bvsgt(Context.Context, toZ3Expr(*LHS).AST, + toZ3Expr(*RHS).AST))); + } + + SMTExprRef mkBVUle(const SMTExprRef &LHS, const SMTExprRef &RHS) override { + return newExprRef( + Z3Expr(Context, Z3_mk_bvule(Context.Context, toZ3Expr(*LHS).AST, + toZ3Expr(*RHS).AST))); + } + + SMTExprRef mkBVSle(const SMTExprRef &LHS, const SMTExprRef &RHS) override { + return newExprRef( + Z3Expr(Context, Z3_mk_bvsle(Context.Context, toZ3Expr(*LHS).AST, + toZ3Expr(*RHS).AST))); + } + + SMTExprRef mkBVUge(const SMTExprRef &LHS, const SMTExprRef &RHS) override { + return newExprRef( + Z3Expr(Context, Z3_mk_bvuge(Context.Context, toZ3Expr(*LHS).AST, + toZ3Expr(*RHS).AST))); + } + + SMTExprRef mkBVSge(const SMTExprRef &LHS, const SMTExprRef &RHS) override { + return newExprRef( + Z3Expr(Context, Z3_mk_bvsge(Context.Context, toZ3Expr(*LHS).AST, + toZ3Expr(*RHS).AST))); + } + + SMTExprRef mkAnd(const SMTExprRef &LHS, const SMTExprRef &RHS) override { + Z3_ast Args[2] = {toZ3Expr(*LHS).AST, toZ3Expr(*RHS).AST}; + return newExprRef(Z3Expr(Context, Z3_mk_and(Context.Context, 2, Args))); + } + + SMTExprRef mkOr(const SMTExprRef &LHS, const SMTExprRef &RHS) override { + Z3_ast Args[2] = {toZ3Expr(*LHS).AST, toZ3Expr(*RHS).AST}; + return newExprRef(Z3Expr(Context, Z3_mk_or(Context.Context, 2, Args))); + } + + SMTExprRef mkEqual(const SMTExprRef &LHS, const SMTExprRef &RHS) override { + return newExprRef( + Z3Expr(Context, Z3_mk_eq(Context.Context, toZ3Expr(*LHS).AST, + toZ3Expr(*RHS).AST))); + } + + SMTExprRef mkFPNeg(const SMTExprRef &Exp) override { + return newExprRef( + Z3Expr(Context, Z3_mk_fpa_neg(Context.Context, toZ3Expr(*Exp).AST))); + } + + SMTExprRef mkFPIsInfinite(const SMTExprRef &Exp) override { + return newExprRef(Z3Expr( + Context, Z3_mk_fpa_is_infinite(Context.Context, toZ3Expr(*Exp).AST))); + } + + SMTExprRef mkFPIsNaN(const SMTExprRef &Exp) override { + return newExprRef( + Z3Expr(Context, Z3_mk_fpa_is_nan(Context.Context, toZ3Expr(*Exp).AST))); + } + + SMTExprRef mkFPIsNormal(const SMTExprRef &Exp) override { + return newExprRef(Z3Expr( + Context, Z3_mk_fpa_is_normal(Context.Context, toZ3Expr(*Exp).AST))); + } + + SMTExprRef mkFPIsZero(const SMTExprRef &Exp) override { + return newExprRef(Z3Expr( + Context, Z3_mk_fpa_is_zero(Context.Context, toZ3Expr(*Exp).AST))); + } + + SMTExprRef mkFPMul(const SMTExprRef &LHS, const SMTExprRef &RHS) override { + SMTExprRef RoundingMode = getFloatRoundingMode(); + return newExprRef( + Z3Expr(Context, + Z3_mk_fpa_mul(Context.Context, toZ3Expr(*LHS).AST, + toZ3Expr(*RHS).AST, toZ3Expr(*RoundingMode).AST))); + } + + SMTExprRef mkFPDiv(const SMTExprRef &LHS, const SMTExprRef &RHS) override 
{ + SMTExprRef RoundingMode = getFloatRoundingMode(); + return newExprRef( + Z3Expr(Context, + Z3_mk_fpa_div(Context.Context, toZ3Expr(*LHS).AST, + toZ3Expr(*RHS).AST, toZ3Expr(*RoundingMode).AST))); + } + + SMTExprRef mkFPRem(const SMTExprRef &LHS, const SMTExprRef &RHS) override { + return newExprRef( + Z3Expr(Context, Z3_mk_fpa_rem(Context.Context, toZ3Expr(*LHS).AST, + toZ3Expr(*RHS).AST))); + } + + SMTExprRef mkFPAdd(const SMTExprRef &LHS, const SMTExprRef &RHS) override { + SMTExprRef RoundingMode = getFloatRoundingMode(); + return newExprRef( + Z3Expr(Context, + Z3_mk_fpa_add(Context.Context, toZ3Expr(*LHS).AST, + toZ3Expr(*RHS).AST, toZ3Expr(*RoundingMode).AST))); + } + + SMTExprRef mkFPSub(const SMTExprRef &LHS, const SMTExprRef &RHS) override { + SMTExprRef RoundingMode = getFloatRoundingMode(); + return newExprRef( + Z3Expr(Context, + Z3_mk_fpa_sub(Context.Context, toZ3Expr(*LHS).AST, + toZ3Expr(*RHS).AST, toZ3Expr(*RoundingMode).AST))); + } + + SMTExprRef mkFPLt(const SMTExprRef &LHS, const SMTExprRef &RHS) override { + return newExprRef( + Z3Expr(Context, Z3_mk_fpa_lt(Context.Context, toZ3Expr(*LHS).AST, + toZ3Expr(*RHS).AST))); + } + + SMTExprRef mkFPGt(const SMTExprRef &LHS, const SMTExprRef &RHS) override { + return newExprRef( + Z3Expr(Context, Z3_mk_fpa_gt(Context.Context, toZ3Expr(*LHS).AST, + toZ3Expr(*RHS).AST))); + } + + SMTExprRef mkFPLe(const SMTExprRef &LHS, const SMTExprRef &RHS) override { + return newExprRef( + Z3Expr(Context, Z3_mk_fpa_leq(Context.Context, toZ3Expr(*LHS).AST, + toZ3Expr(*RHS).AST))); + } + + SMTExprRef mkFPGe(const SMTExprRef &LHS, const SMTExprRef &RHS) override { + return newExprRef( + Z3Expr(Context, Z3_mk_fpa_geq(Context.Context, toZ3Expr(*LHS).AST, + toZ3Expr(*RHS).AST))); + } + + SMTExprRef mkFPEqual(const SMTExprRef &LHS, const SMTExprRef &RHS) override { + return newExprRef( + Z3Expr(Context, Z3_mk_fpa_eq(Context.Context, toZ3Expr(*LHS).AST, + toZ3Expr(*RHS).AST))); + } + + SMTExprRef mkIte(const SMTExprRef &Cond, const SMTExprRef &T, + const SMTExprRef &F) override { + return newExprRef( + Z3Expr(Context, Z3_mk_ite(Context.Context, toZ3Expr(*Cond).AST, + toZ3Expr(*T).AST, toZ3Expr(*F).AST))); + } + + SMTExprRef mkBVSignExt(unsigned i, const SMTExprRef &Exp) override { + return newExprRef(Z3Expr( + Context, Z3_mk_sign_ext(Context.Context, i, toZ3Expr(*Exp).AST))); + } + + SMTExprRef mkBVZeroExt(unsigned i, const SMTExprRef &Exp) override { + return newExprRef(Z3Expr( + Context, Z3_mk_zero_ext(Context.Context, i, toZ3Expr(*Exp).AST))); + } + + SMTExprRef mkBVExtract(unsigned High, unsigned Low, + const SMTExprRef &Exp) override { + return newExprRef(Z3Expr(Context, Z3_mk_extract(Context.Context, High, Low, + toZ3Expr(*Exp).AST))); + } + + SMTExprRef mkBVConcat(const SMTExprRef &LHS, const SMTExprRef &RHS) override { + return newExprRef( + Z3Expr(Context, Z3_mk_concat(Context.Context, toZ3Expr(*LHS).AST, + toZ3Expr(*RHS).AST))); + } + + SMTExprRef mkFPtoFP(const SMTExprRef &From, const SMTSortRef &To) override { + SMTExprRef RoundingMode = getFloatRoundingMode(); + return newExprRef(Z3Expr( + Context, + Z3_mk_fpa_to_fp_float(Context.Context, toZ3Expr(*RoundingMode).AST, + toZ3Expr(*From).AST, toZ3Sort(*To).Sort))); + } + + SMTExprRef mkSBVtoFP(const SMTExprRef &From, const SMTSortRef &To) override { + SMTExprRef RoundingMode = getFloatRoundingMode(); + return newExprRef(Z3Expr( + Context, + Z3_mk_fpa_to_fp_signed(Context.Context, toZ3Expr(*RoundingMode).AST, + toZ3Expr(*From).AST, toZ3Sort(*To).Sort))); + } + + SMTExprRef mkUBVtoFP(const 
SMTExprRef &From, const SMTSortRef &To) override { + SMTExprRef RoundingMode = getFloatRoundingMode(); + return newExprRef(Z3Expr( + Context, + Z3_mk_fpa_to_fp_unsigned(Context.Context, toZ3Expr(*RoundingMode).AST, + toZ3Expr(*From).AST, toZ3Sort(*To).Sort))); + } + + SMTExprRef mkFPtoSBV(const SMTExprRef &From, unsigned ToWidth) override { + SMTExprRef RoundingMode = getFloatRoundingMode(); + return newExprRef(Z3Expr( + Context, Z3_mk_fpa_to_sbv(Context.Context, toZ3Expr(*RoundingMode).AST, + toZ3Expr(*From).AST, ToWidth))); + } + + SMTExprRef mkFPtoUBV(const SMTExprRef &From, unsigned ToWidth) override { + SMTExprRef RoundingMode = getFloatRoundingMode(); + return newExprRef(Z3Expr( + Context, Z3_mk_fpa_to_ubv(Context.Context, toZ3Expr(*RoundingMode).AST, + toZ3Expr(*From).AST, ToWidth))); + } + + SMTExprRef mkBoolean(const bool b) override { + return newExprRef(Z3Expr(Context, b ? Z3_mk_true(Context.Context) + : Z3_mk_false(Context.Context))); + } + + SMTExprRef mkBitvector(const llvm::APSInt Int, unsigned BitWidth) override { + const SMTSortRef Sort = getBitvectorSort(BitWidth); + return newExprRef( + Z3Expr(Context, Z3_mk_numeral(Context.Context, Int.toString(10).c_str(), + toZ3Sort(*Sort).Sort))); + } + + SMTExprRef mkFloat(const llvm::APFloat Float) override { + SMTSortRef Sort = + getFloatSort(llvm::APFloat::semanticsSizeInBits(Float.getSemantics())); + + llvm::APSInt Int = llvm::APSInt(Float.bitcastToAPInt(), false); + SMTExprRef Z3Int = mkBitvector(Int, Int.getBitWidth()); + return newExprRef(Z3Expr( + Context, Z3_mk_fpa_to_fp_bv(Context.Context, toZ3Expr(*Z3Int).AST, + toZ3Sort(*Sort).Sort))); + } + + SMTExprRef mkSymbol(const char *Name, SMTSortRef Sort) override { + return newExprRef( + Z3Expr(Context, Z3_mk_const(Context.Context, + Z3_mk_string_symbol(Context.Context, Name), + toZ3Sort(*Sort).Sort))); + } + + llvm::APSInt getBitvector(const SMTExprRef &Exp, unsigned BitWidth, + bool isUnsigned) override { + return llvm::APSInt( + llvm::APInt(BitWidth, + Z3_get_numeral_string(Context.Context, toZ3Expr(*Exp).AST), + 10), + isUnsigned); + } + + bool getBoolean(const SMTExprRef &Exp) override { + return Z3_get_bool_value(Context.Context, toZ3Expr(*Exp).AST) == Z3_L_TRUE; + } + + SMTExprRef getFloatRoundingMode() override { + // TODO: Don't assume nearest ties to even rounding mode + return newExprRef(Z3Expr(Context, Z3_mk_fpa_rne(Context.Context))); + } + + bool toAPFloat(const SMTSortRef &Sort, const SMTExprRef &AST, + llvm::APFloat &Float, bool useSemantics) { + assert(Sort->isFloatSort() && "Unsupported sort to floating-point!"); + + llvm::APSInt Int(Sort->getFloatSortSize(), true); + const llvm::fltSemantics &Semantics = + getFloatSemantics(Sort->getFloatSortSize()); + SMTSortRef BVSort = getBitvectorSort(Sort->getFloatSortSize()); + if (!toAPSInt(BVSort, AST, Int, true)) { + return false; + } + + if (useSemantics && !areEquivalent(Float.getSemantics(), Semantics)) { + assert(false && "Floating-point types don't match!"); + return false; + } + + Float = llvm::APFloat(Semantics, Int); + return true; + } + + bool toAPSInt(const SMTSortRef &Sort, const SMTExprRef &AST, + llvm::APSInt &Int, bool useSemantics) { + if (Sort->isBitvectorSort()) { + if (useSemantics && Int.getBitWidth() != Sort->getBitvectorSortSize()) { + assert(false && "Bitvector types don't match!"); + return false; + } + + // FIXME: This function is also used to retrieve floating-point values, + // which can be 16, 32, 64 or 128 bits long. 
Bitvectors can be anything
+ // between 1 and 64 bits long, which is the reason we have this weird
+ // guard. In the future, we need proper calls in the backend to retrieve
+ // floating-points and their special values (NaN, +/-infinity, +/-zero),
+ // then we can drop this weird condition.
+ if (Sort->getBitvectorSortSize() <= 64 ||
+ Sort->getBitvectorSortSize() == 128) {
+ Int = getBitvector(AST, Int.getBitWidth(), Int.isUnsigned());
+ return true;
+ }
+
+ assert(false && "Bitwidth not supported!");
+ return false;
+ }
+
+ if (Sort->isBooleanSort()) {
+ if (useSemantics && Int.getBitWidth() < 1) {
+ assert(false && "Boolean type doesn't match!");
+ return false;
+ }
+
+ Int = llvm::APSInt(llvm::APInt(Int.getBitWidth(), getBoolean(AST)),
+ Int.isUnsigned());
+ return true;
+ }
+
+ llvm_unreachable("Unsupported sort to integer!");
+ }
+
+ bool getInterpretation(const SMTExprRef &Exp, llvm::APSInt &Int) override {
+ Z3Model Model(Context, Z3_solver_get_model(Context.Context, Solver));
+ Z3_func_decl Func = Z3_get_app_decl(
+ Context.Context, Z3_to_app(Context.Context, toZ3Expr(*Exp).AST));
+ if (Z3_model_has_interp(Context.Context, Model.Model, Func) != Z3_L_TRUE)
+ return false;
+
+ SMTExprRef Assign = newExprRef(
+ Z3Expr(Context,
+ Z3_model_get_const_interp(Context.Context, Model.Model, Func)));
+ SMTSortRef Sort = getSort(Assign);
+ return toAPSInt(Sort, Assign, Int, true);
+ }
+
+ bool getInterpretation(const SMTExprRef &Exp, llvm::APFloat &Float) override {
+ Z3Model Model(Context, Z3_solver_get_model(Context.Context, Solver));
+ Z3_func_decl Func = Z3_get_app_decl(
+ Context.Context, Z3_to_app(Context.Context, toZ3Expr(*Exp).AST));
+ if (Z3_model_has_interp(Context.Context, Model.Model, Func) != Z3_L_TRUE)
+ return false;
+
+ SMTExprRef Assign = newExprRef(
+ Z3Expr(Context,
+ Z3_model_get_const_interp(Context.Context, Model.Model, Func)));
+ SMTSortRef Sort = getSort(Assign);
+ return toAPFloat(Sort, Assign, Float, true);
+ }
+
+ Optional<bool> check() const override {
+ Z3_lbool res = Z3_solver_check(Context.Context, Solver);
+ if (res == Z3_L_TRUE)
+ return true;
+
+ if (res == Z3_L_FALSE)
+ return false;
+
+ return Optional<bool>();
+ }
+
+ void push() override { return Z3_solver_push(Context.Context, Solver); }
+
+ void pop(unsigned NumStates = 1) override {
+ assert(Z3_solver_get_num_scopes(Context.Context, Solver) >= NumStates);
+ return Z3_solver_pop(Context.Context, Solver, NumStates);
+ }
+
+ bool isFPSupported() override { return true; }
+
+ /// Reset the solver and remove all constraints.
+ void reset() override { Z3_solver_reset(Context.Context, Solver); }
+
+ void print(raw_ostream &OS) const override {
+ OS << Z3_solver_to_string(Context.Context, Solver);
+ }
+}; // end class Z3Solver
+
+} // end anonymous namespace
+
+#endif
+
+llvm::SMTSolverRef llvm::CreateZ3Solver() {
+#if LLVM_WITH_Z3
+ return llvm::make_unique<Z3Solver>();
+#else
+ llvm::report_fatal_error("LLVM was not compiled with Z3 support, rebuild "
+ "with -DLLVM_ENABLE_Z3_SOLVER=ON",
+ false);
+ return nullptr;
+#endif
+}
diff --git a/lib/Target/AArch64/AArch64ExpandImm.cpp b/lib/Target/AArch64/AArch64ExpandImm.cpp
index c8602dabcd54..c764af80eb86 100644
--- a/lib/Target/AArch64/AArch64ExpandImm.cpp
+++ b/lib/Target/AArch64/AArch64ExpandImm.cpp
@@ -318,8 +318,11 @@ void expandMOVImm(uint64_t Imm, unsigned BitSize,
 ZeroChunks++;
 }
- // FIXME: Prefer MOVZ/MOVN over ORR because of the rules for the "mov"
- // alias.
+ // Prefer MOVZ/MOVN over ORR because of the rules for the "mov" alias.
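The heuristic this comment refers to can be made concrete with a small sketch (an editor's illustration against the names used in this hunk — Imm, BitSize, OneChunks, ZeroChunks — not code from the patch):

 // Counting all-one and all-zero 16-bit chunks of an immediate, the
 // quantities the MOVZ/MOVN-vs-ORR decision below is based on.
 static unsigned countChunks(uint64_t Imm, unsigned BitSize, uint16_t Val) {
   unsigned N = 0;
   for (unsigned Shift = 0; Shift < BitSize; Shift += 16)
     if (((Imm >> Shift) & 0xFFFF) == Val)
       ++N;
   return N;
 }
 // OneChunks  == countChunks(Imm, BitSize, 0xFFFF)
 // ZeroChunks == countChunks(Imm, BitSize, 0x0000)
 // Example: Imm = 0x0001FFFF with BitSize = 32 gives OneChunks = 1, so
 // (BitSize / 16) - OneChunks == 1 and a single MOVN of the inverted
 // high chunk materializes the value; the simple path wins over ORR.

This is why the guard that follows takes the simple expansion whenever at most one chunk is neither all-zeros nor all-ones.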
+ if ((BitSize / 16) - OneChunks <= 1 || (BitSize / 16) - ZeroChunks <= 1) { + expandMOVImmSimple(Imm, BitSize, OneChunks, ZeroChunks, Insn); + return; + } // Try a single ORR. uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize); diff --git a/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp b/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp index 1870c9ca5fa9..4d9f08b3af01 100644 --- a/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp @@ -23,6 +23,16 @@ AMDGPUMachineModuleInfo::AMDGPUMachineModuleInfo(const MachineModuleInfo &MMI) AgentSSID = CTX.getOrInsertSyncScopeID("agent"); WorkgroupSSID = CTX.getOrInsertSyncScopeID("workgroup"); WavefrontSSID = CTX.getOrInsertSyncScopeID("wavefront"); + SystemOneAddressSpaceSSID = + CTX.getOrInsertSyncScopeID("one-as"); + AgentOneAddressSpaceSSID = + CTX.getOrInsertSyncScopeID("agent-one-as"); + WorkgroupOneAddressSpaceSSID = + CTX.getOrInsertSyncScopeID("workgroup-one-as"); + WavefrontOneAddressSpaceSSID = + CTX.getOrInsertSyncScopeID("wavefront-one-as"); + SingleThreadOneAddressSpaceSSID = + CTX.getOrInsertSyncScopeID("singlethread-one-as"); } } // end namespace llvm diff --git a/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h b/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h index e6d4268a66e1..2b0b8b42acfe 100644 --- a/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h +++ b/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h @@ -29,12 +29,22 @@ class AMDGPUMachineModuleInfo final : public MachineModuleInfoELF { // All supported memory/synchronization scopes can be found here: // http://llvm.org/docs/AMDGPUUsage.html#memory-scopes - /// Agent synchronization scope ID. + /// Agent synchronization scope ID (cross address space). SyncScope::ID AgentSSID; - /// Workgroup synchronization scope ID. + /// Workgroup synchronization scope ID (cross address space). SyncScope::ID WorkgroupSSID; - /// Wavefront synchronization scope ID. + /// Wavefront synchronization scope ID (cross address space). SyncScope::ID WavefrontSSID; + /// System synchronization scope ID (single address space). + SyncScope::ID SystemOneAddressSpaceSSID; + /// Agent synchronization scope ID (single address space). + SyncScope::ID AgentOneAddressSpaceSSID; + /// Workgroup synchronization scope ID (single address space). + SyncScope::ID WorkgroupOneAddressSpaceSSID; + /// Wavefront synchronization scope ID (single address space). + SyncScope::ID WavefrontOneAddressSpaceSSID; + /// Single thread synchronization scope ID (single address space). + SyncScope::ID SingleThreadOneAddressSpaceSSID; /// In AMDGPU target synchronization scopes are inclusive, meaning a /// larger synchronization scope is inclusive of a smaller synchronization @@ -43,35 +53,70 @@ class AMDGPUMachineModuleInfo final : public MachineModuleInfoELF { /// \returns \p SSID's inclusion ordering, or "None" if \p SSID is not /// supported by the AMDGPU target. 
Optional<uint8_t> getSyncScopeInclusionOrdering(SyncScope::ID SSID) const {
- if (SSID == SyncScope::SingleThread)
+ if (SSID == SyncScope::SingleThread ||
+ SSID == getSingleThreadOneAddressSpaceSSID())
 return 0;
- else if (SSID == getWavefrontSSID())
+ else if (SSID == getWavefrontSSID() ||
+ SSID == getWavefrontOneAddressSpaceSSID())
 return 1;
- else if (SSID == getWorkgroupSSID())
+ else if (SSID == getWorkgroupSSID() ||
+ SSID == getWorkgroupOneAddressSpaceSSID())
 return 2;
- else if (SSID == getAgentSSID())
+ else if (SSID == getAgentSSID() ||
+ SSID == getAgentOneAddressSpaceSSID())
 return 3;
- else if (SSID == SyncScope::System)
+ else if (SSID == SyncScope::System ||
+ SSID == getSystemOneAddressSpaceSSID())
 return 4;
 return None;
 }
+ /// \returns True if \p SSID is restricted to a single address space, false
+ /// otherwise.
+ bool isOneAddressSpace(SyncScope::ID SSID) const {
+ return SSID == getSingleThreadOneAddressSpaceSSID() ||
+ SSID == getWavefrontOneAddressSpaceSSID() ||
+ SSID == getWorkgroupOneAddressSpaceSSID() ||
+ SSID == getAgentOneAddressSpaceSSID() ||
+ SSID == getSystemOneAddressSpaceSSID();
+ }
+
public:
 AMDGPUMachineModuleInfo(const MachineModuleInfo &MMI);
- /// \returns Agent synchronization scope ID.
+ /// \returns Agent synchronization scope ID (cross address space).
 SyncScope::ID getAgentSSID() const { return AgentSSID; }
- /// \returns Workgroup synchronization scope ID.
+ /// \returns Workgroup synchronization scope ID (cross address space).
 SyncScope::ID getWorkgroupSSID() const { return WorkgroupSSID; }
- /// \returns Wavefront synchronization scope ID.
+ /// \returns Wavefront synchronization scope ID (cross address space).
 SyncScope::ID getWavefrontSSID() const { return WavefrontSSID; }
+ /// \returns System synchronization scope ID (single address space).
+ SyncScope::ID getSystemOneAddressSpaceSSID() const {
+ return SystemOneAddressSpaceSSID;
+ }
+ /// \returns Agent synchronization scope ID (single address space).
+ SyncScope::ID getAgentOneAddressSpaceSSID() const {
+ return AgentOneAddressSpaceSSID;
+ }
+ /// \returns Workgroup synchronization scope ID (single address space).
+ SyncScope::ID getWorkgroupOneAddressSpaceSSID() const {
+ return WorkgroupOneAddressSpaceSSID;
+ }
+ /// \returns Wavefront synchronization scope ID (single address space).
+ SyncScope::ID getWavefrontOneAddressSpaceSSID() const {
+ return WavefrontOneAddressSpaceSSID;
+ }
+ /// \returns Single thread synchronization scope ID (single address space).
+ SyncScope::ID getSingleThreadOneAddressSpaceSSID() const {
+ return SingleThreadOneAddressSpaceSSID;
+ }
 /// In AMDGPU target synchronization scopes are inclusive, meaning a
 /// larger synchronization scope is inclusive of a smaller synchronization
@@ -87,7 +132,11 @@ class AMDGPUMachineModuleInfo final : public MachineModuleInfoELF {
 if (!AIO || !BIO)
 return None;
- return AIO.getValue() > BIO.getValue();
+ bool IsAOneAddressSpace = isOneAddressSpace(A);
+ bool IsBOneAddressSpace = isOneAddressSpace(B);
+
+ return AIO.getValue() >= BIO.getValue() &&
+ (IsAOneAddressSpace == IsBOneAddressSpace || !IsAOneAddressSpace);
 }
};
diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td
index 869deb93679c..b0ab7032d975 100644
--- a/lib/Target/AMDGPU/SIInstructions.td
+++ b/lib/Target/AMDGPU/SIInstructions.td
@@ -172,12 +172,14 @@ def S_MOV_B64_term : SPseudoInstSI<(outs SReg_64:$dst), (ins SSrc_b64:$src0)> {
 let isAsCheapAsAMove = 1;
 let isTerminator = 1;
+ let hasSideEffects = 0;
}
def S_XOR_B64_term : SPseudoInstSI<(outs SReg_64:$dst), (ins SSrc_b64:$src0, SSrc_b64:$src1)> {
 let isAsCheapAsAMove = 1;
 let isTerminator = 1;
+ let hasSideEffects = 0;
 let Defs = [SCC];
}
@@ -185,6 +187,7 @@ def S_ANDN2_B64_term : SPseudoInstSI<(outs SReg_64:$dst), (ins SSrc_b64:$src0, SSrc_b64:$src1)> {
 let isAsCheapAsAMove = 1;
 let isTerminator = 1;
+ let hasSideEffects = 0;
}
def WAVE_BARRIER : SPseudoInstSI<(outs), (ins),
diff --git a/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 1080332d6e4a..ba7ca691f2ab 100644
--- a/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -417,35 +417,46 @@ void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
 SIAtomicAddrSpace InstrScope) const {
- /// TODO: For now assume OpenCL memory model which treats each
- /// address space as having a separate happens-before relation, and
- /// so an instruction only has ordering with respect to the address
- /// space it accesses, and if it accesses multiple address spaces it
- /// does not require ordering of operations in different address
- /// spaces.
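The TODO deleted above recorded the OpenCL assumption of one happens-before relation per address space; in the rewritten mapping that assumption survives only through the "-one-as" scopes, while the plain scopes now report cross-address-space ordering via the new third tuple element. As a hedged sketch of how a frontend reaches the single-address-space scopes (getOrInsertSyncScopeID is used by this patch; IRBuilder::CreateFence is the stock API; the helper and its setup are assumed):

 // Sketch, not from the patch: request an agent-scope fence that only
 // orders the address spaces the instruction accesses.
 #include "llvm/IR/IRBuilder.h"
 using namespace llvm;

 static void emitAgentOneASFence(IRBuilder<> &B) {
   SyncScope::ID AgentOneAS =
       B.getContext().getOrInsertSyncScopeID("agent-one-as");
   B.CreateFence(AtomicOrdering::Release, AgentOneAS);
   // Prints in IR as: fence syncscope("agent-one-as") release
 }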
- if (SSID == SyncScope::System) + if (SSID == SyncScope::System) + return std::make_tuple(SIAtomicScope::SYSTEM, + SIAtomicAddrSpace::ATOMIC, + true); + if (SSID == MMI->getAgentSSID()) + return std::make_tuple(SIAtomicScope::AGENT, + SIAtomicAddrSpace::ATOMIC, + true); + if (SSID == MMI->getWorkgroupSSID()) + return std::make_tuple(SIAtomicScope::WORKGROUP, + SIAtomicAddrSpace::ATOMIC, + true); + if (SSID == MMI->getWavefrontSSID()) + return std::make_tuple(SIAtomicScope::WAVEFRONT, + SIAtomicAddrSpace::ATOMIC, + true); + if (SSID == SyncScope::SingleThread) + return std::make_tuple(SIAtomicScope::SINGLETHREAD, + SIAtomicAddrSpace::ATOMIC, + true); + if (SSID == MMI->getSystemOneAddressSpaceSSID()) return std::make_tuple(SIAtomicScope::SYSTEM, SIAtomicAddrSpace::ATOMIC & InstrScope, false); - if (SSID == MMI->getAgentSSID()) + if (SSID == MMI->getAgentOneAddressSpaceSSID()) return std::make_tuple(SIAtomicScope::AGENT, SIAtomicAddrSpace::ATOMIC & InstrScope, false); - if (SSID == MMI->getWorkgroupSSID()) + if (SSID == MMI->getWorkgroupOneAddressSpaceSSID()) return std::make_tuple(SIAtomicScope::WORKGROUP, SIAtomicAddrSpace::ATOMIC & InstrScope, false); - if (SSID == MMI->getWavefrontSSID()) + if (SSID == MMI->getWavefrontOneAddressSpaceSSID()) return std::make_tuple(SIAtomicScope::WAVEFRONT, SIAtomicAddrSpace::ATOMIC & InstrScope, false); - if (SSID == SyncScope::SingleThread) + if (SSID == MMI->getSingleThreadOneAddressSpaceSSID()) return std::make_tuple(SIAtomicScope::SINGLETHREAD, SIAtomicAddrSpace::ATOMIC & InstrScope, false); - /// TODO: To support HSA Memory Model need to add additional memory - /// scopes that specify that do require cross address space - /// ordering. return None; } @@ -721,13 +732,12 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI, bool VMCnt = false; bool LGKMCnt = false; - bool EXPCnt = false; if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { switch (Scope) { case SIAtomicScope::SYSTEM: case SIAtomicScope::AGENT: - VMCnt = true; + VMCnt |= true; break; case SIAtomicScope::WORKGROUP: case SIAtomicScope::WAVEFRONT: @@ -751,7 +761,7 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI, // also synchronizing with global/GDS memory as LDS operations // could be reordered with respect to later global/GDS memory // operations of the same wave. - LGKMCnt = IsCrossAddrSpaceOrdering; + LGKMCnt |= IsCrossAddrSpaceOrdering; break; case SIAtomicScope::WAVEFRONT: case SIAtomicScope::SINGLETHREAD: @@ -773,7 +783,7 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI, // also synchronizing with global/LDS memory as GDS operations // could be reordered with respect to later global/LDS memory // operations of the same wave. - EXPCnt = IsCrossAddrSpaceOrdering; + LGKMCnt |= IsCrossAddrSpaceOrdering; break; case SIAtomicScope::WORKGROUP: case SIAtomicScope::WAVEFRONT: @@ -786,11 +796,11 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI, } } - if (VMCnt || LGKMCnt || EXPCnt) { + if (VMCnt || LGKMCnt) { unsigned WaitCntImmediate = AMDGPU::encodeWaitcnt(IV, VMCnt ? 0 : getVmcntBitMask(IV), - EXPCnt ? 0 : getExpcntBitMask(IV), + getExpcntBitMask(IV), LGKMCnt ? 
0 : getLgkmcntBitMask(IV));
 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
 Changed = true;
diff --git a/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp b/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
index bc30b29a396f..21eecb1007f6 100644
--- a/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
+++ b/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
@@ -76,7 +76,7 @@ static bool isEndCF(const MachineInstr& MI, const SIRegisterInfo* TRI) {
}
static bool isFullExecCopy(const MachineInstr& MI) {
- return MI.isFullCopy() && MI.getOperand(1).getReg() == AMDGPU::EXEC;
+ return MI.getOperand(1).getReg() == AMDGPU::EXEC;
}
static unsigned getOrNonExecReg(const MachineInstr &MI,
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 3346d303a1e4..8e49b647aa7e 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -575,7 +575,7 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
 // We don't have access to the register scavenger if this function is called
 // during PEI::scavengeFrameVirtualRegs().
 if (RS)
- SOffset = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0, false);
+ SOffset = RS->FindUnusedReg(&AMDGPU::SGPR_32RegClass);
 if (SOffset == AMDGPU::NoRegister) {
 // There are no free SGPRs, and since we are in the process of spilling
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 12a2d7a1d5e7..bb222edbceff 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -1176,7 +1176,8 @@ unsigned ARMBaseInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
 unsigned ARMBaseInstrInfo::isStoreToStackSlotPostFE(const MachineInstr &MI,
 int &FrameIndex) const {
 SmallVector<const MachineMemOperand *, 1> Accesses;
- if (MI.mayStore() && hasStoreToStackSlot(MI, Accesses)) {
+ if (MI.mayStore() && hasStoreToStackSlot(MI, Accesses) &&
+ Accesses.size() == 1) {
 FrameIndex = cast<FixedStackPseudoSourceValue>(Accesses.front()->getPseudoValue())
 ->getFrameIndex();
@@ -1396,7 +1397,8 @@ unsigned ARMBaseInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
 unsigned ARMBaseInstrInfo::isLoadFromStackSlotPostFE(const MachineInstr &MI,
 int &FrameIndex) const {
 SmallVector<const MachineMemOperand *, 1> Accesses;
- if (MI.mayLoad() && hasLoadFromStackSlot(MI, Accesses)) {
+ if (MI.mayLoad() && hasLoadFromStackSlot(MI, Accesses) &&
+ Accesses.size() == 1) {
 FrameIndex = cast<FixedStackPseudoSourceValue>(Accesses.front()->getPseudoValue())
 ->getFrameIndex();
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 36df387cd1b4..0a1289b12aac 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -9015,18 +9015,23 @@ ARMTargetLowering::EmitStructByval(MachineInstr &MI,
 if (Align == 0)
 Align = MF->getDataLayout().getTypeAllocSize(C->getType());
 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
+ MachineMemOperand *CPMMO =
+ MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF),
+ MachineMemOperand::MOLoad, 4, 4);
 if (IsThumb)
 BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci))
 .addReg(varEnd, RegState::Define)
 .addConstantPoolIndex(Idx)
- .add(predOps(ARMCC::AL));
+ .add(predOps(ARMCC::AL))
+ .addMemOperand(CPMMO);
 else
 BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp))
 .addReg(varEnd, RegState::Define)
 .addConstantPoolIndex(Idx)
 .addImm(0)
- .add(predOps(ARMCC::AL));
+ .add(predOps(ARMCC::AL))
+ .addMemOperand(CPMMO);
 }
 BB->addSuccessor(loopMBB);
@@ -9274,7 +9279,8 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
 .add(MI.getOperand(2)) // Rn
 .add(MI.getOperand(3)) // PredImm
 .add(MI.getOperand(4)) // PredReg
- .add(MI.getOperand(0)); // Rt
+ .add(MI.getOperand(0)) // Rt
+ .cloneMemRefs(MI);
 MI.eraseFromParent();
 return BB;
 }
diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
index 13b5445eaba8..21aa3e0ab34b 100644
--- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -173,12 +173,14 @@ namespace {
 MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore,
 int Offset, unsigned Base, bool BaseKill, unsigned Opcode,
 ARMCC::CondCodes Pred, unsigned PredReg, const DebugLoc &DL,
- ArrayRef<std::pair<unsigned, bool>> Regs);
+ ArrayRef<std::pair<unsigned, bool>> Regs,
+ ArrayRef<MachineInstr*> Instrs);
 MachineInstr *CreateLoadStoreDouble(
 MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore,
 int Offset, unsigned Base, bool BaseKill, unsigned Opcode,
 ARMCC::CondCodes Pred, unsigned PredReg, const DebugLoc &DL,
- ArrayRef<std::pair<unsigned, bool>> Regs) const;
+ ArrayRef<std::pair<unsigned, bool>> Regs,
+ ArrayRef<MachineInstr*> Instrs) const;
 void FormCandidates(const MemOpQueue &MemOps);
 MachineInstr *MergeOpsUpdate(const MergeCandidate &Cand);
 bool FixInvalidRegPairOp(MachineBasicBlock &MBB,
@@ -622,7 +624,8 @@ MachineInstr *ARMLoadStoreOpt::CreateLoadStoreMulti(
 MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore,
 int Offset, unsigned Base, bool BaseKill, unsigned Opcode,
 ARMCC::CondCodes Pred, unsigned PredReg, const DebugLoc &DL,
- ArrayRef<std::pair<unsigned, bool>> Regs) {
+ ArrayRef<std::pair<unsigned, bool>> Regs,
+ ArrayRef<MachineInstr*> Instrs) {
 unsigned NumRegs = Regs.size();
 assert(NumRegs > 1);
@@ -814,6 +817,8 @@ MachineInstr *ARMLoadStoreOpt::CreateLoadStoreMulti(
 for (const std::pair<unsigned, bool> &R : Regs)
 MIB.addReg(R.first, getDefRegState(isDef) | getKillRegState(R.second));
+ MIB.cloneMergedMemRefs(Instrs);
+
 return MIB.getInstr();
}
@@ -821,7 +826,8 @@ MachineInstr *ARMLoadStoreOpt::CreateLoadStoreDouble(
 MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore,
 int Offset, unsigned Base, bool BaseKill, unsigned Opcode,
 ARMCC::CondCodes Pred, unsigned PredReg, const DebugLoc &DL,
- ArrayRef<std::pair<unsigned, bool>> Regs) const {
+ ArrayRef<std::pair<unsigned, bool>> Regs,
+ ArrayRef<MachineInstr*> Instrs) const {
 bool IsLoad = isi32Load(Opcode);
 assert((IsLoad || isi32Store(Opcode)) && "Must have integer load or store");
 unsigned LoadStoreOpcode = IsLoad ? ARM::t2LDRDi8 : ARM::t2STRDi8;
@@ -837,6 +843,7 @@ MachineInstr *ARMLoadStoreOpt::CreateLoadStoreDouble(
 .addReg(Regs[1].first, getKillRegState(Regs[1].second));
 }
 MIB.addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
+ MIB.cloneMergedMemRefs(Instrs);
 return MIB.getInstr();
}
@@ -894,10 +901,11 @@ MachineInstr *ARMLoadStoreOpt::MergeOpsUpdate(const MergeCandidate &Cand) {
 MachineInstr *Merged = nullptr;
 if (Cand.CanMergeToLSDouble)
 Merged = CreateLoadStoreDouble(MBB, InsertBefore, Offset, Base, BaseKill,
- Opcode, Pred, PredReg, DL, Regs);
+ Opcode, Pred, PredReg, DL, Regs,
+ Cand.Instrs);
 if (!Merged && Cand.CanMergeToLSMulti)
 Merged = CreateLoadStoreMulti(MBB, InsertBefore, Offset, Base, BaseKill,
- Opcode, Pred, PredReg, DL, Regs);
+ Opcode, Pred, PredReg, DL, Regs, Cand.Instrs);
 if (!Merged)
 return nullptr;
@@ -1435,14 +1443,16 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) {
 .addReg(Base, getKillRegState(isLd ? BaseKill : false))
 .addImm(Pred).addReg(PredReg)
 .addReg(MO.getReg(), (isLd ?
getDefRegState(true) : - getKillRegState(MO.isKill()))); + getKillRegState(MO.isKill()))) + .cloneMemRefs(*MI); } else if (isLd) { if (isAM2) { // LDR_PRE, LDR_POST if (NewOpc == ARM::LDR_PRE_IMM || NewOpc == ARM::LDRB_PRE_IMM) { BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg()) .addReg(Base, RegState::Define) - .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg); + .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg) + .cloneMemRefs(*MI); } else { int Imm = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift); BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg()) @@ -1450,7 +1460,8 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) { .addReg(Base) .addReg(0) .addImm(Imm) - .add(predOps(Pred, PredReg)); + .add(predOps(Pred, PredReg)) + .cloneMemRefs(*MI); } } else { // t2LDR_PRE, t2LDR_POST @@ -1458,7 +1469,8 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) { .addReg(Base, RegState::Define) .addReg(Base) .addImm(Offset) - .add(predOps(Pred, PredReg)); + .add(predOps(Pred, PredReg)) + .cloneMemRefs(*MI); } } else { MachineOperand &MO = MI->getOperand(0); @@ -1473,14 +1485,16 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) { .addReg(Base) .addReg(0) .addImm(Imm) - .add(predOps(Pred, PredReg)); + .add(predOps(Pred, PredReg)) + .cloneMemRefs(*MI); } else { // t2STR_PRE, t2STR_POST BuildMI(MBB, MBBI, DL, TII->get(NewOpc), Base) .addReg(MO.getReg(), getKillRegState(MO.isKill())) .addReg(Base) .addImm(Offset) - .add(predOps(Pred, PredReg)); + .add(predOps(Pred, PredReg)) + .cloneMemRefs(*MI); } } MBB.erase(MBBI); @@ -1540,7 +1554,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSDouble(MachineInstr &MI) const { // Transfer implicit operands. for (const MachineOperand &MO : MI.implicit_operands()) MIB.add(MO); - MIB.setMemRefs(MI.memoperands()); + MIB.cloneMemRefs(MI); MBB.erase(MBBI); return true; @@ -1608,19 +1622,26 @@ static void InsertLDR_STR(MachineBasicBlock &MBB, bool isDef, unsigned NewOpc, unsigned Reg, bool RegDeadKill, bool RegUndef, unsigned BaseReg, bool BaseKill, bool BaseUndef, ARMCC::CondCodes Pred, - unsigned PredReg, const TargetInstrInfo *TII) { + unsigned PredReg, const TargetInstrInfo *TII, + MachineInstr *MI) { if (isDef) { MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc)) .addReg(Reg, getDefRegState(true) | getDeadRegState(RegDeadKill)) .addReg(BaseReg, getKillRegState(BaseKill)|getUndefRegState(BaseUndef)); MIB.addImm(Offset).addImm(Pred).addReg(PredReg); + // FIXME: This is overly conservative; the new instruction accesses 4 + // bytes, not 8. + MIB.cloneMemRefs(*MI); } else { MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc)) .addReg(Reg, getKillRegState(RegDeadKill) | getUndefRegState(RegUndef)) .addReg(BaseReg, getKillRegState(BaseKill)|getUndefRegState(BaseUndef)); MIB.addImm(Offset).addImm(Pred).addReg(PredReg); + // FIXME: This is overly conservative; the new instruction accesses 4 + // bytes, not 8. 
+ MIB.cloneMemRefs(*MI); } } @@ -1678,7 +1699,8 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB, .addReg(BaseReg, getKillRegState(BaseKill)) .addImm(Pred).addReg(PredReg) .addReg(EvenReg, getDefRegState(isLd) | getDeadRegState(EvenDeadKill)) - .addReg(OddReg, getDefRegState(isLd) | getDeadRegState(OddDeadKill)); + .addReg(OddReg, getDefRegState(isLd) | getDeadRegState(OddDeadKill)) + .cloneMemRefs(*MI); ++NumLDRD2LDM; } else { BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc)) @@ -1687,7 +1709,8 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB, .addReg(EvenReg, getKillRegState(EvenDeadKill) | getUndefRegState(EvenUndef)) .addReg(OddReg, - getKillRegState(OddDeadKill) | getUndefRegState(OddUndef)); + getKillRegState(OddDeadKill) | getUndefRegState(OddUndef)) + .cloneMemRefs(*MI); ++NumSTRD2STM; } } else { @@ -1705,9 +1728,10 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB, if (isLd && TRI->regsOverlap(EvenReg, BaseReg)) { assert(!TRI->regsOverlap(OddReg, BaseReg)); InsertLDR_STR(MBB, MBBI, OffImm + 4, isLd, NewOpc2, OddReg, OddDeadKill, - false, BaseReg, false, BaseUndef, Pred, PredReg, TII); + false, BaseReg, false, BaseUndef, Pred, PredReg, TII, MI); InsertLDR_STR(MBB, MBBI, OffImm, isLd, NewOpc, EvenReg, EvenDeadKill, - false, BaseReg, BaseKill, BaseUndef, Pred, PredReg, TII); + false, BaseReg, BaseKill, BaseUndef, Pred, PredReg, TII, + MI); } else { if (OddReg == EvenReg && EvenDeadKill) { // If the two source operands are the same, the kill marker is @@ -1720,9 +1744,11 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB, if (EvenReg == BaseReg) EvenDeadKill = false; InsertLDR_STR(MBB, MBBI, OffImm, isLd, NewOpc, EvenReg, EvenDeadKill, - EvenUndef, BaseReg, false, BaseUndef, Pred, PredReg, TII); + EvenUndef, BaseReg, false, BaseUndef, Pred, PredReg, TII, + MI); InsertLDR_STR(MBB, MBBI, OffImm + 4, isLd, NewOpc2, OddReg, OddDeadKill, - OddUndef, BaseReg, BaseKill, BaseUndef, Pred, PredReg, TII); + OddUndef, BaseReg, BaseKill, BaseUndef, Pred, PredReg, TII, + MI); } if (isLd) ++NumLDRD2LDR; diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp index 75db48928c9c..50d2d195a3f8 100644 --- a/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -2949,13 +2949,13 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode, uint64_t &ErrorInfo, bool MatchingInlineAsm) { assert(!Operands.empty() && "Unexpect empty operand list!"); - X86Operand &Op = static_cast(*Operands[0]); - assert(Op.isToken() && "Leading operand should always be a mnemonic!"); + assert((*Operands[0]).isToken() && "Leading operand should always be a mnemonic!"); SMRange EmptyRange = None; // First, handle aliases that expand to multiple instructions. 
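A note on the X86AsmParser reshuffle that follows (inferred from the code movement, not stated in the diff): MatchFPUWaitAlias may replace Operands[0] when it expands a wait-prefixed FPU mnemonic, so an X86Operand reference bound before the call could be left dangling; the patch therefore re-derives the reference afterwards.

 // Hedged illustration of the hazard being avoided:
 //   X86Operand &Op = static_cast<X86Operand &>(*Operands[0]);
 //   MatchFPUWaitAlias(IDLoc, Op, Operands, Out, MatchingInlineAsm);
 //   // If the alias fired (e.g. "fstsw" -> WAIT + the non-waiting
 //   // form), Operands[0] now holds a new token and Op may dangle.
 // The rewritten code binds Op only after MatchFPUWaitAlias returns.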
- MatchFPUWaitAlias(IDLoc, Op, Operands, Out, MatchingInlineAsm);
-
+ MatchFPUWaitAlias(IDLoc, static_cast<X86Operand &>(*Operands[0]), Operands,
+ Out, MatchingInlineAsm);
+
+ X86Operand &Op = static_cast<X86Operand &>(*Operands[0]);
 bool WasOriginallyInvalidOperand = false;
 unsigned Prefixes = getPrefixes(Operands);
@@ -2992,6 +2992,11 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode,
 case Match_MnemonicFail:
 break;
 }
+ if (Op.getToken().empty()) {
+ Error(IDLoc, "instruction must have size higher than 0", EmptyRange,
+ MatchingInlineAsm);
+ return true;
+ }
 // FIXME: Ideally, we would only attempt suffix matches for things which are
 // valid prefixes, and we could just infer the right unambiguous
@@ -3127,15 +3132,15 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode,
 uint64_t &ErrorInfo, bool MatchingInlineAsm) {
 assert(!Operands.empty() && "Unexpect empty operand list!");
- X86Operand &Op = static_cast<X86Operand &>(*Operands[0]);
- assert(Op.isToken() && "Leading operand should always be a mnemonic!");
- StringRef Mnemonic = Op.getToken();
+ assert((*Operands[0]).isToken() && "Leading operand should always be a mnemonic!");
+ StringRef Mnemonic = (static_cast<X86Operand &>(*Operands[0])).getToken();
 SMRange EmptyRange = None;
- StringRef Base = Op.getToken();
+ StringRef Base = (static_cast<X86Operand &>(*Operands[0])).getToken();
 unsigned Prefixes = getPrefixes(Operands);
 // First, handle aliases that expand to multiple instructions.
- MatchFPUWaitAlias(IDLoc, Op, Operands, Out, MatchingInlineAsm);
+ MatchFPUWaitAlias(IDLoc, static_cast<X86Operand &>(*Operands[0]), Operands, Out, MatchingInlineAsm);
+ X86Operand &Op = static_cast<X86Operand &>(*Operands[0]);
 MCInst Inst;
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index e08914c536e3..38c1c6ba8d0e 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -3086,7 +3086,7 @@ bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
 // The 'X' was originally truncated. Do that now.
 if (XVT != NVT) {
- insertDAGNode(*CurDAG, OrigNBits, Extract);
+ insertDAGNode(*CurDAG, SDValue(Node, 0), Extract);
 Extract = CurDAG->getNode(ISD::TRUNCATE, DL, NVT, Extract);
 }
@@ -4039,8 +4039,6 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
 break;
 }
- // FIXME: We should be able to fold loads here.
-
 SDValue Imm = CurDAG->getTargetConstant(Mask, dl, VT);
 SDValue Reg = N0.getOperand(0);
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 983b8fa4af1f..9176b08f3895 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -674,13 +674,6 @@ class BoUpSLP {
 /// be beneficial even the tree height is tiny.
 bool isFullyVectorizableTinyTree();
- /// \reorder commutative operands in alt shuffle if they result in
- /// vectorized code.
- void reorderAltShuffleOperands(const InstructionsState &S,
- ArrayRef<Value *> VL,
- SmallVectorImpl<Value *> &Left,
- SmallVectorImpl<Value *> &Right);
-
 /// \reorder commutative operands to get better probability of
 /// generating vectorized code.
 void reorderInputsAccordingToOpcode(const InstructionsState &S,
@@ -2072,7 +2065,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
 // Reorder operands if reordering would enable vectorization.
 if (isa<ShuffleVectorInst>(VL0)) {
 ValueList Left, Right;
- reorderAltShuffleOperands(S, VL, Left, Right);
+ reorderInputsAccordingToOpcode(S, VL, Left, Right);
 UserTreeIdx.EdgeIdx = 0;
 buildTree_rec(Left, Depth + 1, UserTreeIdx);
 UserTreeIdx.EdgeIdx = 1;
@@ -2787,63 +2780,6 @@ int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) {
 return getGatherCost(VecTy, ShuffledElements);
}
-// Reorder commutative operations in alternate shuffle if the resulting vectors
-// are consecutive loads. This would allow us to vectorize the tree.
-// If we have something like-
-// load a[0] - load b[0]
-// load b[1] + load a[1]
-// load a[2] - load b[2]
-// load a[3] + load b[3]
-// Reordering the second load b[1] load a[1] would allow us to vectorize this
-// code.
-void BoUpSLP::reorderAltShuffleOperands(const InstructionsState &S,
- ArrayRef<Value *> VL,
- SmallVectorImpl<Value *> &Left,
- SmallVectorImpl<Value *> &Right) {
- // Push left and right operands of binary operation into Left and Right
- for (Value *V : VL) {
- auto *I = cast<Instruction>(V);
- assert(S.isOpcodeOrAlt(I) && "Incorrect instruction in vector");
- Left.push_back(I->getOperand(0));
- Right.push_back(I->getOperand(1));
- }
-
- // Reorder if we have a commutative operation and consecutive access
- // are on either side of the alternate instructions.
- for (unsigned j = 0, e = VL.size() - 1; j < e; ++j) {
- if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
- if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
- Instruction *VL1 = cast<Instruction>(VL[j]);
- Instruction *VL2 = cast<Instruction>(VL[j + 1]);
- if (VL1->isCommutative() && isConsecutiveAccess(L, L1, *DL, *SE)) {
- std::swap(Left[j], Right[j]);
- continue;
- } else if (VL2->isCommutative() &&
- isConsecutiveAccess(L, L1, *DL, *SE)) {
- std::swap(Left[j + 1], Right[j + 1]);
- continue;
- }
- // else unchanged
- }
- }
- if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) {
- if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
- Instruction *VL1 = cast<Instruction>(VL[j]);
- Instruction *VL2 = cast<Instruction>(VL[j + 1]);
- if (VL1->isCommutative() && isConsecutiveAccess(L, L1, *DL, *SE)) {
- std::swap(Left[j], Right[j]);
- continue;
- } else if (VL2->isCommutative() &&
- isConsecutiveAccess(L, L1, *DL, *SE)) {
- std::swap(Left[j + 1], Right[j + 1]);
- continue;
- }
- // else unchanged
- }
- }
- }
-}
-
 // Return true if the i'th left and right operands can be commuted.
 //
 // The vectorizer is trying to either have all elements one side being
@@ -2918,10 +2854,13 @@ void BoUpSLP::reorderInputsAccordingToOpcode(const InstructionsState &S,
 ArrayRef<Value *> VL,
 SmallVectorImpl<Value *> &Left,
 SmallVectorImpl<Value *> &Right) {
- if (!VL.empty()) {
- // Peel the first iteration out of the loop since there's nothing
- // interesting to do anyway and it simplifies the checks in the loop.
- auto *I = cast<Instruction>(VL[0]);
+ assert(!VL.empty() && Left.empty() && Right.empty() &&
+ "Unexpected instruction/operand lists");
+
+ // Push left and right operands of binary operation into Left and Right
+ for (Value *V : VL) {
+ auto *I = cast<Instruction>(V);
+ assert(S.isOpcodeOrAlt(I) && "Incorrect instruction in vector");
 Left.push_back(I->getOperand(0));
 Right.push_back(I->getOperand(1));
 }
@@ -2935,15 +2874,10 @@ void BoUpSLP::reorderInputsAccordingToOpcode(const InstructionsState &S,
 for (unsigned i = 1, e = VL.size(); i != e; ++i) {
 Instruction *I = cast<Instruction>(VL[i]);
- assert(((I->getOpcode() == S.getOpcode() && I->isCommutative()) ||
- (I->getOpcode() != S.getOpcode() &&
- Instruction::isCommutative(S.getOpcode()))) &&
- "Can only process commutative instruction");
 // Commute to favor either a splat or maximizing having the same opcodes on
 // one side.
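An illustration of the commute preference described in the comment above, and of why the swap gains an isCommutative() guard in this hunk (an editor's sketch, not code from the patch):

 // For the two-element bundle
 //   %x0 = add %a, %s         Left = {%a, %s}   Right = {%s, %b}
 //   %x1 = add %s, %b
 // swapping the second pair gives Left = {%a, %b}, Right = {%s, %s}:
 // the right-hand side becomes a splat, which is cheap to build as a
 // vector. In an alt-shuffle bundle some lanes may be non-commutative
 // (e.g. the sub lanes of an add/sub pattern), so the swap is now
 // performed only when I->isCommutative() holds.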
- Left.push_back(I->getOperand(0));
- Right.push_back(I->getOperand(1));
- if (shouldReorderOperands(i, Left, Right, AllSameOpcodeLeft,
+ if (I->isCommutative() &&
+ shouldReorderOperands(i, Left, Right, AllSameOpcodeLeft,
 AllSameOpcodeRight, SplatLeft, SplatRight))
 std::swap(Left[i], Right[i]);
@@ -2965,11 +2899,11 @@ void BoUpSLP::reorderInputsAccordingToOpcode(const InstructionsState &S,
 // Finally check if we can get longer vectorizable chain by reordering
 // without breaking the good operand order detected above.
 // E.g. If we have something like-
- // load a[0] load b[0]
- // load b[1] load a[1]
- // load a[2] load b[2]
- // load a[3] load b[3]
- // Reordering the second load b[1] load a[1] would allow us to vectorize
+ // load a[0] - load b[0]
+ // load b[1] + load a[1]
+ // load a[2] - load b[2]
+ // load a[3] + load b[3]
+ // Reordering the second load b[1] + load a[1] would allow us to vectorize
 // this code and we still retain AllSameOpcode property.
 // FIXME: This load reordering might break AllSameOpcode in some rare cases
 // such as-
@@ -2981,16 +2915,32 @@ void BoUpSLP::reorderInputsAccordingToOpcode(const InstructionsState &S,
 if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
 if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
 if (isConsecutiveAccess(L, L1, *DL, *SE)) {
- std::swap(Left[j + 1], Right[j + 1]);
- continue;
+ auto *VL1 = cast<Instruction>(VL[j]);
+ auto *VL2 = cast<Instruction>(VL[j + 1]);
+ if (VL2->isCommutative()) {
+ std::swap(Left[j + 1], Right[j + 1]);
+ continue;
+ }
+ if (VL1->isCommutative()) {
+ std::swap(Left[j], Right[j]);
+ continue;
+ }
 }
 }
 }
 if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) {
 if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
 if (isConsecutiveAccess(L, L1, *DL, *SE)) {
- std::swap(Left[j + 1], Right[j + 1]);
- continue;
+ auto *VL1 = cast<Instruction>(VL[j]);
+ auto *VL2 = cast<Instruction>(VL[j + 1]);
+ if (VL2->isCommutative()) {
+ std::swap(Left[j + 1], Right[j + 1]);
+ continue;
+ }
+ if (VL1->isCommutative()) {
+ std::swap(Left[j], Right[j]);
+ continue;
+ }
 }
 }
 }
diff --git a/test/CodeGen/AArch64/arm64-aapcs.ll b/test/CodeGen/AArch64/arm64-aapcs.ll
index c13f6503aef4..79a25c1e3b6c 100644
--- a/test/CodeGen/AArch64/arm64-aapcs.ll
+++ b/test/CodeGen/AArch64/arm64-aapcs.ll
@@ -91,7 +91,7 @@ declare void @variadic(i32 %a, ...)
 define void @test_variadic() {
 call void(i32, ...)
@variadic(i32 0, i64 1, double 2.0) ; CHECK: fmov d0, #2.0 -; CHECK: orr w1, wzr, #0x1 +; CHECK: mov w1, #1 ; CHECK: bl variadic ret void } diff --git a/test/CodeGen/AArch64/arm64-abi.ll b/test/CodeGen/AArch64/arm64-abi.ll index af99734e6a6e..1a44a000d302 100644 --- a/test/CodeGen/AArch64/arm64-abi.ll +++ b/test/CodeGen/AArch64/arm64-abi.ll @@ -145,7 +145,7 @@ entry: ; CHECK-LABEL: test4 ; CHECK: str [[REG_1:d[0-9]+]], [sp, #8] ; CHECK: str [[REG_2:w[0-9]+]], [sp] -; CHECK: orr w0, wzr, #0x3 +; CHECK: mov w0, #3 %0 = load double, double* %in, align 8 %call = tail call double @args_f64(double 3.000000e+00, double %0, double %0, double %0, double %0, double %0, double %0, double %0, diff --git a/test/CodeGen/AArch64/arm64-abi_align.ll b/test/CodeGen/AArch64/arm64-abi_align.ll index 836b7b8adc55..7db3ea76de05 100644 --- a/test/CodeGen/AArch64/arm64-abi_align.ll +++ b/test/CodeGen/AArch64/arm64-abi_align.ll @@ -294,7 +294,7 @@ entry: ; Space for s1 is allocated at fp-24 = sp+56 ; FAST: sub x[[A:[0-9]+]], x29, #24 ; Call memcpy with size = 24 (0x18) -; FAST: orr {{x[0-9]+}}, xzr, #0x18 +; FAST: mov {{x[0-9]+}}, #24 ; Space for s2 is allocated at sp+32 ; FAST: add x[[A:[0-9]+]], sp, #32 ; FAST: bl _memcpy @@ -337,7 +337,7 @@ entry: ; Space for s1 is allocated at fp-24 ; FAST: sub x[[A:[0-9]+]], x29, #24 ; Call memcpy with size = 24 (0x18) -; FAST: orr {{x[0-9]+}}, xzr, #0x18 +; FAST: mov {{x[0-9]+}}, #24 ; FAST: bl _memcpy ; Space for s2 is allocated at fp-48 ; FAST: sub x[[B:[0-9]+]], x29, #48 @@ -515,7 +515,7 @@ entry: ; FAST-LABEL: i64_split ; FAST: ldr x7, [{{x[0-9]+}}] ; FAST: mov x[[R0:[0-9]+]], sp -; FAST: orr w[[R1:[0-9]+]], wzr, #0x8 +; FAST: mov w[[R1:[0-9]+]], #8 ; FAST: str w[[R1]], {{\[}}x[[R0]]{{\]}} %0 = load i64, i64* bitcast (%struct.s41* @g41 to i64*), align 16 %call = tail call i32 @callee_i64(i32 1, i32 2, i32 3, i32 4, i32 5, diff --git a/test/CodeGen/AArch64/arm64-addrmode.ll b/test/CodeGen/AArch64/arm64-addrmode.ll index 16f8d0160633..6f7f97ef618c 100644 --- a/test/CodeGen/AArch64/arm64-addrmode.ll +++ b/test/CodeGen/AArch64/arm64-addrmode.ll @@ -36,7 +36,7 @@ define void @t3(i64* %object) { ; base + unsigned offset (> imm12 * size of type in bytes) ; CHECK: @t4 -; CHECK: orr w[[NUM:[0-9]+]], wzr, #0x8000 +; CHECK: mov w[[NUM:[0-9]+]], #32768 ; CHECK: ldr xzr, [x0, x[[NUM]]] ; CHECK: ret define void @t4(i64* %object) { @@ -58,7 +58,7 @@ define void @t5(i64 %a) { ; base + reg + imm ; CHECK: @t6 ; CHECK: add [[ADDREG:x[0-9]+]], x1, x0, lsl #3 -; CHECK-NEXT: orr w[[NUM:[0-9]+]], wzr, #0x8000 +; CHECK-NEXT: mov w[[NUM:[0-9]+]], #32768 ; CHECK: ldr xzr, [x{{[0-9]+}}, x[[NUM]]] ; CHECK: ret define void @t6(i64 %a, i64* %object) { @@ -71,7 +71,7 @@ define void @t6(i64 %a, i64* %object) { ; Test base + wide immediate define void @t7(i64 %a) { ; CHECK-LABEL: t7: -; CHECK: orr w[[NUM:[0-9]+]], wzr, #0xffff +; CHECK: mov w[[NUM:[0-9]+]], #65535 ; CHECK-NEXT: ldr xzr, [x0, x[[NUM]]] %1 = add i64 %a, 65535 ;0xffff %2 = inttoptr i64 %1 to i64* diff --git a/test/CodeGen/AArch64/arm64-build-vector.ll b/test/CodeGen/AArch64/arm64-build-vector.ll index a08cb8845005..516da6f919f9 100644 --- a/test/CodeGen/AArch64/arm64-build-vector.ll +++ b/test/CodeGen/AArch64/arm64-build-vector.ll @@ -69,7 +69,7 @@ define void @widen_f16_build_vector(half* %addr) { define <1 x i64> @single_element_vector_i64(<1 x i64> %arg) { ; CHECK-LABEL: single_element_vector_i64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: orr w8, wzr, #0x1 +; CHECK-NEXT: mov w8, #1 ; CHECK-NEXT: fmov d1, x8 ; CHECK-NEXT: add d0, d0, 
d1 ; CHECK-NEXT: ret diff --git a/test/CodeGen/AArch64/arm64-ccmp.ll b/test/CodeGen/AArch64/arm64-ccmp.ll index 6b497e8f7bfd..88a512e08979 100644 --- a/test/CodeGen/AArch64/arm64-ccmp.ll +++ b/test/CodeGen/AArch64/arm64-ccmp.ll @@ -111,7 +111,7 @@ if.end: ; preds = %if.then, %lor.lhs.f ; CHECK: b.le [[BLOCK:LBB[0-9_]+]] ; CHECK: [[BLOCK]]: ; CHECK: bl _foo -; CHECK: orr w0, wzr, #0x7 +; CHECK: mov w0, #7 define i32 @speculate_division(i32 %a, i32 %b) nounwind ssp { entry: %cmp = icmp sgt i32 %a, 0 @@ -323,7 +323,7 @@ define i64 @gccbug(i64 %x0, i64 %x1) { ; CHECK: cmp x0, #2 ; CHECK-NEXT: ccmp x0, #4, #4, ne ; CHECK-NEXT: ccmp x1, #0, #0, eq -; CHECK-NEXT: orr w[[REGNUM:[0-9]+]], wzr, #0x1 +; CHECK-NEXT: mov w[[REGNUM:[0-9]+]], #1 ; CHECK-NEXT: cinc x0, x[[REGNUM]], eq ; CHECK-NEXT: ret %cmp0 = icmp eq i64 %x1, 0 diff --git a/test/CodeGen/AArch64/arm64-coalescing-MOVi32imm.ll b/test/CodeGen/AArch64/arm64-coalescing-MOVi32imm.ll index 528d2538bb4a..8550db9e6917 100644 --- a/test/CodeGen/AArch64/arm64-coalescing-MOVi32imm.ll +++ b/test/CodeGen/AArch64/arm64-coalescing-MOVi32imm.ll @@ -1,8 +1,8 @@ ; RUN: llc < %s | FileCheck %s -; CHECK: orr w0, wzr, #0x1 +; CHECK: mov w0, #1 ; CHECK-NEXT: bl foo -; CHECK-NEXT: orr w0, wzr, #0x1 +; CHECK-NEXT: mov w0, #1 ; CHECK-NEXT: bl foo target triple = "aarch64--linux-android" diff --git a/test/CodeGen/AArch64/arm64-csel.ll b/test/CodeGen/AArch64/arm64-csel.ll index 0f3b7746e5d1..32d3119bcce7 100644 --- a/test/CodeGen/AArch64/arm64-csel.ll +++ b/test/CodeGen/AArch64/arm64-csel.ll @@ -113,7 +113,7 @@ define i32 @foo9(i32 %v) nounwind readnone optsize ssp { entry: ; CHECK-LABEL: foo9: ; CHECK: cmp w0, #0 -; CHECK: orr w[[REG:[0-9]+]], wzr, #0x4 +; CHECK: mov w[[REG:[0-9]+]], #4 ; CHECK: cinv w0, w[[REG]], eq %tobool = icmp ne i32 %v, 0 %cond = select i1 %tobool, i32 4, i32 -5 @@ -124,7 +124,7 @@ define i64 @foo10(i64 %v) nounwind readnone optsize ssp { entry: ; CHECK-LABEL: foo10: ; CHECK: cmp x0, #0 -; CHECK: orr w[[REG:[0-9]+]], wzr, #0x4 +; CHECK: mov w[[REG:[0-9]+]], #4 ; CHECK: cinv x0, x[[REG]], eq %tobool = icmp ne i64 %v, 0 %cond = select i1 %tobool, i64 4, i64 -5 @@ -135,7 +135,7 @@ define i32 @foo11(i32 %v) nounwind readnone optsize ssp { entry: ; CHECK-LABEL: foo11: ; CHECK: cmp w0, #0 -; CHECK: orr w[[REG:[0-9]+]], wzr, #0x4 +; CHECK: mov w[[REG:[0-9]+]], #4 ; CHECK: cneg w0, w[[REG]], eq %tobool = icmp ne i32 %v, 0 %cond = select i1 %tobool, i32 4, i32 -4 @@ -146,7 +146,7 @@ define i64 @foo12(i64 %v) nounwind readnone optsize ssp { entry: ; CHECK-LABEL: foo12: ; CHECK: cmp x0, #0 -; CHECK: orr w[[REG:[0-9]+]], wzr, #0x4 +; CHECK: mov w[[REG:[0-9]+]], #4 ; CHECK: cneg x0, x[[REG]], eq %tobool = icmp ne i64 %v, 0 %cond = select i1 %tobool, i64 4, i64 -4 @@ -179,7 +179,7 @@ define i32 @foo15(i32 %a, i32 %b) nounwind readnone optsize ssp { entry: ; CHECK-LABEL: foo15: ; CHECK: cmp w0, w1 -; CHECK: orr w[[REG:[0-9]+]], wzr, #0x1 +; CHECK: mov w[[REG:[0-9]+]], #1 ; CHECK: cinc w0, w[[REG]], gt %cmp = icmp sgt i32 %a, %b %. = select i1 %cmp, i32 2, i32 1 @@ -190,7 +190,7 @@ define i32 @foo16(i32 %a, i32 %b) nounwind readnone optsize ssp { entry: ; CHECK-LABEL: foo16: ; CHECK: cmp w0, w1 -; CHECK: orr w[[REG:[0-9]+]], wzr, #0x1 +; CHECK: mov w[[REG:[0-9]+]], #1 ; CHECK: cinc w0, w[[REG]], le %cmp = icmp sgt i32 %a, %b %. 
= select i1 %cmp, i32 1, i32 2 @@ -201,7 +201,7 @@ define i64 @foo17(i64 %a, i64 %b) nounwind readnone optsize ssp { entry: ; CHECK-LABEL: foo17: ; CHECK: cmp x0, x1 -; CHECK: orr w[[REG:[0-9]+]], wzr, #0x1 +; CHECK: mov w[[REG:[0-9]+]], #1 ; CHECK: cinc x0, x[[REG]], gt %cmp = icmp sgt i64 %a, %b %. = select i1 %cmp, i64 2, i64 1 @@ -212,7 +212,7 @@ define i64 @foo18(i64 %a, i64 %b) nounwind readnone optsize ssp { entry: ; CHECK-LABEL: foo18: ; CHECK: cmp x0, x1 -; CHECK: orr w[[REG:[0-9]+]], wzr, #0x1 +; CHECK: mov w[[REG:[0-9]+]], #1 ; CHECK: cinc x0, x[[REG]], le %cmp = icmp sgt i64 %a, %b %. = select i1 %cmp, i64 1, i64 2 @@ -233,7 +233,7 @@ entry: define i32 @foo20(i32 %x) { ; CHECK-LABEL: foo20: ; CHECK: cmp w0, #5 -; CHECK: orr w[[REG:[0-9]+]], wzr, #0x6 +; CHECK: mov w[[REG:[0-9]+]], #6 ; CHECK: csinc w0, w[[REG]], wzr, eq %cmp = icmp eq i32 %x, 5 %res = select i1 %cmp, i32 6, i32 1 @@ -243,7 +243,7 @@ define i32 @foo20(i32 %x) { define i64 @foo21(i64 %x) { ; CHECK-LABEL: foo21: ; CHECK: cmp x0, #5 -; CHECK: orr w[[REG:[0-9]+]], wzr, #0x6 +; CHECK: mov w[[REG:[0-9]+]], #6 ; CHECK: csinc x0, x[[REG]], xzr, eq %cmp = icmp eq i64 %x, 5 %res = select i1 %cmp, i64 6, i64 1 @@ -253,7 +253,7 @@ define i64 @foo21(i64 %x) { define i32 @foo22(i32 %x) { ; CHECK-LABEL: foo22: ; CHECK: cmp w0, #5 -; CHECK: orr w[[REG:[0-9]+]], wzr, #0x6 +; CHECK: mov w[[REG:[0-9]+]], #6 ; CHECK: csinc w0, w[[REG]], wzr, ne %cmp = icmp eq i32 %x, 5 %res = select i1 %cmp, i32 1, i32 6 @@ -263,7 +263,7 @@ define i32 @foo22(i32 %x) { define i64 @foo23(i64 %x) { ; CHECK-LABEL: foo23: ; CHECK: cmp x0, #5 -; CHECK: orr w[[REG:[0-9]+]], wzr, #0x6 +; CHECK: mov w[[REG:[0-9]+]], #6 ; CHECK: csinc x0, x[[REG]], xzr, ne %cmp = icmp eq i64 %x, 5 %res = select i1 %cmp, i64 1, i64 6 diff --git a/test/CodeGen/AArch64/arm64-fast-isel-call.ll b/test/CodeGen/AArch64/arm64-fast-isel-call.ll index dc1aac8409cc..6b5799bdefd9 100644 --- a/test/CodeGen/AArch64/arm64-fast-isel-call.ll +++ b/test/CodeGen/AArch64/arm64-fast-isel-call.ll @@ -81,14 +81,14 @@ entry: ; CHECK-LABEL: t2 ; CHECK: mov [[REG1:x[0-9]+]], xzr ; CHECK: mov x0, [[REG1]] -; CHECK: orr w1, wzr, #0xfffffff8 -; CHECK: orr [[REG2:w[0-9]+]], wzr, #0x3ff +; CHECK: mov w1, #-8 +; CHECK: mov [[REG2:w[0-9]+]], #1023 ; CHECK: uxth w2, [[REG2]] -; CHECK: orr [[REG3:w[0-9]+]], wzr, #0x2 +; CHECK: mov [[REG3:w[0-9]+]], #2 ; CHECK: sxtb w3, [[REG3]] ; CHECK: mov [[REG4:w[0-9]+]], wzr ; CHECK: and w4, [[REG4]], #0x1 -; CHECK: orr [[REG5:w[0-9]+]], wzr, #0x1 +; CHECK: mov [[REG5:w[0-9]+]], #1 ; CHECK: and w5, [[REG5]], #0x1 ; CHECK: bl _func2 %call = call i32 @func2(i64 zeroext 0, i32 signext -8, i16 zeroext 1023, i8 signext -254, i1 zeroext 0, i1 zeroext 1) diff --git a/test/CodeGen/AArch64/arm64-fast-isel-fcmp.ll b/test/CodeGen/AArch64/arm64-fast-isel-fcmp.ll index 51ec377ccaf4..9dc306b0e2b6 100644 --- a/test/CodeGen/AArch64/arm64-fast-isel-fcmp.ll +++ b/test/CodeGen/AArch64/arm64-fast-isel-fcmp.ll @@ -156,7 +156,7 @@ define zeroext i1 @fcmp_une(float %a, float %b) { define zeroext i1 @fcmp_true(float %a) { ; CHECK-LABEL: fcmp_true -; CHECK: orr {{w[0-9]+}}, wzr, #0x1 +; CHECK: mov {{w[0-9]+}}, #1 %1 = fcmp ueq float %a, %a ret i1 %1 } diff --git a/test/CodeGen/AArch64/arm64-fast-isel.ll b/test/CodeGen/AArch64/arm64-fast-isel.ll index daccc86c709d..7dc849b7d2c4 100644 --- a/test/CodeGen/AArch64/arm64-fast-isel.ll +++ b/test/CodeGen/AArch64/arm64-fast-isel.ll @@ -95,8 +95,8 @@ declare void @llvm.trap() nounwind define void @ands(i32* %addr) { ; CHECK-LABEL: ands: ; CHECK: tst 
[[COND:w[0-9]+]], #0x1 -; CHECK-NEXT: orr w{{[0-9]+}}, wzr, #0x2 -; CHECK-NEXT: orr w{{[0-9]+}}, wzr, #0x1 +; CHECK-NEXT: mov w{{[0-9]+}}, #2 +; CHECK-NEXT: mov w{{[0-9]+}}, #1 ; CHECK-NEXT: csel [[COND]], entry: %cond91 = select i1 undef, i32 1, i32 2 diff --git a/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll b/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll index b63e739f577d..8b6a4cae7ed5 100644 --- a/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll +++ b/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll @@ -413,7 +413,7 @@ define i8* @test_v16i8_post_imm_st1_lane(<16 x i8> %in, i8* %addr) { define i8* @test_v16i8_post_reg_st1_lane(<16 x i8> %in, i8* %addr) { ; CHECK-LABEL: test_v16i8_post_reg_st1_lane: -; CHECK: orr w[[OFFSET:[0-9]+]], wzr, #0x2 +; CHECK: mov w[[OFFSET:[0-9]+]], #2 ; CHECK: st1.b { v0 }[3], [x0], x[[OFFSET]] %elt = extractelement <16 x i8> %in, i32 3 store i8 %elt, i8* %addr @@ -435,7 +435,7 @@ define i16* @test_v8i16_post_imm_st1_lane(<8 x i16> %in, i16* %addr) { define i16* @test_v8i16_post_reg_st1_lane(<8 x i16> %in, i16* %addr) { ; CHECK-LABEL: test_v8i16_post_reg_st1_lane: -; CHECK: orr w[[OFFSET:[0-9]+]], wzr, #0x4 +; CHECK: mov w[[OFFSET:[0-9]+]], #4 ; CHECK: st1.h { v0 }[3], [x0], x[[OFFSET]] %elt = extractelement <8 x i16> %in, i32 3 store i16 %elt, i16* %addr @@ -456,7 +456,7 @@ define i32* @test_v4i32_post_imm_st1_lane(<4 x i32> %in, i32* %addr) { define i32* @test_v4i32_post_reg_st1_lane(<4 x i32> %in, i32* %addr) { ; CHECK-LABEL: test_v4i32_post_reg_st1_lane: -; CHECK: orr w[[OFFSET:[0-9]+]], wzr, #0x8 +; CHECK: mov w[[OFFSET:[0-9]+]], #8 ; CHECK: st1.s { v0 }[3], [x0], x[[OFFSET]] %elt = extractelement <4 x i32> %in, i32 3 store i32 %elt, i32* %addr @@ -477,7 +477,7 @@ define float* @test_v4f32_post_imm_st1_lane(<4 x float> %in, float* %addr) { define float* @test_v4f32_post_reg_st1_lane(<4 x float> %in, float* %addr) { ; CHECK-LABEL: test_v4f32_post_reg_st1_lane: -; CHECK: orr w[[OFFSET:[0-9]+]], wzr, #0x8 +; CHECK: mov w[[OFFSET:[0-9]+]], #8 ; CHECK: st1.s { v0 }[3], [x0], x[[OFFSET]] %elt = extractelement <4 x float> %in, i32 3 store float %elt, float* %addr @@ -498,7 +498,7 @@ define i64* @test_v2i64_post_imm_st1_lane(<2 x i64> %in, i64* %addr) { define i64* @test_v2i64_post_reg_st1_lane(<2 x i64> %in, i64* %addr) { ; CHECK-LABEL: test_v2i64_post_reg_st1_lane: -; CHECK: orr w[[OFFSET:[0-9]+]], wzr, #0x10 +; CHECK: mov w[[OFFSET:[0-9]+]], #16 ; CHECK: st1.d { v0 }[1], [x0], x[[OFFSET]] %elt = extractelement <2 x i64> %in, i64 1 store i64 %elt, i64* %addr @@ -519,7 +519,7 @@ define double* @test_v2f64_post_imm_st1_lane(<2 x double> %in, double* %addr) { define double* @test_v2f64_post_reg_st1_lane(<2 x double> %in, double* %addr) { ; CHECK-LABEL: test_v2f64_post_reg_st1_lane: -; CHECK: orr w[[OFFSET:[0-9]+]], wzr, #0x10 +; CHECK: mov w[[OFFSET:[0-9]+]], #16 ; CHECK: st1.d { v0 }[1], [x0], x[[OFFSET]] %elt = extractelement <2 x double> %in, i32 1 store double %elt, double* %addr @@ -540,7 +540,7 @@ define i8* @test_v8i8_post_imm_st1_lane(<8 x i8> %in, i8* %addr) { define i8* @test_v8i8_post_reg_st1_lane(<8 x i8> %in, i8* %addr) { ; CHECK-LABEL: test_v8i8_post_reg_st1_lane: -; CHECK: orr w[[OFFSET:[0-9]+]], wzr, #0x2 +; CHECK: mov w[[OFFSET:[0-9]+]], #2 ; CHECK: st1.b { v0 }[3], [x0], x[[OFFSET]] %elt = extractelement <8 x i8> %in, i32 3 store i8 %elt, i8* %addr @@ -561,7 +561,7 @@ define i16* @test_v4i16_post_imm_st1_lane(<4 x i16> %in, i16* %addr) { define i16* @test_v4i16_post_reg_st1_lane(<4 x i16> %in, i16* %addr) { ; CHECK-LABEL: 
test_v4i16_post_reg_st1_lane: -; CHECK: orr w[[OFFSET:[0-9]+]], wzr, #0x4 +; CHECK: mov w[[OFFSET:[0-9]+]], #4 ; CHECK: st1.h { v0 }[3], [x0], x[[OFFSET]] %elt = extractelement <4 x i16> %in, i32 3 store i16 %elt, i16* %addr @@ -582,7 +582,7 @@ define i32* @test_v2i32_post_imm_st1_lane(<2 x i32> %in, i32* %addr) { define i32* @test_v2i32_post_reg_st1_lane(<2 x i32> %in, i32* %addr) { ; CHECK-LABEL: test_v2i32_post_reg_st1_lane: -; CHECK: orr w[[OFFSET:[0-9]+]], wzr, #0x8 +; CHECK: mov w[[OFFSET:[0-9]+]], #8 ; CHECK: st1.s { v0 }[1], [x0], x[[OFFSET]] %elt = extractelement <2 x i32> %in, i32 1 store i32 %elt, i32* %addr @@ -603,7 +603,7 @@ define float* @test_v2f32_post_imm_st1_lane(<2 x float> %in, float* %addr) { define float* @test_v2f32_post_reg_st1_lane(<2 x float> %in, float* %addr) { ; CHECK-LABEL: test_v2f32_post_reg_st1_lane: -; CHECK: orr w[[OFFSET:[0-9]+]], wzr, #0x8 +; CHECK: mov w[[OFFSET:[0-9]+]], #8 ; CHECK: st1.s { v0 }[1], [x0], x[[OFFSET]] %elt = extractelement <2 x float> %in, i32 1 store float %elt, float* %addr diff --git a/test/CodeGen/AArch64/arm64-inline-asm.ll b/test/CodeGen/AArch64/arm64-inline-asm.ll index 848b87fd2cfb..b1ec988c5f14 100644 --- a/test/CodeGen/AArch64/arm64-inline-asm.ll +++ b/test/CodeGen/AArch64/arm64-inline-asm.ll @@ -221,7 +221,7 @@ define void @test_zero_reg(i32* %addr) { ; CHECK: USE(wzr) tail call void asm sideeffect "USE(${0:w})", "zr"(i32 1) -; CHECK: orr [[VAL1:w[0-9]+]], wzr, #0x1 +; CHECK: mov [[VAL1:w[0-9]+]], #1 ; CHECK: USE([[VAL1]]) tail call void asm sideeffect "USE($0), USE($1)", "z,z"(i32 0, i32 0) nounwind diff --git a/test/CodeGen/AArch64/arm64-memcpy-inline.ll b/test/CodeGen/AArch64/arm64-memcpy-inline.ll index f6d66b692c35..9c27d1561c90 100644 --- a/test/CodeGen/AArch64/arm64-memcpy-inline.ll +++ b/test/CodeGen/AArch64/arm64-memcpy-inline.ll @@ -61,7 +61,7 @@ entry: define void @t4(i8* nocapture %C) nounwind { entry: ; CHECK-LABEL: t4: -; CHECK: orr [[REG5:w[0-9]+]], wzr, #0x20 +; CHECK: mov [[REG5:w[0-9]+]], #32 ; CHECK: strh [[REG5]], [x0, #16] ; CHECK: ldr [[REG6:q[0-9]+]], [x{{[0-9]+}}] ; CHECK: str [[REG6]], [x0] diff --git a/test/CodeGen/AArch64/arm64-misaligned-memcpy-inline.ll b/test/CodeGen/AArch64/arm64-misaligned-memcpy-inline.ll index 8216e3d8e5ba..8860ffa2962d 100644 --- a/test/CodeGen/AArch64/arm64-misaligned-memcpy-inline.ll +++ b/test/CodeGen/AArch64/arm64-misaligned-memcpy-inline.ll @@ -4,7 +4,7 @@ ; strict-alignment is turned on. define void @t0(i8* %out, i8* %in) { ; CHECK-LABEL: t0: -; CHECK: orr w2, wzr, #0x10 +; CHECK: mov w2, #16 ; CHECK-NEXT: bl _memcpy entry: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %out, i8* %in, i64 16, i1 false) diff --git a/test/CodeGen/AArch64/arm64-movi.ll b/test/CodeGen/AArch64/arm64-movi.ll index 3063b500a324..9a1efba913af 100644 --- a/test/CodeGen/AArch64/arm64-movi.ll +++ b/test/CodeGen/AArch64/arm64-movi.ll @@ -42,21 +42,19 @@ define i64 @test64_64_manybits() nounwind { } ; 64-bit immed with 64-bit pattern size, one bit. -; FIXME: Prefer movz, so it prints as "mov". define i64 @test64_64_onebit() nounwind { ; CHECK-LABEL: test64_64_onebit: ; CHECK: // %bb.0: -; CHECK-NEXT: orr x0, xzr, #0x4000000000 +; CHECK-NEXT: mov x0, #274877906944 ; CHECK-NEXT: ret ret i64 274877906944 } ; 32-bit immed with 32-bit pattern size, rotated by 16. -; FIXME: Prefer "movz" instead (so we print as "mov"). 
define i32 @test32_32_rot16() nounwind { ; CHECK-LABEL: test32_32_rot16: ; CHECK: // %bb.0: -; CHECK-NEXT: orr w0, wzr, #0xff0000 +; CHECK-NEXT: mov w0, #16711680 ; CHECK-NEXT: ret ret i32 16711680 } diff --git a/test/CodeGen/AArch64/arm64-neon-compare-instructions.ll b/test/CodeGen/AArch64/arm64-neon-compare-instructions.ll index bb3c36adee55..8af3807941e1 100644 --- a/test/CodeGen/AArch64/arm64-neon-compare-instructions.ll +++ b/test/CodeGen/AArch64/arm64-neon-compare-instructions.ll @@ -906,7 +906,7 @@ define <4 x i32> @cmhsz4xi32(<4 x i32> %A) { } define <2 x i64> @cmhsz2xi64(<2 x i64> %A) { -;CHECK: orr w[[TWO:[0-9]+]], wzr, #0x2 +;CHECK: mov w[[TWO:[0-9]+]], #2 ;CHECK-NEXT: dup v[[ZERO:[0-9]+]].2d, x[[TWO]] ;CHECK-NEXT: cmhs {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, v[[ZERO]].2d %tmp3 = icmp uge <2 x i64> %A, <i64 2, i64 2> @@ -964,7 +964,7 @@ define <4 x i32> @cmhiz4xi32(<4 x i32> %A) { } define <2 x i64> @cmhiz2xi64(<2 x i64> %A) { -;CHECK: orr w[[ONE:[0-9]+]], wzr, #0x1 +;CHECK: mov w[[ONE:[0-9]+]], #1 ;CHECK-NEXT: dup v[[ZERO:[0-9]+]].2d, x[[ONE]] ;CHECK-NEXT: cmhi {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, v[[ZERO]].2d %tmp3 = icmp ugt <2 x i64> %A, <i64 1, i64 1> @@ -1105,7 +1105,7 @@ define <4 x i32> @cmloz4xi32(<4 x i32> %A) { define <2 x i64> @cmloz2xi64(<2 x i64> %A) { ; Using registers other than v0, v1 are possible, but would be odd. ; LO implemented as HI, so check reversed operands. -;CHECK: orr w[[TWO:[0-9]+]], wzr, #0x2 +;CHECK: mov w[[TWO:[0-9]+]], #2 ;CHECK-NEXT: dup v[[ZERO:[0-9]+]].2d, x[[TWO]] ;CHECK-NEXT: cmhi {{v[0-9]+}}.2d, v[[ZERO]].2d, v0.2d %tmp3 = icmp ult <2 x i64> %A, <i64 2, i64 2> diff --git a/test/CodeGen/AArch64/arm64-patchpoint-webkit_jscc.ll b/test/CodeGen/AArch64/arm64-patchpoint-webkit_jscc.ll index a7d92153f515..aa47aeb9db5b 100644 --- a/test/CodeGen/AArch64/arm64-patchpoint-webkit_jscc.ll +++ b/test/CodeGen/AArch64/arm64-patchpoint-webkit_jscc.ll @@ -38,11 +38,11 @@ entry: define i64 @jscall_patchpoint_codegen2(i64 %callee) { entry: ; CHECK-LABEL: jscall_patchpoint_codegen2: -; CHECK: orr w[[REG:[0-9]+]], wzr, #0x6 +; CHECK: mov w[[REG:[0-9]+]], #6 ; CHECK-NEXT: str x[[REG]], [sp, #24] -; CHECK-NEXT: orr w[[REG:[0-9]+]], wzr, #0x4 +; CHECK-NEXT: mov w[[REG:[0-9]+]], #4 ; CHECK-NEXT: str w[[REG]], [sp, #16] -; CHECK-NEXT: orr w[[REG:[0-9]+]], wzr, #0x2 +; CHECK-NEXT: mov w[[REG:[0-9]+]], #2 ; CHECK-NEXT: str x[[REG]], [sp] ; CHECK: Ltmp ; CHECK-NEXT: mov x16, #281470681743360 @@ -50,11 +50,11 @@ entry: ; CHECK-NEXT: movk x16, #48879 ; CHECK-NEXT: blr x16 ; FAST-LABEL: jscall_patchpoint_codegen2: -; FAST: orr [[REG1:x[0-9]+]], xzr, #0x2 +; FAST: mov [[REG1:x[0-9]+]], #2 ; FAST-NEXT: str [[REG1]], [sp] -; FAST-NEXT: orr [[REG2:w[0-9]+]], wzr, #0x4 +; FAST-NEXT: mov [[REG2:w[0-9]+]], #4 ; FAST-NEXT: str [[REG2]], [sp, #16] -; FAST-NEXT: orr [[REG3:x[0-9]+]], xzr, #0x6 +; FAST-NEXT: mov [[REG3:x[0-9]+]], #6 ; FAST-NEXT: str [[REG3]], [sp, #24] ; FAST: Ltmp ; FAST-NEXT: mov x16, #281470681743360 @@ -72,13 +72,13 @@ entry: ; CHECK-LABEL: jscall_patchpoint_codegen3: ; CHECK: mov w[[REG:[0-9]+]], #10 ; CHECK-NEXT: str x[[REG]], [sp, #48] -; CHECK-NEXT: orr w[[REG:[0-9]+]], wzr, #0x8 +; CHECK-NEXT: mov w[[REG:[0-9]+]], #8 ; CHECK-NEXT: str w[[REG]], [sp, #36] -; CHECK-NEXT: orr w[[REG:[0-9]+]], wzr, #0x6 +; CHECK-NEXT: mov w[[REG:[0-9]+]], #6 ; CHECK-NEXT: str x[[REG]], [sp, #24] -; CHECK-NEXT: orr w[[REG:[0-9]+]], wzr, #0x4 +; CHECK-NEXT: mov w[[REG:[0-9]+]], #4 ; CHECK-NEXT: str w[[REG]], [sp, #16] -; CHECK-NEXT: orr w[[REG:[0-9]+]], wzr, #0x2 +; CHECK-NEXT: mov w[[REG:[0-9]+]], #2 ; CHECK-NEXT: str x[[REG]], [sp] ;
CHECK: Ltmp ; CHECK-NEXT: mov x16, #281470681743360 @@ -86,13 +86,13 @@ entry: ; CHECK-NEXT: movk x16, #48879 ; CHECK-NEXT: blr x16 ; FAST-LABEL: jscall_patchpoint_codegen3: -; FAST: orr [[REG1:x[0-9]+]], xzr, #0x2 +; FAST: mov [[REG1:x[0-9]+]], #2 ; FAST-NEXT: str [[REG1]], [sp] -; FAST-NEXT: orr [[REG2:w[0-9]+]], wzr, #0x4 +; FAST-NEXT: mov [[REG2:w[0-9]+]], #4 ; FAST-NEXT: str [[REG2]], [sp, #16] -; FAST-NEXT: orr [[REG3:x[0-9]+]], xzr, #0x6 +; FAST-NEXT: mov [[REG3:x[0-9]+]], #6 ; FAST-NEXT: str [[REG3]], [sp, #24] -; FAST-NEXT: orr [[REG4:w[0-9]+]], wzr, #0x8 +; FAST-NEXT: mov [[REG4:w[0-9]+]], #8 ; FAST-NEXT: str [[REG4]], [sp, #36] ; FAST-NEXT: mov [[REG5:x[0-9]+]], #10 ; FAST-NEXT: str [[REG5]], [sp, #48] diff --git a/test/CodeGen/AArch64/atomic-ops-lse.ll b/test/CodeGen/AArch64/atomic-ops-lse.ll index bebb285df4ad..a75614d20f7b 100644 --- a/test/CodeGen/AArch64/atomic-ops-lse.ll +++ b/test/CodeGen/AArch64/atomic-ops-lse.ll @@ -821,7 +821,7 @@ define i8 @test_atomic_load_sub_i8_neg_imm() nounwind { ; CHECK-NOT: dmb ; CHECK: adrp [[TMPADDR:x[0-9]+]], var8 ; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8 -; CHECK: orr w[[IMM:[0-9]+]], wzr, #0x1 +; CHECK: mov w[[IMM:[0-9]+]], #1 ; CHECK: ldaddalb w[[IMM]], w[[NEW:[0-9]+]], [x[[ADDR]]] ; CHECK-NOT: dmb @@ -835,7 +835,7 @@ define i16 @test_atomic_load_sub_i16_neg_imm() nounwind { ; CHECK-NOT: dmb ; CHECK: adrp [[TMPADDR:x[0-9]+]], var16 ; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16 -; CHECK: orr w[[IMM:[0-9]+]], wzr, #0x1 +; CHECK: mov w[[IMM:[0-9]+]], #1 ; CHECK: ldaddalh w[[IMM]], w[[NEW:[0-9]+]], [x[[ADDR]]] ; CHECK-NOT: dmb @@ -849,7 +849,7 @@ define i32 @test_atomic_load_sub_i32_neg_imm() nounwind { ; CHECK-NOT: dmb ; CHECK: adrp [[TMPADDR:x[0-9]+]], var32 ; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32 -; CHECK: orr w[[IMM:[0-9]+]], wzr, #0x1 +; CHECK: mov w[[IMM:[0-9]+]], #1 ; CHECK: ldaddal w[[IMM]], w[[NEW:[0-9]+]], [x[[ADDR]]] ; CHECK-NOT: dmb @@ -863,7 +863,7 @@ define i64 @test_atomic_load_sub_i64_neg_imm() nounwind { ; CHECK-NOT: dmb ; CHECK: adrp [[TMPADDR:x[0-9]+]], var64 ; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64 -; CHECK: orr w[[IMM:[0-9]+]], wzr, #0x1 +; CHECK: mov w[[IMM:[0-9]+]], #1 ; CHECK: ldaddal x[[IMM]], x[[NEW:[0-9]+]], [x[[ADDR]]] ; CHECK-NOT: dmb @@ -984,7 +984,7 @@ define i8 @test_atomic_load_and_i8_inv_imm() nounwind { ; CHECK-NOT: dmb ; CHECK: adrp [[TMPADDR:x[0-9]+]], var8 ; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8 -; CHECK: orr w[[CONST:[0-9]+]], wzr, #0x1 +; CHECK: mov w[[CONST:[0-9]+]], #1 ; CHECK: ldclralb w[[CONST]], w[[NEW:[0-9]+]], [x[[ADDR]]] ; CHECK-NOT: dmb ret i8 %old @@ -996,7 +996,7 @@ define i16 @test_atomic_load_and_i16_inv_imm() nounwind { ; CHECK-NOT: dmb ; CHECK: adrp [[TMPADDR:x[0-9]+]], var16 ; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16 -; CHECK: orr w[[CONST:[0-9]+]], wzr, #0x1 +; CHECK: mov w[[CONST:[0-9]+]], #1 ; CHECK: ldclralh w[[CONST]], w[[NEW:[0-9]+]], [x[[ADDR]]] ; CHECK-NOT: dmb ret i16 %old @@ -1008,7 +1008,7 @@ define i32 @test_atomic_load_and_i32_inv_imm() nounwind { ; CHECK-NOT: dmb ; CHECK: adrp [[TMPADDR:x[0-9]+]], var32 ; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32 -; CHECK: orr w[[CONST:[0-9]+]], wzr, #0x1 +; CHECK: mov w[[CONST:[0-9]+]], #1 ; CHECK: ldclral w[[CONST]], w[[NEW:[0-9]+]], [x[[ADDR]]] ; CHECK-NOT: dmb ret i32 %old @@ -1020,7 +1020,7 @@ define i64 @test_atomic_load_and_i64_inv_imm() nounwind { ; CHECK-NOT: dmb ; CHECK: adrp [[TMPADDR:x[0-9]+]], 
var64 ; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64 -; CHECK: orr w[[CONST:[0-9]+]], wzr, #0x1 +; CHECK: mov w[[CONST:[0-9]+]], #1 ; CHECK: ldclral x[[CONST]], x[[NEW:[0-9]+]], [x[[ADDR]]] ; CHECK-NOT: dmb ret i64 %old diff --git a/test/CodeGen/AArch64/branch-relax-asm.ll b/test/CodeGen/AArch64/branch-relax-asm.ll index 7409c84e6180..89d0529c9667 100644 --- a/test/CodeGen/AArch64/branch-relax-asm.ll +++ b/test/CodeGen/AArch64/branch-relax-asm.ll @@ -10,7 +10,7 @@ define i32 @test_asm_length(i32 %in) { ; CHECK: b [[FALSE:LBB[0-9]+_[0-9]+]] ; CHECK: [[TRUE]]: -; CHECK: orr w0, wzr, #0x4 +; CHECK: mov w0, #4 ; CHECK: nop ; CHECK: nop ; CHECK: nop diff --git a/test/CodeGen/AArch64/branch-relax-bcc.ll b/test/CodeGen/AArch64/branch-relax-bcc.ll index 636acf0a8b82..d0b3b0568d77 100644 --- a/test/CodeGen/AArch64/branch-relax-bcc.ll +++ b/test/CodeGen/AArch64/branch-relax-bcc.ll @@ -57,7 +57,7 @@ declare i32 @foo() #0 ; CHECK-NOT: b L ; CHECK: [[IF_END_BB]]: -; CHECK: #0x7 +; CHECK: mov{{.*}}, #7 ; CHECK: ret define i32 @block_split(i32 %a, i32 %b) #0 { entry: diff --git a/test/CodeGen/AArch64/bswap-known-bits.ll b/test/CodeGen/AArch64/bswap-known-bits.ll index e5de7953d1b8..5a3e747859cd 100644 --- a/test/CodeGen/AArch64/bswap-known-bits.ll +++ b/test/CodeGen/AArch64/bswap-known-bits.ll @@ -4,7 +4,7 @@ declare i16 @llvm.bswap.i16(i16) declare i32 @llvm.bswap.i32(i32) ; CHECK-LABEL: @test1 -; CHECK: orr w0, wzr, #0x1 +; CHECK: mov w0, #1 define i1 @test1(i16 %arg) { %a = or i16 %arg, 511 %b = call i16 @llvm.bswap.i16(i16 %a) @@ -14,7 +14,7 @@ define i1 @test1(i16 %arg) { } ; CHECK-LABEL: @test2 -; CHECK: orr w0, wzr, #0x1 +; CHECK: mov w0, #1 define i1 @test2(i16 %arg) { %a = or i16 %arg, 1 %b = call i16 @llvm.bswap.i16(i16 %a) @@ -24,7 +24,7 @@ define i1 @test2(i16 %arg) { } ; CHECK-LABEL: @test3 -; CHECK: orr w0, wzr, #0x1 +; CHECK: mov w0, #1 define i1 @test3(i16 %arg) { %a = or i16 %arg, 256 %b = call i16 @llvm.bswap.i16(i16 %a) @@ -34,7 +34,7 @@ define i1 @test3(i16 %arg) { } ; CHECK-LABEL: @test4 -; CHECK: orr w0, wzr, #0x1 +; CHECK: mov w0, #1 define i1 @test4(i32 %arg) { %a = or i32 %arg, 2147483647 ; i32_MAX %b = call i32 @llvm.bswap.i32(i32 %a) diff --git a/test/CodeGen/AArch64/cmpxchg-idioms.ll b/test/CodeGen/AArch64/cmpxchg-idioms.ll index 5ff3ddfe09a4..088710c67379 100644 --- a/test/CodeGen/AArch64/cmpxchg-idioms.ll +++ b/test/CodeGen/AArch64/cmpxchg-idioms.ll @@ -12,7 +12,7 @@ define i32 @test_return(i32* %p, i32 %oldval, i32 %newval) { ; CHECK: cbnz [[STATUS]], [[LOOP]] ; CHECK-NOT: cmp {{w[0-9]+}}, {{w[0-9]+}} -; CHECK: orr w0, wzr, #0x1 +; CHECK: mov w0, #1 ; CHECK: ret ; CHECK: [[FAILED]]: @@ -39,7 +39,7 @@ define i1 @test_return_bool(i8* %value, i8 %oldValue, i8 %newValue) { ; CHECK-NOT: cmp {{w[0-9]+}}, {{w[0-9]+}} ; FIXME: DAG combine should be able to deal with this. -; CHECK: orr [[TMP:w[0-9]+]], wzr, #0x1 +; CHECK: mov [[TMP:w[0-9]+]], #1 ; CHECK: eor w0, [[TMP]], #0x1 ; CHECK: ret @@ -100,7 +100,7 @@ define i1 @test_conditional2(i32 %a, i32 %b, i32* %c) { ; CHECK: stlxr [[STATUS:w[0-9]+]], w20, [x19] ; CHECK: cbnz [[STATUS]], [[LOOP]] -; CHECK: orr [[STATUS]], wzr, #0x1 +; CHECK: mov [[STATUS]], #1 ; CHECK: b [[PH:LBB[0-9]+_[0-9]+]] ; CHECK: [[FAILED]]: @@ -108,8 +108,8 @@ define i1 @test_conditional2(i32 %a, i32 %b, i32* %c) { ; verify the preheader is simplified by simplifycfg. 
; CHECK: [[PH]]: -; CHECK: orr w22, wzr, #0x2 -; CHECK-NOT: orr w22, wzr, #0x4 +; CHECK: mov w22, #2 +; CHECK-NOT: mov w22, #4 ; CHECK-NOT: cmn w22, #4 ; CHECK: b [[LOOP2:LBB[0-9]+_[0-9]+]] ; CHECK-NOT: b.ne [[LOOP2]] diff --git a/test/CodeGen/AArch64/cond-sel-value-prop.ll b/test/CodeGen/AArch64/cond-sel-value-prop.ll index dd87afce4b00..155e6e377392 100644 --- a/test/CodeGen/AArch64/cond-sel-value-prop.ll +++ b/test/CodeGen/AArch64/cond-sel-value-prop.ll @@ -3,7 +3,7 @@ ; Transform "a == C ? C : x" to "a == C ? a : x" to avoid materializing C. ; CHECK-LABEL: test1: ; CHECK: cmp w[[REG1:[0-9]+]], #2 -; CHECK: orr w[[REG2:[0-9]+]], wzr, #0x7 +; CHECK: mov w[[REG2:[0-9]+]], #7 ; CHECK: csel w0, w[[REG1]], w[[REG2]], eq define i32 @test1(i32 %x) { %cmp = icmp eq i32 %x, 2 @@ -14,7 +14,7 @@ define i32 @test1(i32 %x) { ; Transform "a == C ? C : x" to "a == C ? a : x" to avoid materializing C. ; CHECK-LABEL: test2: ; CHECK: cmp x[[REG1:[0-9]+]], #2 -; CHECK: orr w[[REG2:[0-9]+]], wzr, #0x7 +; CHECK: mov w[[REG2:[0-9]+]], #7 ; CHECK: csel x0, x[[REG1]], x[[REG2]], eq define i64 @test2(i64 %x) { %cmp = icmp eq i64 %x, 2 @@ -25,7 +25,7 @@ define i64 @test2(i64 %x) { ; Transform "a != C ? x : C" to "a != C ? x : a" to avoid materializing C. ; CHECK-LABEL: test3: ; CHECK: cmp x[[REG1:[0-9]+]], #7 -; CHECK: orr w[[REG2:[0-9]+]], wzr, #0x2 +; CHECK: mov w[[REG2:[0-9]+]], #2 ; CHECK: csel x0, x[[REG2]], x[[REG1]], ne define i64 @test3(i64 %x) { %cmp = icmp ne i64 %x, 7 @@ -37,7 +37,7 @@ define i64 @test3(i64 %x) { ; would needlessly extend the live range of x0 when we can just use xzr. ; CHECK-LABEL: test4: ; CHECK: cmp x0, #0 -; CHECK: orr w8, wzr, #0x7 +; CHECK: mov w8, #7 ; CHECK: csel x0, xzr, x8, eq define i64 @test4(i64 %x) { %cmp = icmp eq i64 %x, 0 @@ -50,7 +50,7 @@ define i64 @test4(i64 %x) { ; CSINC to materialize the 1. ; CHECK-LABEL: test5: ; CHECK: cmp x0, #1 -; CHECK: orr w[[REG:[0-9]+]], wzr, #0x7 +; CHECK: mov w[[REG:[0-9]+]], #7 ; CHECK: csinc x0, x[[REG]], xzr, ne define i64 @test5(i64 %x) { %cmp = icmp eq i64 %x, 1 @@ -63,7 +63,7 @@ define i64 @test5(i64 %x) { ; CSINV to materialize the -1. 
; CHECK-LABEL: test6: ; CHECK: cmn x0, #1 -; CHECK: orr w[[REG:[0-9]+]], wzr, #0x7 +; CHECK: mov w[[REG:[0-9]+]], #7 ; CHECK: csinv x0, x[[REG]], xzr, ne define i64 @test6(i64 %x) { %cmp = icmp eq i64 %x, -1 diff --git a/test/CodeGen/AArch64/cond-sel.ll b/test/CodeGen/AArch64/cond-sel.ll index 691cbcf1a5df..fc4b42d6091c 100644 --- a/test/CodeGen/AArch64/cond-sel.ll +++ b/test/CodeGen/AArch64/cond-sel.ll @@ -45,7 +45,7 @@ define void @test_floatcsel(float %lhs32, float %rhs32, double %lhs64, double %r ; CHECK-NOFP-NOT: fcmp %val2 = select i1 %tst2, i64 9, i64 15 store i64 %val2, i64* @var64 -; CHECK: orr w[[CONST15:[0-9]+]], wzr, #0xf +; CHECK: mov w[[CONST15:[0-9]+]], #15 ; CHECK: mov {{[wx]}}[[CONST9:[0-9]+]], #{{9|0x9}} ; CHECK: csel [[MAYBETRUE:x[0-9]+]], x[[CONST9]], x[[CONST15]], eq ; CHECK: csel {{x[0-9]+}}, x[[CONST9]], [[MAYBETRUE]], vs diff --git a/test/CodeGen/AArch64/dag-combine-invaraints.ll b/test/CodeGen/AArch64/dag-combine-invaraints.ll index a2fa1db8a8ac..107f1eb0ccfd 100644 --- a/test/CodeGen/AArch64/dag-combine-invaraints.ll +++ b/test/CodeGen/AArch64/dag-combine-invaraints.ll @@ -24,8 +24,8 @@ main_: ret i32 0 ; CHECK: main: -; CHECK-DAG: mov -; CHECK-DAG: orr +; CHECK-DAG: mov {{.*}}, #15 +; CHECK-DAG: mov {{.*}}, #5 ; CHECK: csel } diff --git a/test/CodeGen/AArch64/extract-bits.ll b/test/CodeGen/AArch64/extract-bits.ll index b1a203445a79..6bec84a14c4e 100644 --- a/test/CodeGen/AArch64/extract-bits.ll +++ b/test/CodeGen/AArch64/extract-bits.ll @@ -21,7 +21,7 @@ define i32 @bextr32_a0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind { ; CHECK-LABEL: bextr32_a0: ; CHECK: // %bb.0: -; CHECK-NEXT: orr w9, wzr, #0x1 +; CHECK-NEXT: mov w9, #1 ; CHECK-NEXT: lsl w9, w9, w2 ; CHECK-NEXT: lsr w8, w0, w1 ; CHECK-NEXT: sub w9, w9, #1 // =1 @@ -37,7 +37,7 @@ define i32 @bextr32_a0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind { define i32 @bextr32_a0_arithmetic(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind { ; CHECK-LABEL: bextr32_a0_arithmetic: ; CHECK: // %bb.0: -; CHECK-NEXT: orr w9, wzr, #0x1 +; CHECK-NEXT: mov w9, #1 ; CHECK-NEXT: lsl w9, w9, w2 ; CHECK-NEXT: asr w8, w0, w1 ; CHECK-NEXT: sub w9, w9, #1 // =1 @@ -53,7 +53,7 @@ define i32 @bextr32_a0_arithmetic(i32 %val, i32 %numskipbits, i32 %numlowbits) n define i32 @bextr32_a1_indexzext(i32 %val, i8 zeroext %numskipbits, i8 zeroext %numlowbits) nounwind { ; CHECK-LABEL: bextr32_a1_indexzext: ; CHECK: // %bb.0: -; CHECK-NEXT: orr w9, wzr, #0x1 +; CHECK-NEXT: mov w9, #1 ; CHECK-NEXT: lsl w9, w9, w2 ; CHECK-NEXT: lsr w8, w0, w1 ; CHECK-NEXT: sub w9, w9, #1 // =1 @@ -72,7 +72,7 @@ define i32 @bextr32_a2_load(i32* %w, i32 %numskipbits, i32 %numlowbits) nounwind ; CHECK-LABEL: bextr32_a2_load: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr w8, [x0] -; CHECK-NEXT: orr w9, wzr, #0x1 +; CHECK-NEXT: mov w9, #1 ; CHECK-NEXT: lsl w9, w9, w2 ; CHECK-NEXT: sub w9, w9, #1 // =1 ; CHECK-NEXT: lsr w8, w8, w1 @@ -90,7 +90,7 @@ define i32 @bextr32_a3_load_indexzext(i32* %w, i8 zeroext %numskipbits, i8 zeroe ; CHECK-LABEL: bextr32_a3_load_indexzext: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr w8, [x0] -; CHECK-NEXT: orr w9, wzr, #0x1 +; CHECK-NEXT: mov w9, #1 ; CHECK-NEXT: lsl w9, w9, w2 ; CHECK-NEXT: sub w9, w9, #1 // =1 ; CHECK-NEXT: lsr w8, w8, w1 @@ -109,7 +109,7 @@ define i32 @bextr32_a3_load_indexzext(i32* %w, i8 zeroext %numskipbits, i8 zeroe define i32 @bextr32_a4_commutative(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind { ; CHECK-LABEL: bextr32_a4_commutative: ; CHECK: // %bb.0: -; CHECK-NEXT: orr w9, wzr, #0x1 +; CHECK-NEXT: 
mov w9, #1 ; CHECK-NEXT: lsl w9, w9, w2 ; CHECK-NEXT: lsr w8, w0, w1 ; CHECK-NEXT: sub w9, w9, #1 // =1 @@ -127,7 +127,7 @@ define i32 @bextr32_a4_commutative(i32 %val, i32 %numskipbits, i32 %numlowbits) define i64 @bextr64_a0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind { ; CHECK-LABEL: bextr64_a0: ; CHECK: // %bb.0: -; CHECK-NEXT: orr w9, wzr, #0x1 +; CHECK-NEXT: mov w9, #1 ; CHECK-NEXT: lsl x9, x9, x2 ; CHECK-NEXT: lsr x8, x0, x1 ; CHECK-NEXT: sub x9, x9, #1 // =1 @@ -143,7 +143,7 @@ define i64 @bextr64_a0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind { define i64 @bextr64_a0_arithmetic(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind { ; CHECK-LABEL: bextr64_a0_arithmetic: ; CHECK: // %bb.0: -; CHECK-NEXT: orr w9, wzr, #0x1 +; CHECK-NEXT: mov w9, #1 ; CHECK-NEXT: lsl x9, x9, x2 ; CHECK-NEXT: asr x8, x0, x1 ; CHECK-NEXT: sub x9, x9, #1 // =1 @@ -159,7 +159,7 @@ define i64 @bextr64_a0_arithmetic(i64 %val, i64 %numskipbits, i64 %numlowbits) n define i64 @bextr64_a1_indexzext(i64 %val, i8 zeroext %numskipbits, i8 zeroext %numlowbits) nounwind { ; CHECK-LABEL: bextr64_a1_indexzext: ; CHECK: // %bb.0: -; CHECK-NEXT: orr w9, wzr, #0x1 +; CHECK-NEXT: mov w9, #1 ; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2 ; CHECK-NEXT: lsl x9, x9, x2 ; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 @@ -180,7 +180,7 @@ define i64 @bextr64_a2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind ; CHECK-LABEL: bextr64_a2_load: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr x8, [x0] -; CHECK-NEXT: orr w9, wzr, #0x1 +; CHECK-NEXT: mov w9, #1 ; CHECK-NEXT: lsl x9, x9, x2 ; CHECK-NEXT: sub x9, x9, #1 // =1 ; CHECK-NEXT: lsr x8, x8, x1 @@ -198,7 +198,7 @@ define i64 @bextr64_a3_load_indexzext(i64* %w, i8 zeroext %numskipbits, i8 zeroe ; CHECK-LABEL: bextr64_a3_load_indexzext: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr x8, [x0] -; CHECK-NEXT: orr w9, wzr, #0x1 +; CHECK-NEXT: mov w9, #1 ; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2 ; CHECK-NEXT: lsl x9, x9, x2 ; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 @@ -219,7 +219,7 @@ define i64 @bextr64_a3_load_indexzext(i64* %w, i8 zeroext %numskipbits, i8 zeroe define i64 @bextr64_a4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind { ; CHECK-LABEL: bextr64_a4_commutative: ; CHECK: // %bb.0: -; CHECK-NEXT: orr w9, wzr, #0x1 +; CHECK-NEXT: mov w9, #1 ; CHECK-NEXT: lsl x9, x9, x2 ; CHECK-NEXT: lsr x8, x0, x1 ; CHECK-NEXT: sub x9, x9, #1 // =1 @@ -238,7 +238,7 @@ define i64 @bextr64_a4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits) define i32 @bextr64_32_a0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind { ; CHECK-LABEL: bextr64_32_a0: ; CHECK: // %bb.0: -; CHECK-NEXT: orr w9, wzr, #0x1 +; CHECK-NEXT: mov w9, #1 ; CHECK-NEXT: lsl x9, x9, x2 ; CHECK-NEXT: lsr x8, x0, x1 ; CHECK-NEXT: sub w9, w9, #1 // =1 @@ -256,7 +256,7 @@ define i32 @bextr64_32_a0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind define i32 @bextr64_32_a1(i64 %val, i64 %numskipbits, i32 %numlowbits) nounwind { ; CHECK-LABEL: bextr64_32_a1: ; CHECK: // %bb.0: -; CHECK-NEXT: orr w9, wzr, #0x1 +; CHECK-NEXT: mov w9, #1 ; CHECK-NEXT: lsl w9, w9, w2 ; CHECK-NEXT: lsr x8, x0, x1 ; CHECK-NEXT: sub w9, w9, #1 // =1 @@ -275,7 +275,7 @@ define i32 @bextr64_32_a1(i64 %val, i64 %numskipbits, i32 %numlowbits) nounwind define i32 @bextr64_32_a2(i64 %val, i64 %numskipbits, i32 %numlowbits) nounwind { ; CHECK-LABEL: bextr64_32_a2: ; CHECK: // %bb.0: -; CHECK-NEXT: orr w9, wzr, #0x1 +; CHECK-NEXT: mov w9, #1 ; CHECK-NEXT: lsl w9, w9, w2 ; CHECK-NEXT: lsr x8, x0, x1 ; 
CHECK-NEXT: sub w9, w9, #1 // =1 @@ -550,7 +550,7 @@ define i32 @bextr32_c0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind { define i32 @bextr32_c1_indexzext(i32 %val, i8 %numskipbits, i8 %numlowbits) nounwind { ; CHECK-LABEL: bextr32_c1_indexzext: ; CHECK: // %bb.0: -; CHECK-NEXT: orr w9, wzr, #0x20 +; CHECK-NEXT: mov w9, #32 ; CHECK-NEXT: sub w9, w9, w2 ; CHECK-NEXT: mov w10, #-1 ; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 @@ -589,7 +589,7 @@ define i32 @bextr32_c3_load_indexzext(i32* %w, i8 %numskipbits, i8 %numlowbits) ; CHECK-LABEL: bextr32_c3_load_indexzext: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr w8, [x0] -; CHECK-NEXT: orr w9, wzr, #0x20 +; CHECK-NEXT: mov w9, #32 ; CHECK-NEXT: mov w10, #-1 ; CHECK-NEXT: sub w9, w9, w2 ; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 @@ -644,7 +644,7 @@ define i64 @bextr64_c0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind { define i64 @bextr64_c1_indexzext(i64 %val, i8 %numskipbits, i8 %numlowbits) nounwind { ; CHECK-LABEL: bextr64_c1_indexzext: ; CHECK: // %bb.0: -; CHECK-NEXT: orr w9, wzr, #0x40 +; CHECK-NEXT: mov w9, #64 ; CHECK-NEXT: sub w9, w9, w2 ; CHECK-NEXT: mov x10, #-1 ; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 @@ -683,7 +683,7 @@ define i64 @bextr64_c3_load_indexzext(i64* %w, i8 %numskipbits, i8 %numlowbits) ; CHECK-LABEL: bextr64_c3_load_indexzext: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr x8, [x0] -; CHECK-NEXT: orr w9, wzr, #0x40 +; CHECK-NEXT: mov w9, #64 ; CHECK-NEXT: mov x10, #-1 ; CHECK-NEXT: sub w9, w9, w2 ; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 @@ -797,7 +797,7 @@ define i32 @bextr32_d0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind { define i32 @bextr32_d1_indexzext(i32 %val, i8 %numskipbits, i8 %numlowbits) nounwind { ; CHECK-LABEL: bextr32_d1_indexzext: ; CHECK: // %bb.0: -; CHECK-NEXT: orr w9, wzr, #0x20 +; CHECK-NEXT: mov w9, #32 ; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 ; CHECK-NEXT: lsr w8, w0, w1 ; CHECK-NEXT: sub w9, w9, w2 @@ -834,7 +834,7 @@ define i32 @bextr32_d3_load_indexzext(i32* %w, i8 %numskipbits, i8 %numlowbits) ; CHECK-LABEL: bextr32_d3_load_indexzext: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr w8, [x0] -; CHECK-NEXT: orr w9, wzr, #0x20 +; CHECK-NEXT: mov w9, #32 ; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 ; CHECK-NEXT: sub w9, w9, w2 ; CHECK-NEXT: lsr w8, w8, w1 @@ -871,7 +871,7 @@ define i64 @bextr64_d0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind { define i64 @bextr64_d1_indexzext(i64 %val, i8 %numskipbits, i8 %numlowbits) nounwind { ; CHECK-LABEL: bextr64_d1_indexzext: ; CHECK: // %bb.0: -; CHECK-NEXT: orr w9, wzr, #0x40 +; CHECK-NEXT: mov w9, #64 ; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 ; CHECK-NEXT: lsr x8, x0, x1 ; CHECK-NEXT: sub w9, w9, w2 @@ -908,7 +908,7 @@ define i64 @bextr64_d3_load_indexzext(i64* %w, i8 %numskipbits, i8 %numlowbits) ; CHECK-LABEL: bextr64_d3_load_indexzext: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr x8, [x0] -; CHECK-NEXT: orr w9, wzr, #0x40 +; CHECK-NEXT: mov w9, #64 ; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 ; CHECK-NEXT: sub w9, w9, w2 ; CHECK-NEXT: lsr x8, x8, x1 diff --git a/test/CodeGen/AArch64/extract-lowbits.ll b/test/CodeGen/AArch64/extract-lowbits.ll index e669a5d9cf74..22c699dc1a12 100644 --- a/test/CodeGen/AArch64/extract-lowbits.ll +++ b/test/CodeGen/AArch64/extract-lowbits.ll @@ -21,7 +21,7 @@ define i32 @bzhi32_a0(i32 %val, i32 %numlowbits) nounwind { ; CHECK-LABEL: bzhi32_a0: ; CHECK: // %bb.0: -; CHECK-NEXT: orr w8, wzr, #0x1 +; CHECK-NEXT: mov w8, #1 ; CHECK-NEXT: lsl w8, w8, w1 ; CHECK-NEXT: sub 
w8, w8, #1 // =1 ; CHECK-NEXT: and w0, w8, w0 @@ -35,7 +35,7 @@ define i32 @bzhi32_a0(i32 %val, i32 %numlowbits) nounwind { define i32 @bzhi32_a1_indexzext(i32 %val, i8 zeroext %numlowbits) nounwind { ; CHECK-LABEL: bzhi32_a1_indexzext: ; CHECK: // %bb.0: -; CHECK-NEXT: orr w8, wzr, #0x1 +; CHECK-NEXT: mov w8, #1 ; CHECK-NEXT: lsl w8, w8, w1 ; CHECK-NEXT: sub w8, w8, #1 // =1 ; CHECK-NEXT: and w0, w8, w0 @@ -51,7 +51,7 @@ define i32 @bzhi32_a2_load(i32* %w, i32 %numlowbits) nounwind { ; CHECK-LABEL: bzhi32_a2_load: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr w8, [x0] -; CHECK-NEXT: orr w9, wzr, #0x1 +; CHECK-NEXT: mov w9, #1 ; CHECK-NEXT: lsl w9, w9, w1 ; CHECK-NEXT: sub w9, w9, #1 // =1 ; CHECK-NEXT: and w0, w9, w8 @@ -67,7 +67,7 @@ define i32 @bzhi32_a3_load_indexzext(i32* %w, i8 zeroext %numlowbits) nounwind { ; CHECK-LABEL: bzhi32_a3_load_indexzext: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr w8, [x0] -; CHECK-NEXT: orr w9, wzr, #0x1 +; CHECK-NEXT: mov w9, #1 ; CHECK-NEXT: lsl w9, w9, w1 ; CHECK-NEXT: sub w9, w9, #1 // =1 ; CHECK-NEXT: and w0, w9, w8 @@ -83,7 +83,7 @@ define i32 @bzhi32_a3_load_indexzext(i32* %w, i8 zeroext %numlowbits) nounwind { define i32 @bzhi32_a4_commutative(i32 %val, i32 %numlowbits) nounwind { ; CHECK-LABEL: bzhi32_a4_commutative: ; CHECK: // %bb.0: -; CHECK-NEXT: orr w8, wzr, #0x1 +; CHECK-NEXT: mov w8, #1 ; CHECK-NEXT: lsl w8, w8, w1 ; CHECK-NEXT: sub w8, w8, #1 // =1 ; CHECK-NEXT: and w0, w0, w8 @@ -99,7 +99,7 @@ define i32 @bzhi32_a4_commutative(i32 %val, i32 %numlowbits) nounwind { define i64 @bzhi64_a0(i64 %val, i64 %numlowbits) nounwind { ; CHECK-LABEL: bzhi64_a0: ; CHECK: // %bb.0: -; CHECK-NEXT: orr w8, wzr, #0x1 +; CHECK-NEXT: mov w8, #1 ; CHECK-NEXT: lsl x8, x8, x1 ; CHECK-NEXT: sub x8, x8, #1 // =1 ; CHECK-NEXT: and x0, x8, x0 @@ -113,7 +113,7 @@ define i64 @bzhi64_a0(i64 %val, i64 %numlowbits) nounwind { define i64 @bzhi64_a1_indexzext(i64 %val, i8 zeroext %numlowbits) nounwind { ; CHECK-LABEL: bzhi64_a1_indexzext: ; CHECK: // %bb.0: -; CHECK-NEXT: orr w8, wzr, #0x1 +; CHECK-NEXT: mov w8, #1 ; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 ; CHECK-NEXT: lsl x8, x8, x1 ; CHECK-NEXT: sub x8, x8, #1 // =1 @@ -130,7 +130,7 @@ define i64 @bzhi64_a2_load(i64* %w, i64 %numlowbits) nounwind { ; CHECK-LABEL: bzhi64_a2_load: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr x8, [x0] -; CHECK-NEXT: orr w9, wzr, #0x1 +; CHECK-NEXT: mov w9, #1 ; CHECK-NEXT: lsl x9, x9, x1 ; CHECK-NEXT: sub x9, x9, #1 // =1 ; CHECK-NEXT: and x0, x9, x8 @@ -146,7 +146,7 @@ define i64 @bzhi64_a3_load_indexzext(i64* %w, i8 zeroext %numlowbits) nounwind { ; CHECK-LABEL: bzhi64_a3_load_indexzext: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr x8, [x0] -; CHECK-NEXT: orr w9, wzr, #0x1 +; CHECK-NEXT: mov w9, #1 ; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 ; CHECK-NEXT: lsl x9, x9, x1 ; CHECK-NEXT: sub x9, x9, #1 // =1 @@ -163,7 +163,7 @@ define i64 @bzhi64_a3_load_indexzext(i64* %w, i8 zeroext %numlowbits) nounwind { define i64 @bzhi64_a4_commutative(i64 %val, i64 %numlowbits) nounwind { ; CHECK-LABEL: bzhi64_a4_commutative: ; CHECK: // %bb.0: -; CHECK-NEXT: orr w8, wzr, #0x1 +; CHECK-NEXT: mov w8, #1 ; CHECK-NEXT: lsl x8, x8, x1 ; CHECK-NEXT: sub x8, x8, #1 // =1 ; CHECK-NEXT: and x0, x0, x8 @@ -345,7 +345,7 @@ define i32 @bzhi32_c0(i32 %val, i32 %numlowbits) nounwind { define i32 @bzhi32_c1_indexzext(i32 %val, i8 %numlowbits) nounwind { ; CHECK-LABEL: bzhi32_c1_indexzext: ; CHECK: // %bb.0: -; CHECK-NEXT: orr w8, wzr, #0x20 +; CHECK-NEXT: mov w8, #32 ; CHECK-NEXT: sub w8, w8, w1 ; CHECK-NEXT: mov w9, #-1 ; 
CHECK-NEXT: lsr w8, w9, w8 @@ -378,7 +378,7 @@ define i32 @bzhi32_c3_load_indexzext(i32* %w, i8 %numlowbits) nounwind { ; CHECK-LABEL: bzhi32_c3_load_indexzext: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr w8, [x0] -; CHECK-NEXT: orr w9, wzr, #0x20 +; CHECK-NEXT: mov w9, #32 ; CHECK-NEXT: sub w9, w9, w1 ; CHECK-NEXT: mov w10, #-1 ; CHECK-NEXT: lsr w9, w10, w9 @@ -425,7 +425,7 @@ define i64 @bzhi64_c0(i64 %val, i64 %numlowbits) nounwind { define i64 @bzhi64_c1_indexzext(i64 %val, i8 %numlowbits) nounwind { ; CHECK-LABEL: bzhi64_c1_indexzext: ; CHECK: // %bb.0: -; CHECK-NEXT: orr w8, wzr, #0x40 +; CHECK-NEXT: mov w8, #64 ; CHECK-NEXT: sub w8, w8, w1 ; CHECK-NEXT: mov x9, #-1 ; CHECK-NEXT: lsr x8, x9, x8 @@ -458,7 +458,7 @@ define i64 @bzhi64_c3_load_indexzext(i64* %w, i8 %numlowbits) nounwind { ; CHECK-LABEL: bzhi64_c3_load_indexzext: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr x8, [x0] -; CHECK-NEXT: orr w9, wzr, #0x40 +; CHECK-NEXT: mov w9, #64 ; CHECK-NEXT: sub w9, w9, w1 ; CHECK-NEXT: mov x10, #-1 ; CHECK-NEXT: lsr x9, x10, x9 @@ -506,7 +506,7 @@ define i32 @bzhi32_d0(i32 %val, i32 %numlowbits) nounwind { define i32 @bzhi32_d1_indexzext(i32 %val, i8 %numlowbits) nounwind { ; CHECK-LABEL: bzhi32_d1_indexzext: ; CHECK: // %bb.0: -; CHECK-NEXT: orr w8, wzr, #0x20 +; CHECK-NEXT: mov w8, #32 ; CHECK-NEXT: sub w8, w8, w1 ; CHECK-NEXT: lsl w9, w0, w8 ; CHECK-NEXT: lsr w0, w9, w8 @@ -537,7 +537,7 @@ define i32 @bzhi32_d3_load_indexzext(i32* %w, i8 %numlowbits) nounwind { ; CHECK-LABEL: bzhi32_d3_load_indexzext: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr w8, [x0] -; CHECK-NEXT: orr w9, wzr, #0x20 +; CHECK-NEXT: mov w9, #32 ; CHECK-NEXT: sub w9, w9, w1 ; CHECK-NEXT: lsl w8, w8, w9 ; CHECK-NEXT: lsr w0, w8, w9 @@ -568,7 +568,7 @@ define i64 @bzhi64_d0(i64 %val, i64 %numlowbits) nounwind { define i64 @bzhi64_d1_indexzext(i64 %val, i8 %numlowbits) nounwind { ; CHECK-LABEL: bzhi64_d1_indexzext: ; CHECK: // %bb.0: -; CHECK-NEXT: orr w8, wzr, #0x40 +; CHECK-NEXT: mov w8, #64 ; CHECK-NEXT: sub w8, w8, w1 ; CHECK-NEXT: lsl x9, x0, x8 ; CHECK-NEXT: lsr x0, x9, x8 @@ -599,7 +599,7 @@ define i64 @bzhi64_d3_load_indexzext(i64* %w, i8 %numlowbits) nounwind { ; CHECK-LABEL: bzhi64_d3_load_indexzext: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr x8, [x0] -; CHECK-NEXT: orr w9, wzr, #0x40 +; CHECK-NEXT: mov w9, #64 ; CHECK-NEXT: sub w9, w9, w1 ; CHECK-NEXT: lsl x8, x8, x9 ; CHECK-NEXT: lsr x0, x8, x9 diff --git a/test/CodeGen/AArch64/fabs.ll b/test/CodeGen/AArch64/fabs.ll index 58f047afa777..e702040f8e67 100644 --- a/test/CodeGen/AArch64/fabs.ll +++ b/test/CodeGen/AArch64/fabs.ll @@ -22,7 +22,7 @@ define double @not_fabs(double %x) #0 { define float @still_not_fabs(float %x) #0 { ; CHECK-LABEL: still_not_fabs: ; CHECK: // %bb.0: -; CHECK-NEXT: orr w8, wzr, #0x80000000 +; CHECK-NEXT: mov w8, #-2147483648 ; CHECK-NEXT: fmov s2, w8 ; CHECK-NEXT: fneg s1, s0 ; CHECK-NEXT: fcmp s0, s2 @@ -72,7 +72,7 @@ define <4 x float> @nabsv4f32(<4 x float> %a) { define <2 x double> @nabsv2d64(<2 x double> %a) { ; CHECK-LABEL: nabsv2d64: ; CHECK: // %bb.0: -; CHECK-NEXT: orr x8, xzr, #0x8000000000000000 +; CHECK-NEXT: mov x8, #-9223372036854775808 ; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret diff --git a/test/CodeGen/AArch64/fast-isel-addressing-modes.ll b/test/CodeGen/AArch64/fast-isel-addressing-modes.ll index 6ab6a66c355d..7aafada46647 100644 --- a/test/CodeGen/AArch64/fast-isel-addressing-modes.ll +++ b/test/CodeGen/AArch64/fast-isel-addressing-modes.ll @@ -110,7 +110,7 @@ define void @store_breg_f64(double* %a) { ; 
Load Immediate define i32 @load_immoff_1() { ; CHECK-LABEL: load_immoff_1 -; CHECK: orr {{w|x}}[[REG:[0-9]+]], {{wzr|xzr}}, #0x80 +; CHECK: mov {{w|x}}[[REG:[0-9]+]], #128 ; CHECK: ldr {{w[0-9]+}}, {{\[}}x[[REG]]{{\]}} %1 = inttoptr i64 128 to i32* %2 = load i32, i32* %1 @@ -173,7 +173,7 @@ define i32 @load_breg_immoff_5(i64 %a) { ; Min un-supported scaled offset define i32 @load_breg_immoff_6(i64 %a) { ; SDAG-LABEL: load_breg_immoff_6 -; SDAG: orr w[[NUM:[0-9]+]], wzr, #0x4000 +; SDAG: mov w[[NUM:[0-9]+]], #16384 ; SDAG-NEXT: ldr {{w[0-9]+}}, [x0, x[[NUM]]] ; FAST-LABEL: load_breg_immoff_6 ; FAST: add [[REG:x[0-9]+]], x0, #4, lsl #12 @@ -239,7 +239,7 @@ define void @store_breg_immoff_5(i64 %a) { ; Min un-supported scaled offset define void @store_breg_immoff_6(i64 %a) { ; SDAG-LABEL: store_breg_immoff_6 -; SDAG: orr w[[NUM:[0-9]+]], wzr, #0x4000 +; SDAG: mov w[[NUM:[0-9]+]], #16384 ; SDAG-NEXT: str wzr, [x0, x[[NUM]]] ; FAST-LABEL: store_breg_immoff_6 ; FAST: add [[REG:x[0-9]+]], x0, #4, lsl #12 @@ -304,7 +304,7 @@ define i64 @load_breg_offreg_immoff_1(i64 %a, i64 %b) { define i64 @load_breg_offreg_immoff_2(i64 %a, i64 %b) { ; SDAG-LABEL: load_breg_offreg_immoff_2 ; SDAG: add [[REG1:x[0-9]+]], x0, x1 -; SDAG-NEXT: orr w[[NUM:[0-9]+]], wzr, #0xf000 +; SDAG-NEXT: mov w[[NUM:[0-9]+]], #61440 ; SDAG-NEXT: ldr x0, {{\[}}[[REG1]], x[[NUM]]] ; FAST-LABEL: load_breg_offreg_immoff_2 ; FAST: add [[REG:x[0-9]+]], x0, #15, lsl #12 diff --git a/test/CodeGen/AArch64/fast-isel-gep.ll b/test/CodeGen/AArch64/fast-isel-gep.ll index 665476969ecb..2d38bc44c4ec 100644 --- a/test/CodeGen/AArch64/fast-isel-gep.ll +++ b/test/CodeGen/AArch64/fast-isel-gep.ll @@ -11,7 +11,7 @@ define double* @test_struct(%struct.foo* %f) { define i32* @test_array1(i32* %a, i64 %i) { ; CHECK-LABEL: test_array1 -; CHECK: orr [[REG:x[0-9]+]], xzr, #0x4 +; CHECK: mov [[REG:x[0-9]+]], #4 ; CHECK-NEXT: madd x0, x1, [[REG]], x0 %1 = getelementptr inbounds i32, i32* %a, i64 %i ret i32* %1 @@ -42,7 +42,7 @@ define i32* @test_array4(i32* %a) { define i32* @test_array5(i32* %a, i32 %i) { ; CHECK-LABEL: test_array5 ; CHECK: sxtw [[REG1:x[0-9]+]], w1 -; CHECK-NEXT: orr [[REG2:x[0-9]+]], xzr, #0x4 +; CHECK-NEXT: mov [[REG2:x[0-9]+]], #4 ; CHECK-NEXT: madd {{x[0-9]+}}, [[REG1]], [[REG2]], x0 %1 = getelementptr inbounds i32, i32* %a, i32 %i ret i32* %1 diff --git a/test/CodeGen/AArch64/fold-global-offsets.ll b/test/CodeGen/AArch64/fold-global-offsets.ll index ffcdc2bee5ff..40235791c524 100644 --- a/test/CodeGen/AArch64/fold-global-offsets.ll +++ b/test/CodeGen/AArch64/fold-global-offsets.ll @@ -52,7 +52,7 @@ define i64 @f6() { ; CHECK: f6: ; CHECK: adrp x8, x2 ; CHECK: add x8, x8, :lo12:x2 - ; CHECK: orr w9, wzr, #0x200000 + ; CHECK: mov w9, #2097152 ; CHECK: ldr x0, [x8, x9] ; CHECK: ret %l = load i64, i64* getelementptr ([16777216 x i64], [16777216 x i64]* @x2, i64 0, i64 262144) diff --git a/test/CodeGen/AArch64/funnel-shift.ll b/test/CodeGen/AArch64/funnel-shift.ll index a1fac70545c5..67ca729894e5 100644 --- a/test/CodeGen/AArch64/funnel-shift.ll +++ b/test/CodeGen/AArch64/funnel-shift.ll @@ -71,7 +71,7 @@ define i7 @fshl_i7_const_fold() { define i8 @fshl_i8_const_fold_overshift_1() { ; CHECK-LABEL: fshl_i8_const_fold_overshift_1: ; CHECK: // %bb.0: -; CHECK-NEXT: orr w0, wzr, #0x80 +; CHECK-NEXT: mov w0, #128 ; CHECK-NEXT: ret %f = call i8 @llvm.fshl.i8(i8 255, i8 0, i8 15) ret i8 %f @@ -80,7 +80,7 @@ define i8 @fshl_i8_const_fold_overshift_1() { define i8 @fshl_i8_const_fold_overshift_2() { ; CHECK-LABEL: fshl_i8_const_fold_overshift_2: ; 
CHECK: // %bb.0: -; CHECK-NEXT: orr w0, wzr, #0x78 +; CHECK-NEXT: mov w0, #120 ; CHECK-NEXT: ret %f = call i8 @llvm.fshl.i8(i8 15, i8 15, i8 11) ret i8 %f @@ -133,7 +133,7 @@ define i64 @fshl_i64_const_overshift(i64 %x, i64 %y) { define i8 @fshl_i8_const_fold() { ; CHECK-LABEL: fshl_i8_const_fold: ; CHECK: // %bb.0: -; CHECK-NEXT: orr w0, wzr, #0x80 +; CHECK-NEXT: mov w0, #128 ; CHECK-NEXT: ret %f = call i8 @llvm.fshl.i8(i8 255, i8 0, i8 7) ret i8 %f @@ -190,7 +190,7 @@ declare i7 @llvm.fshr.i7(i7, i7, i7) define i7 @fshr_i7_const_fold() { ; CHECK-LABEL: fshr_i7_const_fold: ; CHECK: // %bb.0: -; CHECK-NEXT: orr w0, wzr, #0x1f +; CHECK-NEXT: mov w0, #31 ; CHECK-NEXT: ret %f = call i7 @llvm.fshr.i7(i7 112, i7 127, i7 2) ret i7 %f @@ -199,7 +199,7 @@ define i7 @fshr_i7_const_fold() { define i8 @fshr_i8_const_fold_overshift_1() { ; CHECK-LABEL: fshr_i8_const_fold_overshift_1: ; CHECK: // %bb.0: -; CHECK-NEXT: orr w0, wzr, #0xfe +; CHECK-NEXT: mov w0, #254 ; CHECK-NEXT: ret %f = call i8 @llvm.fshr.i8(i8 255, i8 0, i8 15) ret i8 %f @@ -217,7 +217,7 @@ define i8 @fshr_i8_const_fold_overshift_2() { define i8 @fshr_i8_const_fold_overshift_3() { ; CHECK-LABEL: fshr_i8_const_fold_overshift_3: ; CHECK: // %bb.0: -; CHECK-NEXT: orr w0, wzr, #0xff +; CHECK-NEXT: mov w0, #255 ; CHECK-NEXT: ret %f = call i8 @llvm.fshr.i8(i8 0, i8 255, i8 8) ret i8 %f @@ -261,7 +261,7 @@ define i64 @fshr_i64_const_overshift(i64 %x, i64 %y) { define i8 @fshr_i8_const_fold() { ; CHECK-LABEL: fshr_i8_const_fold: ; CHECK: // %bb.0: -; CHECK-NEXT: orr w0, wzr, #0xfe +; CHECK-NEXT: mov w0, #254 ; CHECK-NEXT: ret %f = call i8 @llvm.fshr.i8(i8 255, i8 0, i8 7) ret i8 %f diff --git a/test/CodeGen/AArch64/i128-align.ll b/test/CodeGen/AArch64/i128-align.ll index c948739853bb..ee0b8077baf0 100644 --- a/test/CodeGen/AArch64/i128-align.ll +++ b/test/CodeGen/AArch64/i128-align.ll @@ -13,7 +13,7 @@ define i64 @check_size() { %diff = sub i64 %endi, %starti ret i64 %diff -; CHECK: {{movz x0, #48|orr w0, wzr, #0x30}} +; CHECK: mov w0, #48 } define i64 @check_field() { @@ -25,5 +25,5 @@ define i64 @check_field() { %diff = sub i64 %endi, %starti ret i64 %diff -; CHECK: {{movz x0, #16|orr w0, wzr, #0x10}} +; CHECK: mov w0, #16 } diff --git a/test/CodeGen/AArch64/isinf.ll b/test/CodeGen/AArch64/isinf.ll index e4607d08b666..bcb249f82f3a 100644 --- a/test/CodeGen/AArch64/isinf.ll +++ b/test/CodeGen/AArch64/isinf.ll @@ -22,7 +22,7 @@ define i32 @replace_isinf_call_f16(half %x) { ; Check if INFINITY for float is materialized define i32 @replace_isinf_call_f32(float %x) { ; CHECK-LABEL: replace_isinf_call_f32: -; CHECK: orr [[INFSCALARREG:w[0-9]+]], wzr, #0x7f800000 +; CHECK: mov [[INFSCALARREG:w[0-9]+]], #2139095040 ; CHECK-NEXT: fabs [[ABS:s[0-9]+]], s0 ; CHECK-NEXT: fmov [[INFREG:s[0-9]+]], [[INFSCALARREG]] ; CHECK-NEXT: fcmp [[ABS]], [[INFREG]] @@ -36,7 +36,7 @@ define i32 @replace_isinf_call_f32(float %x) { ; Check if INFINITY for double is materialized define i32 @replace_isinf_call_f64(double %x) { ; CHECK-LABEL: replace_isinf_call_f64: -; CHECK: orr [[INFSCALARREG:x[0-9]+]], xzr, #0x7ff0000000000000 +; CHECK: mov [[INFSCALARREG:x[0-9]+]], #9218868437227405312 ; CHECK-NEXT: fabs [[ABS:d[0-9]+]], d0 ; CHECK-NEXT: fmov [[INFREG:d[0-9]+]], [[INFSCALARREG]] ; CHECK-NEXT: fcmp [[ABS]], [[INFREG]] diff --git a/test/CodeGen/AArch64/known-never-nan.ll b/test/CodeGen/AArch64/known-never-nan.ll index ef9fa5faac60..f86667d6ec60 100644 --- a/test/CodeGen/AArch64/known-never-nan.ll +++ b/test/CodeGen/AArch64/known-never-nan.ll @@ -28,7 +28,7 @@ define 
float @fmaxnm(i32 %i1, i32 %i2) #0 { define float @not_fmaxnm_maybe_nan(i32 %i1, i32 %i2) #0 { ; CHECK-LABEL: not_fmaxnm_maybe_nan: ; CHECK: // %bb.0: -; CHECK-NEXT: orr w8, wzr, #0xff800000 +; CHECK-NEXT: mov w8, #-8388608 ; CHECK-NEXT: ucvtf s0, w0 ; CHECK-NEXT: ucvtf s1, w1 ; CHECK-NEXT: fmov s2, #17.00000000 diff --git a/test/CodeGen/AArch64/ldst-paired-aliasing.ll b/test/CodeGen/AArch64/ldst-paired-aliasing.ll index 9bea40656b9e..f36131223b04 100644 --- a/test/CodeGen/AArch64/ldst-paired-aliasing.ll +++ b/test/CodeGen/AArch64/ldst-paired-aliasing.ll @@ -11,7 +11,7 @@ define i32 @main() local_unnamed_addr #1 { ; Make sure the stores happen in the correct order (the exact instructions could change). ; CHECK-LABEL: main: -; CHECK: orr w9, wzr, #0x1 +; CHECK: mov w9, #1 ; CHECK: str x9, [sp, #80] ; CHECK: stp q0, q0, [sp, #48] ; CHECK: ldr w8, [sp, #48] diff --git a/test/CodeGen/AArch64/machine-outliner-tail.ll b/test/CodeGen/AArch64/machine-outliner-tail.ll index 751128c7f350..60107d5b21c1 100644 --- a/test/CodeGen/AArch64/machine-outliner-tail.ll +++ b/test/CodeGen/AArch64/machine-outliner-tail.ll @@ -1,10 +1,10 @@ ; RUN: llc -verify-machineinstrs -enable-machine-outliner -mtriple=aarch64-linux-gnu < %s | FileCheck %s ; CHECK: OUTLINED_FUNCTION_0: -; CHECK: orr w0, wzr, #0x1 -; CHECK-NEXT: orr w1, wzr, #0x2 -; CHECK-NEXT: orr w2, wzr, #0x3 -; CHECK-NEXT: orr w3, wzr, #0x4 +; CHECK: mov w0, #1 +; CHECK-NEXT: mov w1, #2 +; CHECK-NEXT: mov w2, #3 +; CHECK-NEXT: mov w3, #4 ; CHECK-NEXT: b z define void @a() { diff --git a/test/CodeGen/AArch64/machine-outliner-thunk.ll b/test/CodeGen/AArch64/machine-outliner-thunk.ll index fb4265af2d42..d1dfae81336c 100644 --- a/test/CodeGen/AArch64/machine-outliner-thunk.ll +++ b/test/CodeGen/AArch64/machine-outliner-thunk.ll @@ -73,16 +73,16 @@ entry: ; CHECK: [[OUTLINED_INDIRECT]]: ; CHECK: // %bb.0: ; CHECK-NEXT: mov x8, x0 -; CHECK-NEXT: orr w0, wzr, #0x1 -; CHECK-NEXT: orr w1, wzr, #0x2 -; CHECK-NEXT: orr w2, wzr, #0x3 -; CHECK-NEXT: orr w3, wzr, #0x4 +; CHECK-NEXT: mov w0, #1 +; CHECK-NEXT: mov w1, #2 +; CHECK-NEXT: mov w2, #3 +; CHECK-NEXT: mov w3, #4 ; CHECK-NEXT: br x8 ; CHECK: [[OUTLINED_DIRECT]]: ; CHECK: // %bb.0: -; CHECK-NEXT: orr w0, wzr, #0x1 -; CHECK-NEXT: orr w1, wzr, #0x2 -; CHECK-NEXT: orr w2, wzr, #0x3 -; CHECK-NEXT: orr w3, wzr, #0x4 +; CHECK-NEXT: mov w0, #1 +; CHECK-NEXT: mov w1, #2 +; CHECK-NEXT: mov w2, #3 +; CHECK-NEXT: mov w3, #4 ; CHECK-NEXT: b thunk_called_fn diff --git a/test/CodeGen/AArch64/machine-outliner.ll b/test/CodeGen/AArch64/machine-outliner.ll index 42d0a09f0282..15afdd43d116 100644 --- a/test/CodeGen/AArch64/machine-outliner.ll +++ b/test/CodeGen/AArch64/machine-outliner.ll @@ -91,17 +91,17 @@ define void @dog() #0 { ; ODR: [[OUTLINED]]: ; CHECK: .p2align 2 ; CHECK-NEXT: [[OUTLINED]]: -; CHECK: orr w8, wzr, #0x1 +; CHECK: mov w8, #1 ; CHECK-NEXT: str w8, [sp, #28] -; CHECK-NEXT: orr w8, wzr, #0x2 +; CHECK-NEXT: mov w8, #2 ; CHECK-NEXT: str w8, [sp, #24] -; CHECK-NEXT: orr w8, wzr, #0x3 +; CHECK-NEXT: mov w8, #3 ; CHECK-NEXT: str w8, [sp, #20] -; CHECK-NEXT: orr w8, wzr, #0x4 +; CHECK-NEXT: mov w8, #4 ; CHECK-NEXT: str w8, [sp, #16] ; CHECK-NEXT: mov w8, #5 ; CHECK-NEXT: str w8, [sp, #12] -; CHECK-NEXT: orr w8, wzr, #0x6 +; CHECK-NEXT: mov w8, #6 ; CHECK-NEXT: str w8, [sp, #8] ; CHECK-NEXT: add sp, sp, #32 ; CHECK-NEXT: ret diff --git a/test/CodeGen/AArch64/machine_cse_impdef_killflags.ll b/test/CodeGen/AArch64/machine_cse_impdef_killflags.ll index f1cd21dce45a..79fefdb2769a 100644 --- 
a/test/CodeGen/AArch64/machine_cse_impdef_killflags.ll +++ b/test/CodeGen/AArch64/machine_cse_impdef_killflags.ll @@ -5,9 +5,9 @@ ; The verifier would complain otherwise. define i64 @csed-impdef-killflag(i64 %a) { ; CHECK-LABEL: csed-impdef-killflag -; CHECK-DAG: orr [[REG1:w[0-9]+]], wzr, #0x1 -; CHECK-DAG: orr [[REG2:x[0-9]+]], xzr, #0x2 -; CHECK-DAG: orr [[REG3:x[0-9]+]], xzr, #0x3 +; CHECK-DAG: mov [[REG1:w[0-9]+]], #1 +; CHECK-DAG: mov [[REG2:x[0-9]+]], #2 +; CHECK-DAG: mov [[REG3:x[0-9]+]], #3 ; CHECK-DAG: cmp x0, #0 ; CHECK: csel w[[SELECT_WREG_1:[0-9]+]], wzr, [[REG1]], ne ; CHECK-DAG: csel [[SELECT_XREG_2:x[0-9]+]], [[REG2]], [[REG3]], ne diff --git a/test/CodeGen/AArch64/madd-combiner.ll b/test/CodeGen/AArch64/madd-combiner.ll index 7c9787a7281a..8a3b5fdcee87 100644 --- a/test/CodeGen/AArch64/madd-combiner.ll +++ b/test/CodeGen/AArch64/madd-combiner.ll @@ -13,7 +13,7 @@ define i32 @mul_add_imm(i32 %a, i32 %b) { define i32 @mul_sub_imm1(i32 %a, i32 %b) { ; CHECK-LABEL: mul_sub_imm1 -; CHECK: orr [[REG:w[0-9]+]], wzr, #0x4 +; CHECK: mov [[REG:w[0-9]+]], #4 ; CHECK-NEXT: msub {{w[0-9]+}}, w0, w1, [[REG]] %1 = mul i32 %a, %b %2 = sub i32 4, %1 diff --git a/test/CodeGen/AArch64/movw-consts.ll b/test/CodeGen/AArch64/movw-consts.ll index def6072e0bca..4228f1a25d6e 100644 --- a/test/CodeGen/AArch64/movw-consts.ll +++ b/test/CodeGen/AArch64/movw-consts.ll @@ -9,43 +9,43 @@ define i64 @test0() { define i64 @test1() { ; CHECK-LABEL: test1: -; CHECK: orr w0, wzr, #0x1 +; CHECK: mov w0, #1 ret i64 1 } define i64 @test2() { ; CHECK-LABEL: test2: -; CHECK: orr w0, wzr, #0xffff +; CHECK: mov w0, #65535 ret i64 65535 } define i64 @test3() { ; CHECK-LABEL: test3: -; CHECK: orr w0, wzr, #0x10000 +; CHECK: mov w0, #65536 ret i64 65536 } define i64 @test4() { ; CHECK-LABEL: test4: -; CHECK: orr w0, wzr, #0xffff0000 +; CHECK: mov w0, #-65536 ret i64 4294901760 } define i64 @test5() { ; CHECK-LABEL: test5: -; CHECK: orr x0, xzr, #0x100000000 +; CHECK: mov x0, #4294967296 ret i64 4294967296 } define i64 @test6() { ; CHECK-LABEL: test6: -; CHECK: orr x0, xzr, #0xffff00000000 +; CHECK: mov x0, #281470681743360 ret i64 281470681743360 } define i64 @test7() { ; CHECK-LABEL: test7: -; CHECK: orr x0, xzr, #0x1000000000000 +; CHECK: mov x0, #281474976710656 ret i64 281474976710656 } @@ -82,28 +82,28 @@ define void @test11() { define void @test12() { ; CHECK-LABEL: test12: -; CHECK: orr {{w[0-9]+}}, wzr, #0x1 +; CHECK: mov {{w[0-9]+}}, #1 store i32 1, i32* @var32 ret void } define void @test13() { ; CHECK-LABEL: test13: -; CHECK: orr {{w[0-9]+}}, wzr, #0xffff +; CHECK: mov {{w[0-9]+}}, #65535 store i32 65535, i32* @var32 ret void } define void @test14() { ; CHECK-LABEL: test14: -; CHECK: orr {{w[0-9]+}}, wzr, #0x10000 +; CHECK: mov {{w[0-9]+}}, #65536 store i32 65536, i32* @var32 ret void } define void @test15() { ; CHECK-LABEL: test15: -; CHECK: orr {{w[0-9]+}}, wzr, #0xffff0000 +; CHECK: mov {{w[0-9]+}}, #-65536 store i32 4294901760, i32* @var32 ret void } @@ -119,6 +119,6 @@ define i64 @test17() { ; CHECK-LABEL: test17: ; Mustn't MOVN w0 here. 
-; CHECK: orr x0, xzr, #0xfffffffffffffffd +; CHECK: mov x0, #-3 ret i64 -3 } diff --git a/test/CodeGen/AArch64/neon-compare-instructions.ll b/test/CodeGen/AArch64/neon-compare-instructions.ll index 9d7d0abbf6c7..24bc7289f3f1 100644 --- a/test/CodeGen/AArch64/neon-compare-instructions.ll +++ b/test/CodeGen/AArch64/neon-compare-instructions.ll @@ -1146,7 +1146,7 @@ define <4 x i32> @cmhsz4xi32(<4 x i32> %A) { define <2 x i64> @cmhsz2xi64(<2 x i64> %A) { ; CHECK-LABEL: cmhsz2xi64: -; CHECK: orr w[[TWO:[0-9]+]], wzr, #0x2 +; CHECK: mov w[[TWO:[0-9]+]], #2 ; CHECK-NEXT: {{v[0-9]+}}.2d, x[[TWO]] ; CHECK-NEXT: cmhs {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d %tmp3 = icmp uge <2 x i64> %A, <i64 2, i64 2> @@ -1211,7 +1211,7 @@ define <4 x i32> @cmhiz4xi32(<4 x i32> %A) { define <2 x i64> @cmhiz2xi64(<2 x i64> %A) { ; CHECK-LABEL: cmhiz2xi64: -; CHECK: orr w[[ONE:[0-9]+]], wzr, #{{0x1|1}} +; CHECK: mov w[[ONE:[0-9]+]], #1 ; CHECK-NEXT: dup {{v[0-9]+}}.2d, x[[ONE]] ; CHECK-NEXT: cmhi {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d %tmp3 = icmp ugt <2 x i64> %A, <i64 1, i64 1> @@ -1366,7 +1366,7 @@ define <2 x i64> @cmloz2xi64(<2 x i64> %A) { ; CHECK-LABEL: cmloz2xi64: ; Using registers other than v0, v1 are possible, but would be odd. ; LO implemented as HI, so check reversed operands. -; CHECK: orr w[[TWO:[0-9]+]], wzr, #{{0x2|2}} +; CHECK: mov w[[TWO:[0-9]+]], #2 ; CHECK-NEXT: dup v1.2d, x[[TWO]] ; CHECK-NEXT: cmhi {{v[0-9]+}}.2d, v1.2d, v0.2d %tmp3 = icmp ult <2 x i64> %A, <i64 2, i64 2> diff --git a/test/CodeGen/AArch64/optimize-imm.ll b/test/CodeGen/AArch64/optimize-imm.ll index f960a3a95fc9..711bad7d82b5 100644 --- a/test/CodeGen/AArch64/optimize-imm.ll +++ b/test/CodeGen/AArch64/optimize-imm.ll @@ -52,7 +52,7 @@ entry: ; a BIC. ; CHECK-LABEL: xor1: -; CHECK: orr [[R0:w[0-9]+]], wzr, #0x38 +; CHECK: mov [[R0:w[0-9]+]], #56 ; CHECK: bic {{w[0-9]+}}, [[R0]], w0, lsl #3 define i32 @xor1(i32 %a) { diff --git a/test/CodeGen/AArch64/redundant-copy-elim-empty-mbb.ll b/test/CodeGen/AArch64/redundant-copy-elim-empty-mbb.ll index 27a33a2337e3..ed34cbd2fa0b 100644 --- a/test/CodeGen/AArch64/redundant-copy-elim-empty-mbb.ll +++ b/test/CodeGen/AArch64/redundant-copy-elim-empty-mbb.ll @@ -9,7 +9,7 @@ declare i8* @bar() ; CHECK-LABEL: foo: ; CHECK: tbz -; CHECK: orr +; CHECK: mov{{.*}}, #1 ; CHECK: ret ; CHECK: bl bar ; CHECK: cbnz diff --git a/test/CodeGen/AArch64/sadd_sat.ll b/test/CodeGen/AArch64/sadd_sat.ll index 4c14b3c7327a..943458496f53 100644 --- a/test/CodeGen/AArch64/sadd_sat.ll +++ b/test/CodeGen/AArch64/sadd_sat.ll @@ -10,7 +10,7 @@ define i32 @func(i32 %x, i32 %y) nounwind { ; CHECK-LABEL: func: ; CHECK: // %bb.0: ; CHECK-NEXT: adds w8, w0, w1 -; CHECK-NEXT: orr w9, wzr, #0x7fffffff +; CHECK-NEXT: mov w9, #2147483647 ; CHECK-NEXT: cmp w8, #0 // =0 ; CHECK-NEXT: cinv w8, w9, ge ; CHECK-NEXT: adds w9, w0, w1 @@ -24,7 +24,7 @@ define i64 @func2(i64 %x, i64 %y) nounwind { ; CHECK-LABEL: func2: ; CHECK: // %bb.0: ; CHECK-NEXT: adds x8, x0, x1 -; CHECK-NEXT: orr x9, xzr, #0x7fffffffffffffff +; CHECK-NEXT: mov x9, #9223372036854775807 ; CHECK-NEXT: cmp x8, #0 // =0 ; CHECK-NEXT: cinv x8, x9, ge ; CHECK-NEXT: adds x9, x0, x1 @@ -39,7 +39,7 @@ define i4 @func3(i4 %x, i4 %y) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: lsl w8, w0, #28 ; CHECK-NEXT: adds w10, w8, w1, lsl #28 -; CHECK-NEXT: orr w9, wzr, #0x7fffffff +; CHECK-NEXT: mov w9, #2147483647 ; CHECK-NEXT: cmp w10, #0 // =0 ; CHECK-NEXT: cinv w9, w9, ge ; CHECK-NEXT: adds w8, w8, w1, lsl #28 diff --git a/test/CodeGen/AArch64/sadd_sat_vec.ll b/test/CodeGen/AArch64/sadd_sat_vec.ll index
831dd30ce0c0..40b945885f5b 100644 --- a/test/CodeGen/AArch64/sadd_sat_vec.ll +++ b/test/CodeGen/AArch64/sadd_sat_vec.ll @@ -746,7 +746,7 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { ; CHECK-NEXT: cmge v1.2d, v1.2d, #0 ; CHECK-NEXT: cmge v0.2d, v0.2d, #0 ; CHECK-NEXT: cmge v5.2d, v2.2d, #0 -; CHECK-NEXT: orr x8, xzr, #0x7fffffffffffffff +; CHECK-NEXT: mov x8, #9223372036854775807 ; CHECK-NEXT: cmlt v3.2d, v2.2d, #0 ; CHECK-NEXT: cmeq v1.2d, v0.2d, v1.2d ; CHECK-NEXT: cmeq v0.2d, v0.2d, v5.2d @@ -765,7 +765,7 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind { ; CHECK-LABEL: v4i64: ; CHECK: // %bb.0: ; CHECK-NEXT: add v4.2d, v0.2d, v2.2d -; CHECK-NEXT: orr x8, xzr, #0x7fffffffffffffff +; CHECK-NEXT: mov x8, #9223372036854775807 ; CHECK-NEXT: cmlt v6.2d, v4.2d, #0 ; CHECK-NEXT: dup v7.2d, x8 ; CHECK-NEXT: add v5.2d, v1.2d, v3.2d @@ -800,7 +800,7 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind { ; CHECK-LABEL: v8i64: ; CHECK: // %bb.0: ; CHECK-NEXT: add v16.2d, v0.2d, v4.2d -; CHECK-NEXT: orr x8, xzr, #0x7fffffffffffffff +; CHECK-NEXT: mov x8, #9223372036854775807 ; CHECK-NEXT: add v17.2d, v1.2d, v5.2d ; CHECK-NEXT: cmlt v20.2d, v16.2d, #0 ; CHECK-NEXT: dup v21.2d, x8 @@ -872,7 +872,7 @@ define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind { ; CHECK-NEXT: adcs x12, x3, x7 ; CHECK-NEXT: cmp x12, #0 // =0 ; CHECK-NEXT: cset w13, ge -; CHECK-NEXT: orr x8, xzr, #0x7fffffffffffffff +; CHECK-NEXT: mov x8, #9223372036854775807 ; CHECK-NEXT: csinc w13, w13, wzr, ne ; CHECK-NEXT: cinv x14, x8, ge ; CHECK-NEXT: cmp w10, w13 diff --git a/test/CodeGen/AArch64/sdivpow2.ll b/test/CodeGen/AArch64/sdivpow2.ll index dd1c21b75b0c..158a778f0e91 100644 --- a/test/CodeGen/AArch64/sdivpow2.ll +++ b/test/CodeGen/AArch64/sdivpow2.ll @@ -77,7 +77,7 @@ define i64 @test6(i64 %x) { define i64 @test7(i64 %x) { ; CHECK-LABEL: test7: ; CHECK: // %bb.0: -; CHECK-NEXT: orr x8, xzr, #0xffffffffffff +; CHECK-NEXT: mov x8, #281474976710655 ; CHECK-NEXT: add x8, x0, x8 ; CHECK-NEXT: cmp x0, #0 // =0 ; CHECK-NEXT: csel x8, x8, x0, lt diff --git a/test/CodeGen/AArch64/seh-finally.ll b/test/CodeGen/AArch64/seh-finally.ll index b6dfc6e85677..b7027fb4359f 100644 --- a/test/CodeGen/AArch64/seh-finally.ll +++ b/test/CodeGen/AArch64/seh-finally.ll @@ -36,7 +36,7 @@ define void @simple_seh() #0 personality i8* bitcast (i32 (...)* @__C_specific_h entry: ; CHECK-LABEL: simple_seh ; CHECK: add x29, sp, #16 -; CHECK: orr x0, xzr, #0xfffffffffffffffe +; CHECK: mov x0, #-2 ; CHECK: stur x0, [x29, #-16] ; CHECK: .set .Lsimple_seh$frame_escape_0, -8 ; CHECK: ldur w0, [x29, #-8] @@ -90,7 +90,7 @@ entry: ; CHECK: sub x9, sp, #64 ; CHECK: and sp, x9, #0xffffffffffffffe0 ; CHECK: mov x19, sp -; CHECK: orr x0, xzr, #0xfffffffffffffffe +; CHECK: mov x0, #-2 ; CHECK: stur x0, [x19, #16] ; CHECK: .set .Lstack_realign$frame_escape_0, 32 ; CHECK: ldr w0, [x19, #32] @@ -141,7 +141,7 @@ define void @vla_present(i32 %n) #0 personality i8* bitcast (i32 (...)* @__C_spe entry: ; CHECK-LABEL: vla_present ; CHECK: add x29, sp, #32 -; CHECK: orr x1, xzr, #0xfffffffffffffffe +; CHECK: mov x1, #-2 ; CHECK: stur x1, [x29, #-32] ; CHECK: .set .Lvla_present$frame_escape_0, -4 ; CHECK: stur w0, [x29, #-4] @@ -209,7 +209,7 @@ entry: ; CHECK: sub x9, sp, #64 ; CHECK: and sp, x9, #0xffffffffffffffe0 ; CHECK: mov x19, sp -; CHECK: orr x1, xzr, #0xfffffffffffffffe +; CHECK: mov x1, #-2 ; CHECK: stur x1, [x19] ; CHECK: .set .Lvla_and_realign$frame_escape_0, 32 ; CHECK: stur w0, [x29, #-4] diff --git 
a/test/CodeGen/AArch64/selectcc-to-shiftand.ll b/test/CodeGen/AArch64/selectcc-to-shiftand.ll index 99190633547c..6735a1e0bc03 100644 --- a/test/CodeGen/AArch64/selectcc-to-shiftand.ll +++ b/test/CodeGen/AArch64/selectcc-to-shiftand.ll @@ -75,7 +75,7 @@ define i32 @pos_sel_constants(i32 %a) { define i32 @pos_sel_special_constant(i32 %a) { ; CHECK-LABEL: pos_sel_special_constant: ; CHECK: // %bb.0: -; CHECK-NEXT: orr w8, wzr, #0x200 +; CHECK-NEXT: mov w8, #512 ; CHECK-NEXT: bic w0, w8, w0, lsr #22 ; CHECK-NEXT: ret ; diff --git a/test/CodeGen/AArch64/signed-truncation-check.ll b/test/CodeGen/AArch64/signed-truncation-check.ll index edd61b10d002..e976144861db 100644 --- a/test/CodeGen/AArch64/signed-truncation-check.ll +++ b/test/CodeGen/AArch64/signed-truncation-check.ll @@ -381,7 +381,7 @@ define i1 @add_ultcmp_bad_i24_i8(i24 %x) nounwind { define i1 @add_ulecmp_bad_i16_i8(i16 %x) nounwind { ; CHECK-LABEL: add_ulecmp_bad_i16_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: orr w0, wzr, #0x1 +; CHECK-NEXT: mov w0, #1 ; CHECK-NEXT: ret %tmp0 = add i16 %x, 128 ; 1U << (8-1) %tmp1 = icmp ule i16 %tmp0, -1 ; when we +1 it, it will wrap to 0 diff --git a/test/CodeGen/AArch64/ssub_sat.ll b/test/CodeGen/AArch64/ssub_sat.ll index e0d2708aada3..9dc200523d2d 100644 --- a/test/CodeGen/AArch64/ssub_sat.ll +++ b/test/CodeGen/AArch64/ssub_sat.ll @@ -10,7 +10,7 @@ define i32 @func(i32 %x, i32 %y) nounwind { ; CHECK-LABEL: func: ; CHECK: // %bb.0: ; CHECK-NEXT: subs w8, w0, w1 -; CHECK-NEXT: orr w9, wzr, #0x7fffffff +; CHECK-NEXT: mov w9, #2147483647 ; CHECK-NEXT: cmp w8, #0 // =0 ; CHECK-NEXT: cinv w8, w9, ge ; CHECK-NEXT: subs w9, w0, w1 @@ -24,7 +24,7 @@ define i64 @func2(i64 %x, i64 %y) nounwind { ; CHECK-LABEL: func2: ; CHECK: // %bb.0: ; CHECK-NEXT: subs x8, x0, x1 -; CHECK-NEXT: orr x9, xzr, #0x7fffffffffffffff +; CHECK-NEXT: mov x9, #9223372036854775807 ; CHECK-NEXT: cmp x8, #0 // =0 ; CHECK-NEXT: cinv x8, x9, ge ; CHECK-NEXT: subs x9, x0, x1 @@ -39,7 +39,7 @@ define i4 @func3(i4 %x, i4 %y) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: lsl w8, w0, #28 ; CHECK-NEXT: subs w10, w8, w1, lsl #28 -; CHECK-NEXT: orr w9, wzr, #0x7fffffff +; CHECK-NEXT: mov w9, #2147483647 ; CHECK-NEXT: cmp w10, #0 // =0 ; CHECK-NEXT: cinv w9, w9, ge ; CHECK-NEXT: subs w8, w8, w1, lsl #28 diff --git a/test/CodeGen/AArch64/ssub_sat_vec.ll b/test/CodeGen/AArch64/ssub_sat_vec.ll index 03925364fefd..3718b6bbc9de 100644 --- a/test/CodeGen/AArch64/ssub_sat_vec.ll +++ b/test/CodeGen/AArch64/ssub_sat_vec.ll @@ -781,7 +781,7 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { ; CHECK-NEXT: cmge v1.2d, v1.2d, #0 ; CHECK-NEXT: cmge v0.2d, v0.2d, #0 ; CHECK-NEXT: cmge v5.2d, v2.2d, #0 -; CHECK-NEXT: orr x8, xzr, #0x7fffffffffffffff +; CHECK-NEXT: mov x8, #9223372036854775807 ; CHECK-NEXT: cmlt v3.2d, v2.2d, #0 ; CHECK-NEXT: cmeq v1.2d, v0.2d, v1.2d ; CHECK-NEXT: cmeq v0.2d, v0.2d, v5.2d @@ -801,7 +801,7 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind { ; CHECK-LABEL: v4i64: ; CHECK: // %bb.0: ; CHECK-NEXT: sub v4.2d, v0.2d, v2.2d -; CHECK-NEXT: orr x8, xzr, #0x7fffffffffffffff +; CHECK-NEXT: mov x8, #9223372036854775807 ; CHECK-NEXT: cmlt v6.2d, v4.2d, #0 ; CHECK-NEXT: dup v7.2d, x8 ; CHECK-NEXT: sub v5.2d, v1.2d, v3.2d @@ -838,7 +838,7 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind { ; CHECK-LABEL: v8i64: ; CHECK: // %bb.0: ; CHECK-NEXT: sub v16.2d, v0.2d, v4.2d -; CHECK-NEXT: orr x8, xzr, #0x7fffffffffffffff +; CHECK-NEXT: mov x8, #9223372036854775807 ; CHECK-NEXT: sub v17.2d, v1.2d, v5.2d ; 
CHECK-NEXT: cmlt v20.2d, v16.2d, #0 ; CHECK-NEXT: dup v21.2d, x8 @@ -914,7 +914,7 @@ define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind { ; CHECK-NEXT: sbcs x12, x3, x7 ; CHECK-NEXT: cmp x12, #0 // =0 ; CHECK-NEXT: cset w13, ge -; CHECK-NEXT: orr x8, xzr, #0x7fffffffffffffff +; CHECK-NEXT: mov x8, #9223372036854775807 ; CHECK-NEXT: csinc w13, w13, wzr, ne ; CHECK-NEXT: cinv x14, x8, ge ; CHECK-NEXT: cmp w10, w13 diff --git a/test/CodeGen/AArch64/swifterror.ll b/test/CodeGen/AArch64/swifterror.ll index 55237b2f40f7..734a2016338b 100644 --- a/test/CodeGen/AArch64/swifterror.ll +++ b/test/CodeGen/AArch64/swifterror.ll @@ -9,19 +9,19 @@ declare void @free(i8*) ; that takes a swifterror parameter and "caller" is the caller of "foo". define float @foo(%swift_error** swifterror %error_ptr_ref) { ; CHECK-APPLE-LABEL: foo: -; CHECK-APPLE: orr w0, wzr, #0x10 +; CHECK-APPLE: mov w0, #16 ; CHECK-APPLE: malloc -; CHECK-APPLE: orr [[ID:w[0-9]+]], wzr, #0x1 +; CHECK-APPLE: mov [[ID:w[0-9]+]], #1 ; CHECK-APPLE: strb [[ID]], [x0, #8] ; CHECK-APPLE: mov x21, x0 ; CHECK-APPLE-NOT: x21 ; CHECK-O0-LABEL: foo: -; CHECK-O0: orr w{{.*}}, wzr, #0x10 +; CHECK-O0: mov w{{.*}}, #16 ; CHECK-O0: malloc ; CHECK-O0: mov x1, x0 ; CHECK-O0-NOT: x1 -; CHECK-O0: orr [[ID:w[0-9]+]], wzr, #0x1 +; CHECK-O0: mov [[ID:w[0-9]+]], #1 ; CHECK-O0: strb [[ID]], [x0, #8] ; CHECK-O0: mov x21, x1 entry: @@ -118,9 +118,9 @@ handler: define float @foo_if(%swift_error** swifterror %error_ptr_ref, i32 %cc) { ; CHECK-APPLE-LABEL: foo_if: ; CHECK-APPLE: cbz w0 -; CHECK-APPLE: orr w0, wzr, #0x10 +; CHECK-APPLE: mov w0, #16 ; CHECK-APPLE: malloc -; CHECK-APPLE: orr [[ID:w[0-9]+]], wzr, #0x1 +; CHECK-APPLE: mov [[ID:w[0-9]+]], #1 ; CHECK-APPLE: strb [[ID]], [x0, #8] ; CHECK-APPLE: mov x21, x0 ; CHECK-APPLE-NOT: x21 @@ -130,10 +130,10 @@ define float @foo_if(%swift_error** swifterror %error_ptr_ref, i32 %cc) { ; spill x21 ; CHECK-O0: str x21, [sp, [[SLOT:#[0-9]+]]] ; CHECK-O0: cbz w0 -; CHECK-O0: orr w{{.*}}, wzr, #0x10 +; CHECK-O0: mov w{{.*}}, #16 ; CHECK-O0: malloc ; CHECK-O0: mov [[ID:x[0-9]+]], x0 -; CHECK-O0: orr [[ID2:w[0-9]+]], wzr, #0x1 +; CHECK-O0: mov [[ID2:w[0-9]+]], #1 ; CHECK-O0: strb [[ID2]], [x0, #8] ; CHECK-O0: mov x21, [[ID]] ; CHECK-O0: ret @@ -163,7 +163,7 @@ define float @foo_loop(%swift_error** swifterror %error_ptr_ref, i32 %cc, float ; CHECK-APPLE-LABEL: foo_loop: ; CHECK-APPLE: mov x0, x21 ; CHECK-APPLE: cbz -; CHECK-APPLE: orr w0, wzr, #0x10 +; CHECK-APPLE: mov w0, #16 ; CHECK-APPLE: malloc ; CHECK-APPLE: strb w{{.*}}, [x0, #8] ; CHECK-APPLE: fcmp @@ -179,7 +179,7 @@ define float @foo_loop(%swift_error** swifterror %error_ptr_ref, i32 %cc, float ; CHECK-O0: ldr x0, [sp, [[SLOT]]] ; CHECK-O0: str x0, [sp, [[SLOT2:#[0-9]+]]] ; CHECK-O0: cbz {{.*}}, [[BB2:[A-Za-z0-9_]*]] -; CHECK-O0: orr w{{.*}}, wzr, #0x10 +; CHECK-O0: mov w{{.*}}, #16 ; CHECK-O0: malloc ; CHECK-O0: mov [[ID:x[0-9]+]], x0 ; CHECK-O0: strb w{{.*}}, [{{.*}}[[ID]], #8] @@ -223,22 +223,22 @@ bb_end: define void @foo_sret(%struct.S* sret %agg.result, i32 %val1, %swift_error** swifterror %error_ptr_ref) { ; CHECK-APPLE-LABEL: foo_sret: ; CHECK-APPLE: mov [[SRET:x[0-9]+]], x8 -; CHECK-APPLE: orr w0, wzr, #0x10 +; CHECK-APPLE: mov w0, #16 ; CHECK-APPLE: malloc -; CHECK-APPLE: orr [[ID:w[0-9]+]], wzr, #0x1 +; CHECK-APPLE: mov [[ID:w[0-9]+]], #1 ; CHECK-APPLE: strb [[ID]], [x0, #8] ; CHECK-APPLE: str w{{.*}}, [{{.*}}[[SRET]], #4] ; CHECK-APPLE: mov x21, x0 ; CHECK-APPLE-NOT: x21 ; CHECK-O0-LABEL: foo_sret: -; CHECK-O0: orr w{{.*}}, wzr, #0x10 +; 
CHECK-O0: mov w{{.*}}, #16 ; spill x8 ; CHECK-O0-DAG: str x8 ; spill x21 ; CHECK-O0-DAG: str x21 ; CHECK-O0: malloc -; CHECK-O0: orr [[ID:w[0-9]+]], wzr, #0x1 +; CHECK-O0: mov [[ID:w[0-9]+]], #1 ; CHECK-O0: strb [[ID]], [x0, #8] ; reload from stack ; CHECK-O0: ldr [[SRET:x[0-9]+]] @@ -306,9 +306,9 @@ handler: declare void @llvm.va_start(i8*) nounwind define float @foo_vararg(%swift_error** swifterror %error_ptr_ref, ...) { ; CHECK-APPLE-LABEL: foo_vararg: -; CHECK-APPLE: orr w0, wzr, #0x10 +; CHECK-APPLE: mov w0, #16 ; CHECK-APPLE: malloc -; CHECK-APPLE-DAG: orr [[ID:w[0-9]+]], wzr, #0x1 +; CHECK-APPLE-DAG: mov [[ID:w[0-9]+]], #1 ; CHECK-APPLE-DAG: add [[ARGS:x[0-9]+]], [[TMP:x[0-9]+]], #16 ; CHECK-APPLE-DAG: strb [[ID]], [x0, #8] @@ -439,14 +439,14 @@ define swiftcc void @swifterror_reg_clobber(%swift_error** nocapture %err) { ; CHECK-APPLE: mov x19, x1 ; CHECK-APPLE: mov x22, x0 ; Setup call. -; CHECK-APPLE: orr w0, wzr, #0x1 -; CHECK-APPLE: orr w1, wzr, #0x2 -; CHECK-APPLE: orr w2, wzr, #0x3 -; CHECK-APPLE: orr w3, wzr, #0x4 +; CHECK-APPLE: mov w0, #1 +; CHECK-APPLE: mov w1, #2 +; CHECK-APPLE: mov w2, #3 +; CHECK-APPLE: mov w3, #4 ; CHECK-APPLE: mov w4, #5 -; CHECK-APPLE: orr w5, wzr, #0x6 -; CHECK-APPLE: orr w6, wzr, #0x7 -; CHECK-APPLE: orr w7, wzr, #0x8 +; CHECK-APPLE: mov w5, #6 +; CHECK-APPLE: mov w6, #7 +; CHECK-APPLE: mov w7, #8 ; CHECK-APPLE: mov x20, xzr ; CHECK-APPLE: mov x21, xzr ; CHECK-APPLE: bl _params_in_reg2 @@ -505,14 +505,14 @@ declare swiftcc void @params_in_reg2(i64, i64, i64, i64, i64, i64, i64, i64, i8* ; CHECK-APPLE: mov x19, x1 ; CHECK-APPLE: mov x22, x0 ; Setup call arguments. -; CHECK-APPLE: orr w0, wzr, #0x1 -; CHECK-APPLE: orr w1, wzr, #0x2 -; CHECK-APPLE: orr w2, wzr, #0x3 -; CHECK-APPLE: orr w3, wzr, #0x4 +; CHECK-APPLE: mov w0, #1 +; CHECK-APPLE: mov w1, #2 +; CHECK-APPLE: mov w2, #3 +; CHECK-APPLE: mov w3, #4 ; CHECK-APPLE: mov w4, #5 -; CHECK-APPLE: orr w5, wzr, #0x6 -; CHECK-APPLE: orr w6, wzr, #0x7 -; CHECK-APPLE: orr w7, wzr, #0x8 +; CHECK-APPLE: mov w5, #6 +; CHECK-APPLE: mov w6, #7 +; CHECK-APPLE: mov w7, #8 ; CHECK-APPLE: mov x20, xzr ; CHECK-APPLE: mov x21, xzr ; CHECK-APPLE: bl _params_in_reg2 @@ -541,15 +541,15 @@ declare swiftcc void @params_in_reg2(i64, i64, i64, i64, i64, i64, i64, i64, i8* ; Save swifterror %err. ; CHECK-APPLE: str x21, [sp, #24] ; Setup call. -; CHECK-APPLE: orr w0, wzr, #0x1 -; CHECK-APPLE: orr w1, wzr, #0x2 -; CHECK-APPLE: orr w2, wzr, #0x3 -; CHECK-APPLE: orr w3, wzr, #0x4 +; CHECK-APPLE: mov w0, #1 +; CHECK-APPLE: mov w1, #2 +; CHECK-APPLE: mov w2, #3 +; CHECK-APPLE: mov w3, #4 ; CHECK-APPLE: mov w4, #5 -; CHECK-APPLE: orr w5, wzr, #0x6 -; CHECK-APPLE: orr w6, wzr, #0x7 -; CHECK-APPLE: orr w7, wzr, #0x8 -; CHECK-APPLE: mov x20, xzr +; CHECK-APPLE: mov w5, #6 +; CHECK-APPLE: mov w6, #7 +; CHECK-APPLE: mov w7, #8 +; CHECK-APPLE: mov x20, xzr ; ... setup call with swifterror %error_ptr_ref. 
; CHECK-APPLE: ldr x21, [sp, #8] ; CHECK-APPLE: bl _params_in_reg2 diff --git a/test/CodeGen/AArch64/tst-br.ll b/test/CodeGen/AArch64/tst-br.ll index 2472bf45b6a9..5c9778c6ff83 100644 --- a/test/CodeGen/AArch64/tst-br.ll +++ b/test/CodeGen/AArch64/tst-br.ll @@ -36,13 +36,13 @@ test3: ; CHECK: tbz {{[wx][0-9]+}}, #12, [[LBL_end1]] end2: -; CHECK: {{movz x0, #1|orr w0, wzr, #0x1}} +; CHECK: mov w0, #1 ; CHECK-NEXT: ret ret i32 1 end1: ; CHECK: [[LBL_end1]]: -; CHECK-NEXT: {{mov x0, xzr|mov w0, wzr}} +; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: ret ret i32 0 } diff --git a/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll b/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll index 57d41d6fa891..1da7c0230e3c 100644 --- a/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll +++ b/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll @@ -172,9 +172,7 @@ define <4 x i32> @test_urem_both(<4 x i32> %X) nounwind readnone { define <4 x i32> @test_urem_div_undef(<4 x i32> %X) nounwind readnone { ; CHECK-LABEL: test_urem_div_undef: ; CHECK: // %bb.0: -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: movi v0.2d, #0000000000000000 ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, @@ -207,9 +205,7 @@ define <4 x i32> @test_urem_comp_undef(<4 x i32> %X) nounwind readnone { define <4 x i32> @test_urem_both_undef(<4 x i32> %X) nounwind readnone { ; CHECK-LABEL: test_urem_both_undef: ; CHECK: // %bb.0: -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: movi v0.2d, #0000000000000000 ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, diff --git a/test/CodeGen/AArch64/urem-seteq.ll b/test/CodeGen/AArch64/urem-seteq.ll index ee85dad6fb78..c5483a220c4e 100644 --- a/test/CodeGen/AArch64/urem-seteq.ll +++ b/test/CodeGen/AArch64/urem-seteq.ll @@ -75,7 +75,7 @@ define i16 @test_urem_even(i16 %X) nounwind readnone { ; CHECK-NEXT: umull x9, w9, w10 ; CHECK-NEXT: and w8, w0, #0xffff ; CHECK-NEXT: lsr x9, x9, #34 -; CHECK-NEXT: orr w10, wzr, #0xe +; CHECK-NEXT: mov w10, #14 ; CHECK-NEXT: msub w8, w9, w10, w8 ; CHECK-NEXT: cmp w8, #0 // =0 ; CHECK-NEXT: cset w0, ne @@ -129,7 +129,7 @@ define i32 @test_urem_even_bit31(i32 %X) nounwind readnone { define i32 @test_urem_one(i32 %X) nounwind readnone { ; CHECK-LABEL: test_urem_one: ; CHECK: // %bb.0: -; CHECK-NEXT: orr w0, wzr, #0x1 +; CHECK-NEXT: mov w0, #1 ; CHECK-NEXT: ret %urem = urem i32 %X, 1 %cmp = icmp eq i32 %urem, 0 diff --git a/test/CodeGen/AArch64/vec_cttz.ll b/test/CodeGen/AArch64/vec_cttz.ll index 68efa0de9f8d..39e0301677da 100644 --- a/test/CodeGen/AArch64/vec_cttz.ll +++ b/test/CodeGen/AArch64/vec_cttz.ll @@ -54,7 +54,7 @@ define <2 x i32> @cttz_v2i32(<2 x i32> %a) nounwind { define <1 x i64> @cttz_v1i64(<1 x i64> %a) nounwind { ; CHECK-LABEL: cttz_v1i64: ; CHECK: // %bb.0: -; CHECK-NEXT: orr w8, wzr, #0x1 +; CHECK-NEXT: mov w8, #1 ; CHECK-NEXT: fmov d1, x8 ; CHECK-NEXT: sub d1, d0, d1 ; CHECK-NEXT: bic v0.8b, v1.8b, v0.8b @@ -110,7 +110,7 @@ define <4 x i32> @cttz_v4i32(<4 x i32> %a) nounwind { define <2 x i64> @cttz_v2i64(<2 x i64> %a) nounwind { ; CHECK-LABEL: cttz_v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: orr w8, wzr, #0x1 +; CHECK-NEXT: mov w8, #1 ; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: sub v1.2d, v0.2d, v1.2d ; CHECK-NEXT: bic v0.16b, v1.16b, v0.16b diff --git a/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll b/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll index 
44260d536322..90768bb5351d 100644 --- a/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll +++ b/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll @@ -47,7 +47,7 @@ define fp128 @test_v1f128(<1 x fp128> %a) nounwind { define float @test_v3f32(<3 x float> %a) nounwind { ; CHECK-LABEL: test_v3f32: ; CHECK: // %bb.0: -; CHECK-NEXT: orr w8, wzr, #0x7f800000 +; CHECK-NEXT: mov w8, #2139095040 ; CHECK-NEXT: fmov s1, w8 ; CHECK-NEXT: mov v0.s[3], v1.s[0] ; CHECK-NEXT: fmaxnmv s0, v0.4s diff --git a/test/CodeGen/AArch64/wineh-try-catch-cbz.ll b/test/CodeGen/AArch64/wineh-try-catch-cbz.ll index 7c64328f0a7d..d84c07f8bc1a 100644 --- a/test/CodeGen/AArch64/wineh-try-catch-cbz.ll +++ b/test/CodeGen/AArch64/wineh-try-catch-cbz.ll @@ -7,7 +7,7 @@ ; CHECK: sub sp, sp, #32 ; CHECK-NEXT: stp x29, x30, [sp, #16] ; CHECK-NEXT: add x29, sp, #16 -; CHECK-NEXT: orr x1, xzr, #0xfffffffffffffffe +; CHECK-NEXT: mov x1, #-2 ; CHECK-NEXT: stur x1, [x29, #-16] ; CHECK-NEXT: cbz w0, .LBB0_2 diff --git a/test/CodeGen/AArch64/wineh-try-catch.ll b/test/CodeGen/AArch64/wineh-try-catch.ll index f4bb9d50a434..7185637a1175 100644 --- a/test/CodeGen/AArch64/wineh-try-catch.ll +++ b/test/CodeGen/AArch64/wineh-try-catch.ll @@ -22,12 +22,12 @@ ; CHECK: add x29, sp, #32 ; CHECK: sub sp, sp, #624 ; CHECK: mov x19, sp -; CHECK: orr x0, xzr, #0xfffffffffffffffe +; CHECK: mov x0, #-2 ; CHECK: stur x0, [x19] ; Now check that x is stored at fp - 20. We check that this is the same ; location accessed from the funclet to retrieve x. -; CHECK: orr w8, wzr, #0x1 +; CHECK: mov w8, #1 ; CHECK: stur w8, [x29, [[X_OFFSET:#-[1-9][0-9]+]] ; Check the offset off the frame pointer at which B is located. diff --git a/test/CodeGen/AMDGPU/atomicrmw-nand.ll b/test/CodeGen/AMDGPU/atomicrmw-nand.ll index 7af33416e170..3d457fdd50e8 100644 --- a/test/CodeGen/AMDGPU/atomicrmw-nand.ll +++ b/test/CodeGen/AMDGPU/atomicrmw-nand.ll @@ -12,8 +12,10 @@ define i32 @atomic_nand_i32_lds(i32 addrspace(3)* %ptr) nounwind { ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_not_b32_e32 v1, v2 ; GCN-NEXT: v_or_b32_e32 v1, -5, v1 +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_wbinvl1_vol ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GCN-NEXT: v_mov_b32_e32 v2, v1 ; GCN-NEXT: s_or_b64 s[6:7], vcc, s[6:7] diff --git a/test/CodeGen/AMDGPU/collapse-endcf.ll b/test/CodeGen/AMDGPU/collapse-endcf.ll index 76b8f2d42da3..23dc712d5c92 100644 --- a/test/CodeGen/AMDGPU/collapse-endcf.ll +++ b/test/CodeGen/AMDGPU/collapse-endcf.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s ; GCN-LABEL: {{^}}simple_nested_if: ; GCN: s_and_saveexec_b64 [[SAVEEXEC:s\[[0-9:]+\]]] @@ -9,7 +9,9 @@ ; GCN-NEXT: {{^BB[0-9_]+}}: ; GCN: store_dword ; GCN-NEXT: {{^}}[[ENDIF]]: -; GCN-NEXT: s_endpgm +; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC]] +; GCN: ds_write_b32 +; GCN: s_endpgm define amdgpu_kernel void @simple_nested_if(i32 addrspace(1)* nocapture %arg) { bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -29,6 +31,7 @@ bb.inner.then: ; preds = %bb.outer.then br label %bb.outer.end bb.outer.end: ; preds = %bb.outer.then, %bb.inner.then, %bb + store i32 3, i32 addrspace(3)* null ret void } @@ -44,7 +47,9 @@ bb.outer.end: ; preds = %bb.outer.then, %bb. 
; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC_INNER]] ; GCN: store_dword ; GCN-NEXT: {{^}}[[ENDIF_OUTER]]: -; GCN-NEXT: s_endpgm +; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC_OUTER]] +; GCN: ds_write_b32 +; GCN: s_endpgm define amdgpu_kernel void @uncollapsable_nested_if(i32 addrspace(1)* nocapture %arg) { bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -70,6 +75,7 @@ bb.inner.end: ; preds = %bb.inner.then, %bb. br label %bb.outer.end bb.outer.end: ; preds = %bb.inner.then, %bb + store i32 3, i32 addrspace(3)* null ret void } @@ -88,7 +94,9 @@ bb.outer.end: ; preds = %bb.inner.then, %bb ; GCN-NEXT: ; mask branch [[ENDIF_OUTER]] ; GCN: store_dword ; GCN-NEXT: {{^}}[[ENDIF_OUTER]]: -; GCN-NEXT: s_endpgm +; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC_OUTER]] +; GCN: ds_write_b32 +; GCN: s_endpgm define amdgpu_kernel void @nested_if_if_else(i32 addrspace(1)* nocapture %arg) { bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -114,6 +122,7 @@ bb.else: ; preds = %bb.outer.then br label %bb.outer.end bb.outer.end: ; preds = %bb, %bb.then, %bb.else + store i32 3, i32 addrspace(3)* null ret void } @@ -138,11 +147,15 @@ bb.outer.end: ; preds = %bb, %bb.then, %b ; GCN-NEXT: {{^BB[0-9_]+}}: ; GCN: store_dword ; GCN-NEXT: s_and_saveexec_b64 [[SAVEEXEC_INNER_IF_OUTER_THEN:s\[[0-9:]+\]]] -; GCN-NEXT: ; mask branch [[ENDIF_OUTER]] +; GCN-NEXT: ; mask branch [[FLOW1:BB[0-9_]+]] ; GCN-NEXT: {{^BB[0-9_]+}}: ; GCN: store_dword +; GCN-NEXT: [[FLOW1]]: +; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC_INNER_IF_OUTER_THEN]] ; GCN-NEXT: {{^}}[[ENDIF_OUTER]]: -; GCN-NEXT: s_endpgm +; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC_OUTER]] +; GCN: ds_write_b32 +; GCN: s_endpgm define amdgpu_kernel void @nested_if_else_if(i32 addrspace(1)* nocapture %arg) { bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -174,6 +187,7 @@ bb.inner.then2: br label %bb.outer.end bb.outer.end: + store i32 3, i32 addrspace(3)* null ret void } diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll index 18abf607aea5..25874cd72622 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll @@ -27,9 +27,9 @@ bb: %tmp1 = zext i32 %tmp to i64 %tmp2 = getelementptr inbounds [448 x i32], [448 x i32] addrspace(3)* @0, i32 0, i32 %tmp %tmp3 = load i32, i32 addrspace(3)* %tmp2, align 4 - fence syncscope("workgroup") release + fence syncscope("workgroup-one-as") release tail call void @llvm.amdgcn.s.barrier() - fence syncscope("workgroup") acquire + fence syncscope("workgroup-one-as") acquire %tmp4 = add nsw i32 %tmp3, %tmp3 %tmp5 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp4, i32 177, i32 15, i32 15, i1 zeroext false) %tmp6 = add nsw i32 %tmp5, %tmp4 diff --git a/test/CodeGen/AMDGPU/local-atomics-fp.ll b/test/CodeGen/AMDGPU/local-atomics-fp.ll index 17c45c1a2f44..9ba655592859 100644 --- a/test/CodeGen/AMDGPU/local-atomics-fp.ll +++ b/test/CodeGen/AMDGPU/local-atomics-fp.ll @@ -34,7 +34,7 @@ define void @lds_atomic_fadd_noret_f32(float addrspace(3)* %ptr) nounwind { ; HAS-ATOMICS-DAG: v_mov_b32_e32 [[V0:v[0-9]+]], 0x42280000 ; HAS-ATOMICS: ds_add_rtn_f32 [[V2:v[0-9]+]], [[V1:v[0-9]+]], [[V0]] offset:32 ; HAS-ATOMICS: ds_add_f32 [[V3:v[0-9]+]], [[V0]] offset:64 -; HAS-ATOMICS: s_waitcnt lgkmcnt(1) +; HAS-ATOMICS: s_waitcnt vmcnt(0) lgkmcnt(0) ; HAS-ATOMICS: ds_add_rtn_f32 {{v[0-9]+}}, {{v[0-9]+}}, [[V2]] define amdgpu_kernel void @lds_ds_fadd(float addrspace(1)* %out, float addrspace(3)* %ptrf, i32 %idx) { %idx.add = add nuw 
i32 %idx, 4 @@ -49,6 +49,27 @@ define amdgpu_kernel void @lds_ds_fadd(float addrspace(1)* %out, float addrspace ret void } +; GCN-LABEL: {{^}}lds_ds_fadd_one_as: +; VI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 +; HAS-ATOMICS-DAG: v_mov_b32_e32 [[V0:v[0-9]+]], 0x42280000 +; HAS-ATOMICS: ds_add_rtn_f32 [[V2:v[0-9]+]], [[V1:v[0-9]+]], [[V0]] offset:32 +; HAS-ATOMICS: ds_add_f32 [[V3:v[0-9]+]], [[V0]] offset:64 +; HAS-ATOMICS: s_waitcnt lgkmcnt(1) +; HAS-ATOMICS: ds_add_rtn_f32 {{v[0-9]+}}, {{v[0-9]+}}, [[V2]] +define amdgpu_kernel void @lds_ds_fadd_one_as(float addrspace(1)* %out, float addrspace(3)* %ptrf, i32 %idx) { + %idx.add = add nuw i32 %idx, 4 + %shl0 = shl i32 %idx.add, 3 + %shl1 = shl i32 %idx.add, 4 + %ptr0 = inttoptr i32 %shl0 to float addrspace(3)* + %ptr1 = inttoptr i32 %shl1 to float addrspace(3)* + %a1 = atomicrmw fadd float addrspace(3)* %ptr0, float 4.2e+1 syncscope("one-as") seq_cst + %a2 = atomicrmw fadd float addrspace(3)* %ptr1, float 4.2e+1 syncscope("one-as") seq_cst + %a3 = atomicrmw fadd float addrspace(3)* %ptrf, float %a1 syncscope("one-as") seq_cst + store float %a3, float addrspace(1)* %out + ret void +} + ; GCN-LABEL: {{^}}lds_atomic_fadd_ret_f64: ; GCN: ds_read_b64 ; GCN: v_add_f64 diff --git a/test/CodeGen/AMDGPU/memory-legalizer-amdpal.ll b/test/CodeGen/AMDGPU/memory-legalizer-amdpal.ll index f692c763c0b9..431ca021a779 100644 --- a/test/CodeGen/AMDGPU/memory-legalizer-amdpal.ll +++ b/test/CodeGen/AMDGPU/memory-legalizer-amdpal.ll @@ -6,7 +6,7 @@ ; FUNC-LABEL: {{^}}system_acquire: ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NEXT: buffer_wbinvl1{{$}} ; GCN: s_endpgm define amdgpu_kernel void @system_acquire() { @@ -18,7 +18,7 @@ entry: ; FUNC-LABEL: {{^}}system_release: ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: s_endpgm define amdgpu_kernel void @system_release() { entry: @@ -29,7 +29,7 @@ entry: ; FUNC-LABEL: {{^}}system_acq_rel: ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: buffer_wbinvl1{{$}} ; GCN: s_endpgm define amdgpu_kernel void @system_acq_rel() { @@ -41,7 +41,7 @@ entry: ; FUNC-LABEL: {{^}}system_seq_cst: ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: buffer_wbinvl1{{$}} ; GCN: s_endpgm define amdgpu_kernel void @system_seq_cst() { @@ -50,6 +50,53 @@ entry: ret void } +; FUNC-LABEL: {{^}}system_one_as_acquire: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: buffer_wbinvl1{{$}} +; GCN: s_endpgm +define amdgpu_kernel void @system_one_as_acquire() { +entry: + fence syncscope("one-as") acquire + ret void +} + +; FUNC-LABEL: {{^}}system_one_as_release: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_endpgm +define amdgpu_kernel void @system_one_as_release() { +entry: + fence syncscope("one-as") release + ret void +} + +; FUNC-LABEL: {{^}}system_one_as_acq_rel: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: buffer_wbinvl1{{$}} +; GCN: s_endpgm +define amdgpu_kernel void @system_one_as_acq_rel() { +entry: + fence syncscope("one-as") acq_rel + ret void +} + +; FUNC-LABEL: {{^}}system_one_as_seq_cst: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: buffer_wbinvl1{{$}} +; GCN: s_endpgm +define amdgpu_kernel void @system_one_as_seq_cst() { +entry: + 
fence syncscope("one-as") seq_cst + ret void +} + ; FUNC-LABEL: {{^}}singlethread_acquire: ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE @@ -90,10 +137,50 @@ entry: ret void } +; FUNC-LABEL: {{^}}singlethread_one_as_acquire: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @singlethread_one_as_acquire() { +entry: + fence syncscope("singlethread-one-as") acquire + ret void +} + +; FUNC-LABEL: {{^}}singlethread_one_as_release: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @singlethread_one_as_release() { +entry: + fence syncscope("singlethread-one-as") release + ret void +} + +; FUNC-LABEL: {{^}}singlethread_one_as_acq_rel: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @singlethread_one_as_acq_rel() { +entry: + fence syncscope("singlethread-one-as") acq_rel + ret void +} + +; FUNC-LABEL: {{^}}singlethread_one_as_seq_cst: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @singlethread_one_as_seq_cst() { +entry: + fence syncscope("singlethread-one-as") seq_cst + ret void +} + ; FUNC-LABEL: {{^}}agent_acquire: ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NEXT: buffer_wbinvl1{{$}} ; GCN: s_endpgm define amdgpu_kernel void @agent_acquire() { @@ -105,7 +192,7 @@ entry: ; FUNC-LABEL: {{^}}agent_release: ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: s_endpgm define amdgpu_kernel void @agent_release() { entry: @@ -116,7 +203,7 @@ entry: ; FUNC-LABEL: {{^}}agent_acq_rel: ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: buffer_wbinvl1{{$}} ; GCN: s_endpgm define amdgpu_kernel void @agent_acq_rel() { @@ -128,7 +215,7 @@ entry: ; FUNC-LABEL: {{^}}agent_seq_cst: ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: buffer_wbinvl1{{$}} ; GCN: s_endpgm define amdgpu_kernel void @agent_seq_cst() { @@ -137,9 +224,56 @@ entry: ret void } +; FUNC-LABEL: {{^}}agent_one_as_acquire: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: buffer_wbinvl1{{$}} +; GCN: s_endpgm +define amdgpu_kernel void @agent_one_as_acquire() { +entry: + fence syncscope("agent-one-as") acquire + ret void +} + +; FUNC-LABEL: {{^}}agent_one_as_release: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_endpgm +define amdgpu_kernel void @agent_one_as_release() { +entry: + fence syncscope("agent-one-as") release + ret void +} + +; FUNC-LABEL: {{^}}agent_one_as_acq_rel: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: buffer_wbinvl1{{$}} +; GCN: s_endpgm +define amdgpu_kernel void @agent_one_as_acq_rel() { +entry: + fence syncscope("agent-one-as") acq_rel + ret void +} + +; FUNC-LABEL: {{^}}agent_one_as_seq_cst: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: buffer_wbinvl1{{$}} +; GCN: s_endpgm +define amdgpu_kernel void @agent_one_as_seq_cst() { +entry: + fence syncscope("agent-one-as") seq_cst + ret void +} + ; FUNC-LABEL: {{^}}workgroup_acquire: ; GCN: %bb.0 -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: ATOMIC_FENCE ; GCN: s_endpgm define amdgpu_kernel void @workgroup_acquire() { @@ -150,7 +284,7 @@ entry: ; FUNC-LABEL: {{^}}workgroup_release: ; GCN: %bb.0 -; 
GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: ATOMIC_FENCE ; GCN: s_endpgm define amdgpu_kernel void @workgroup_release() { @@ -161,7 +295,7 @@ entry: ; FUNC-LABEL: {{^}}workgroup_acq_rel: ; GCN: %bb.0 -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: ATOMIC_FENCE ; GCN: s_endpgm define amdgpu_kernel void @workgroup_acq_rel() { @@ -172,7 +306,7 @@ entry: ; FUNC-LABEL: {{^}}workgroup_seq_cst: ; GCN: %bb.0 -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: ATOMIC_FENCE ; GCN: s_endpgm define amdgpu_kernel void @workgroup_seq_cst() { @@ -181,6 +315,50 @@ entry: ret void } +; FUNC-LABEL: {{^}}workgroup_one_as_acquire: +; GCN: %bb.0 +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @workgroup_one_as_acquire() { +entry: + fence syncscope("workgroup-one-as") acquire + ret void +} + +; FUNC-LABEL: {{^}}workgroup_one_as_release: +; GCN: %bb.0 +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @workgroup_one_as_release() { +entry: + fence syncscope("workgroup-one-as") release + ret void +} + +; FUNC-LABEL: {{^}}workgroup_one_as_acq_rel: +; GCN: %bb.0 +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @workgroup_one_as_acq_rel() { +entry: + fence syncscope("workgroup-one-as") acq_rel + ret void +} + +; FUNC-LABEL: {{^}}workgroup_one_as_seq_cst: +; GCN: %bb.0 +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @workgroup_one_as_seq_cst() { +entry: + fence syncscope("workgroup-one-as") seq_cst + ret void +} + ; FUNC-LABEL: {{^}}wavefront_acquire: ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE @@ -220,3 +398,43 @@ entry: fence syncscope("wavefront") seq_cst ret void } + +; FUNC-LABEL: {{^}}wavefront_one_as_acquire: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @wavefront_one_as_acquire() { +entry: + fence syncscope("wavefront-one-as") acquire + ret void +} + +; FUNC-LABEL: {{^}}wavefront_one_as_release: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @wavefront_one_as_release() { +entry: + fence syncscope("wavefront-one-as") release + ret void +} + +; FUNC-LABEL: {{^}}wavefront_one_as_acq_rel: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @wavefront_one_as_acq_rel() { +entry: + fence syncscope("wavefront-one-as") acq_rel + ret void +} + +; FUNC-LABEL: {{^}}wavefront_one_as_seq_cst: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @wavefront_one_as_seq_cst() { +entry: + fence syncscope("wavefront-one-as") seq_cst + ret void +} diff --git a/test/CodeGen/AMDGPU/memory-legalizer-atomic-cmpxchg.ll b/test/CodeGen/AMDGPU/memory-legalizer-atomic-cmpxchg.ll index 733540537990..cf07eb2035b1 100644 --- a/test/CodeGen/AMDGPU/memory-legalizer-atomic-cmpxchg.ll +++ b/test/CodeGen/AMDGPU/memory-legalizer-atomic-cmpxchg.ll @@ -2,9 +2,9 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s ; GCN-LABEL: {{^}}system_monotonic_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) 
lgkmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @system_monotonic_monotonic( i32* %out, i32 %in, i32 %old) { @@ -15,9 +15,9 @@ entry: } ; GCN-LABEL: {{^}}system_acquire_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NEXT: buffer_wbinvl1_vol define amdgpu_kernel void @system_acquire_monotonic( i32* %out, i32 %in, i32 %old) { @@ -28,9 +28,9 @@ entry: } ; GCN-LABEL: {{^}}system_release_monotonic: -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @system_release_monotonic( i32* %out, i32 %in, i32 %old) { @@ -41,9 +41,9 @@ entry: } ; GCN-LABEL: {{^}}system_acq_rel_monotonic: -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NEXT: buffer_wbinvl1_vol define amdgpu_kernel void @system_acq_rel_monotonic( i32* %out, i32 %in, i32 %old) { @@ -54,9 +54,9 @@ entry: } ; GCN-LABEL: {{^}}system_seq_cst_monotonic: -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NEXT: buffer_wbinvl1_vol define amdgpu_kernel void @system_seq_cst_monotonic( i32* %out, i32 %in, i32 %old) { @@ -67,9 +67,9 @@ entry: } ; GCN-LABEL: {{^}}system_acquire_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NEXT: buffer_wbinvl1_vol define amdgpu_kernel void @system_acquire_acquire( i32* %out, i32 %in, i32 %old) { @@ -80,9 +80,9 @@ entry: } ; GCN-LABEL: {{^}}system_release_acquire: -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NEXT: buffer_wbinvl1_vol define amdgpu_kernel void @system_release_acquire( i32* %out, i32 %in, i32 %old) { @@ -93,9 +93,9 @@ entry: } ; GCN-LABEL: {{^}}system_acq_rel_acquire: -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NEXT: buffer_wbinvl1_vol define amdgpu_kernel void @system_acq_rel_acquire( i32* %out, i32 %in, i32 %old) { @@ -106,9 +106,9 @@ entry: } ; GCN-LABEL: {{^}}system_seq_cst_acquire: -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} 
-; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NEXT: buffer_wbinvl1_vol define amdgpu_kernel void @system_seq_cst_acquire( i32* %out, i32 %in, i32 %old) { @@ -119,9 +119,9 @@ entry: } ; GCN-LABEL: {{^}}system_seq_cst_seq_cst: -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NEXT: buffer_wbinvl1_vol define amdgpu_kernel void @system_seq_cst_seq_cst( i32* %out, i32 %in, i32 %old) { @@ -131,11 +131,141 @@ entry: ret void } -; GCN-LABEL: {{^}}singlethread_monotonic_monotonic: +; GCN-LABEL: {{^}}system_one_as_monotonic_monotonic: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @system_one_as_monotonic_monotonic( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") monotonic monotonic + ret void +} + +; GCN-LABEL: {{^}}system_one_as_acquire_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +define amdgpu_kernel void @system_one_as_acquire_monotonic( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acquire monotonic + ret void +} + +; GCN-LABEL: {{^}}system_one_as_release_monotonic: +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @system_one_as_release_monotonic( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") release monotonic + ret void +} + +; GCN-LABEL: {{^}}system_one_as_acq_rel_monotonic: +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +define amdgpu_kernel void @system_one_as_acq_rel_monotonic( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel monotonic + ret void +} + +; GCN-LABEL: {{^}}system_one_as_seq_cst_monotonic: +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +define amdgpu_kernel void @system_one_as_seq_cst_monotonic( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst monotonic + ret void +} + +; GCN-LABEL: {{^}}system_one_as_acquire_acquire: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0){{$}} 
+; GFX8-NEXT: buffer_wbinvl1_vol +define amdgpu_kernel void @system_one_as_acquire_acquire( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acquire acquire + ret void +} + +; GCN-LABEL: {{^}}system_one_as_release_acquire: +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +define amdgpu_kernel void @system_one_as_release_acquire( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") release acquire + ret void +} + +; GCN-LABEL: {{^}}system_one_as_acq_rel_acquire: +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +define amdgpu_kernel void @system_one_as_acq_rel_acquire( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel acquire + ret void +} + +; GCN-LABEL: {{^}}system_one_as_seq_cst_acquire: +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +define amdgpu_kernel void @system_one_as_seq_cst_acquire( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst acquire + ret void +} + +; GCN-LABEL: {{^}}system_one_as_seq_cst_seq_cst: +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +define amdgpu_kernel void @system_one_as_seq_cst_seq_cst( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst seq_cst + ret void +} + +; GCN-LABEL: {{^}}singlethread_monotonic_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @singlethread_monotonic_monotonic( i32* %out, i32 %in, i32 %old) { entry: @@ -145,9 +275,9 @@ entry: } ; GCN-LABEL: {{^}}singlethread_acquire_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @singlethread_acquire_monotonic( i32* %out, i32 %in, i32 %old) { @@ -158,9 +288,9 @@ entry: } ; GCN-LABEL: {{^}}singlethread_release_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: 
buffer_wbinvl1_vol define amdgpu_kernel void @singlethread_release_monotonic( i32* %out, i32 %in, i32 %old) { @@ -171,9 +301,9 @@ entry: } ; GCN-LABEL: {{^}}singlethread_acq_rel_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @singlethread_acq_rel_monotonic( i32* %out, i32 %in, i32 %old) { @@ -184,9 +314,9 @@ entry: } ; GCN-LABEL: {{^}}singlethread_seq_cst_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @singlethread_seq_cst_monotonic( i32* %out, i32 %in, i32 %old) { @@ -197,9 +327,9 @@ entry: } ; GCN-LABEL: {{^}}singlethread_acquire_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @singlethread_acquire_acquire( i32* %out, i32 %in, i32 %old) { @@ -210,9 +340,9 @@ entry: } ; GCN-LABEL: {{^}}singlethread_release_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @singlethread_release_acquire( i32* %out, i32 %in, i32 %old) { @@ -223,9 +353,9 @@ entry: } ; GCN-LABEL: {{^}}singlethread_acq_rel_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @singlethread_acq_rel_acquire( i32* %out, i32 %in, i32 %old) { @@ -236,9 +366,9 @@ entry: } ; GCN-LABEL: {{^}}singlethread_seq_cst_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @singlethread_seq_cst_acquire( i32* %out, i32 %in, i32 %old) { @@ -249,9 +379,9 @@ entry: } ; GCN-LABEL: {{^}}singlethread_seq_cst_seq_cst: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @singlethread_seq_cst_seq_cst( i32* %out, i32 %in, i32 %old) { @@ -261,11 +391,141 @@ entry: ret void } -; GCN-LABEL: {{^}}agent_monotonic_monotonic: +; GCN-LABEL: {{^}}singlethread_one_as_monotonic_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: 
flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @singlethread_one_as_monotonic_monotonic( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic monotonic + ret void +} + +; GCN-LABEL: {{^}}singlethread_one_as_acquire_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @singlethread_one_as_acquire_monotonic( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire monotonic + ret void +} + +; GCN-LABEL: {{^}}singlethread_one_as_release_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @singlethread_one_as_release_monotonic( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release monotonic + ret void +} + +; GCN-LABEL: {{^}}singlethread_one_as_acq_rel_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @singlethread_one_as_acq_rel_monotonic( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel monotonic + ret void +} + +; GCN-LABEL: {{^}}singlethread_one_as_seq_cst_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @singlethread_one_as_seq_cst_monotonic( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst monotonic + ret void +} + +; GCN-LABEL: {{^}}singlethread_one_as_acquire_acquire: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @singlethread_one_as_acquire_acquire( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire acquire + ret void +} + +; GCN-LABEL: {{^}}singlethread_one_as_release_acquire: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @singlethread_one_as_release_acquire( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = 
cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release acquire + ret void +} + +; GCN-LABEL: {{^}}singlethread_one_as_acq_rel_acquire: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @singlethread_one_as_acq_rel_acquire( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel acquire + ret void +} + +; GCN-LABEL: {{^}}singlethread_one_as_seq_cst_acquire: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @singlethread_one_as_seq_cst_acquire( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst acquire + ret void +} + +; GCN-LABEL: {{^}}singlethread_one_as_seq_cst_seq_cst: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @singlethread_one_as_seq_cst_seq_cst( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst seq_cst + ret void +} + +; GCN-LABEL: {{^}}agent_monotonic_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @agent_monotonic_monotonic( i32* %out, i32 %in, i32 %old) { entry: @@ -275,9 +535,9 @@ entry: } ; GCN-LABEL: {{^}}agent_acquire_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NEXT: buffer_wbinvl1_vol define amdgpu_kernel void @agent_acquire_monotonic( i32* %out, i32 %in, i32 %old) { @@ -288,9 +548,9 @@ entry: } ; GCN-LABEL: {{^}}agent_release_monotonic: -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @agent_release_monotonic( i32* %out, i32 %in, i32 %old) { @@ -301,9 +561,9 @@ entry: } ; GCN-LABEL: {{^}}agent_acq_rel_monotonic: -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NEXT: buffer_wbinvl1_vol define amdgpu_kernel void @agent_acq_rel_monotonic( i32* %out, i32 %in, i32 %old) { @@ -314,9 +574,9 @@ entry: } ; GCN-LABEL: {{^}}agent_seq_cst_monotonic: -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt 
vmcnt(0) lgkmcnt(0){{$}} ; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NEXT: buffer_wbinvl1_vol define amdgpu_kernel void @agent_seq_cst_monotonic( i32* %out, i32 %in, i32 %old) { @@ -327,9 +587,9 @@ entry: } ; GCN-LABEL: {{^}}agent_acquire_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NEXT: buffer_wbinvl1_vol define amdgpu_kernel void @agent_acquire_acquire( i32* %out, i32 %in, i32 %old) { @@ -340,9 +600,9 @@ entry: } ; GCN-LABEL: {{^}}agent_release_acquire: -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NEXT: buffer_wbinvl1_vol define amdgpu_kernel void @agent_release_acquire( i32* %out, i32 %in, i32 %old) { @@ -353,9 +613,9 @@ entry: } ; GCN-LABEL: {{^}}agent_acq_rel_acquire: -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NEXT: buffer_wbinvl1_vol define amdgpu_kernel void @agent_acq_rel_acquire( i32* %out, i32 %in, i32 %old) { @@ -366,9 +626,9 @@ entry: } ; GCN-LABEL: {{^}}agent_seq_cst_acquire: -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NEXT: buffer_wbinvl1_vol define amdgpu_kernel void @agent_seq_cst_acquire( i32* %out, i32 %in, i32 %old) { @@ -379,9 +639,9 @@ entry: } ; GCN-LABEL: {{^}}agent_seq_cst_seq_cst: -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NEXT: buffer_wbinvl1_vol define amdgpu_kernel void @agent_seq_cst_seq_cst( i32* %out, i32 %in, i32 %old) { @@ -391,11 +651,141 @@ entry: ret void } -; GCN-LABEL: {{^}}workgroup_monotonic_monotonic: +; GCN-LABEL: {{^}}agent_one_as_monotonic_monotonic: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @agent_one_as_monotonic_monotonic( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic monotonic + ret void +} + +; GCN-LABEL: {{^}}agent_one_as_acquire_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +define amdgpu_kernel void @agent_one_as_acquire_monotonic( + i32* %out, i32 %in, i32 %old) { 
+entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire monotonic + ret void +} + +; GCN-LABEL: {{^}}agent_one_as_release_monotonic: +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @agent_one_as_release_monotonic( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") release monotonic + ret void +} + +; GCN-LABEL: {{^}}agent_one_as_acq_rel_monotonic: +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +define amdgpu_kernel void @agent_one_as_acq_rel_monotonic( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel monotonic + ret void +} + +; GCN-LABEL: {{^}}agent_one_as_seq_cst_monotonic: +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +define amdgpu_kernel void @agent_one_as_seq_cst_monotonic( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst monotonic + ret void +} + +; GCN-LABEL: {{^}}agent_one_as_acquire_acquire: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +define amdgpu_kernel void @agent_one_as_acquire_acquire( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire acquire + ret void +} + +; GCN-LABEL: {{^}}agent_one_as_release_acquire: +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +define amdgpu_kernel void @agent_one_as_release_acquire( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") release acquire + ret void +} + +; GCN-LABEL: {{^}}agent_one_as_acq_rel_acquire: +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +define amdgpu_kernel void @agent_one_as_acq_rel_acquire( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel acquire + ret void +} + +; GCN-LABEL: {{^}}agent_one_as_seq_cst_acquire: +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +define 
amdgpu_kernel void @agent_one_as_seq_cst_acquire( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst acquire + ret void +} + +; GCN-LABEL: {{^}}agent_one_as_seq_cst_seq_cst: +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +define amdgpu_kernel void @agent_one_as_seq_cst_seq_cst( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst seq_cst + ret void +} + +; GCN-LABEL: {{^}}workgroup_monotonic_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @workgroup_monotonic_monotonic( i32* %out, i32 %in, i32 %old) { entry: @@ -405,9 +795,9 @@ entry: } ; GCN-LABEL: {{^}}workgroup_acquire_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @workgroup_acquire_monotonic( i32* %out, i32 %in, i32 %old) { @@ -418,9 +808,9 @@ entry: } ; GCN-LABEL: {{^}}workgroup_release_monotonic: -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @workgroup_release_monotonic( i32* %out, i32 %in, i32 %old) { @@ -431,9 +821,9 @@ entry: } ; GCN-LABEL: {{^}}workgroup_acq_rel_monotonic: -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @workgroup_acq_rel_monotonic( i32* %out, i32 %in, i32 %old) { @@ -444,9 +834,9 @@ entry: } ; GCN-LABEL: {{^}}workgroup_seq_cst_monotonic: -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @workgroup_seq_cst_monotonic( i32* %out, i32 %in, i32 %old) { @@ -457,9 +847,9 @@ entry: } ; GCN-LABEL: {{^}}workgroup_acquire_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @workgroup_acquire_acquire( i32* %out, i32 %in, i32 %old) { @@ -470,9 +860,9 @@ entry: } ; GCN-LABEL: {{^}}workgroup_release_acquire: -; GFX8-NOT: 
s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @workgroup_release_acquire( i32* %out, i32 %in, i32 %old) { @@ -483,9 +873,9 @@ entry: } ; GCN-LABEL: {{^}}workgroup_acq_rel_acquire: -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @workgroup_acq_rel_acquire( i32* %out, i32 %in, i32 %old) { @@ -496,9 +886,9 @@ entry: } ; GCN-LABEL: {{^}}workgroup_seq_cst_acquire: -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @workgroup_seq_cst_acquire( i32* %out, i32 %in, i32 %old) { @@ -509,9 +899,9 @@ entry: } ; GCN-LABEL: {{^}}workgroup_seq_cst_seq_cst: -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @workgroup_seq_cst_seq_cst( i32* %out, i32 %in, i32 %old) { @@ -521,11 +911,141 @@ entry: ret void } -; GCN-LABEL: {{^}}wavefront_monotonic_monotonic: +; GCN-LABEL: {{^}}workgroup_one_as_monotonic_monotonic: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @workgroup_one_as_monotonic_monotonic( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic monotonic + ret void +} + +; GCN-LABEL: {{^}}workgroup_one_as_acquire_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @workgroup_one_as_acquire_monotonic( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire monotonic + ret void +} + +; GCN-LABEL: {{^}}workgroup_one_as_release_monotonic: +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @workgroup_one_as_release_monotonic( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release monotonic + ret void +} + +; GCN-LABEL: {{^}}workgroup_one_as_acq_rel_monotonic: +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: 
flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @workgroup_one_as_acq_rel_monotonic( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel monotonic + ret void +} + +; GCN-LABEL: {{^}}workgroup_one_as_seq_cst_monotonic: +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @workgroup_one_as_seq_cst_monotonic( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst monotonic + ret void +} + +; GCN-LABEL: {{^}}workgroup_one_as_acquire_acquire: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @workgroup_one_as_acquire_acquire( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire acquire + ret void +} + +; GCN-LABEL: {{^}}workgroup_one_as_release_acquire: +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @workgroup_one_as_release_acquire( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release acquire + ret void +} + +; GCN-LABEL: {{^}}workgroup_one_as_acq_rel_acquire: +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @workgroup_one_as_acq_rel_acquire( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel acquire + ret void +} + +; GCN-LABEL: {{^}}workgroup_one_as_seq_cst_acquire: +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @workgroup_one_as_seq_cst_acquire( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst acquire + ret void +} + +; GCN-LABEL: {{^}}workgroup_one_as_seq_cst_seq_cst: +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @workgroup_one_as_seq_cst_seq_cst( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in 
syncscope("workgroup-one-as") seq_cst seq_cst + ret void +} + +; GCN-LABEL: {{^}}wavefront_monotonic_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @wavefront_monotonic_monotonic( i32* %out, i32 %in, i32 %old) { entry: @@ -535,9 +1055,9 @@ entry: } ; GCN-LABEL: {{^}}wavefront_acquire_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @wavefront_acquire_monotonic( i32* %out, i32 %in, i32 %old) { @@ -548,9 +1068,9 @@ entry: } ; GCN-LABEL: {{^}}wavefront_release_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @wavefront_release_monotonic( i32* %out, i32 %in, i32 %old) { @@ -561,9 +1081,9 @@ entry: } ; GCN-LABEL: {{^}}wavefront_acq_rel_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @wavefront_acq_rel_monotonic( i32* %out, i32 %in, i32 %old) { @@ -574,9 +1094,9 @@ entry: } ; GCN-LABEL: {{^}}wavefront_seq_cst_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @wavefront_seq_cst_monotonic( i32* %out, i32 %in, i32 %old) { @@ -587,9 +1107,9 @@ entry: } ; GCN-LABEL: {{^}}wavefront_acquire_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @wavefront_acquire_acquire( i32* %out, i32 %in, i32 %old) { @@ -600,9 +1120,9 @@ entry: } ; GCN-LABEL: {{^}}wavefront_release_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @wavefront_release_acquire( i32* %out, i32 %in, i32 %old) { @@ -613,9 +1133,9 @@ entry: } ; GCN-LABEL: {{^}}wavefront_acq_rel_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: 
buffer_wbinvl1_vol define amdgpu_kernel void @wavefront_acq_rel_acquire( i32* %out, i32 %in, i32 %old) { @@ -626,9 +1146,9 @@ entry: } ; GCN-LABEL: {{^}}wavefront_seq_cst_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @wavefront_seq_cst_acquire( i32* %out, i32 %in, i32 %old) { @@ -639,9 +1159,9 @@ entry: } ; GCN-LABEL: {{^}}wavefront_seq_cst_seq_cst: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @wavefront_seq_cst_seq_cst( i32* %out, i32 %in, i32 %old) { @@ -650,3 +1170,133 @@ entry: %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst seq_cst ret void } + +; GCN-LABEL: {{^}}wavefront_one_as_monotonic_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @wavefront_one_as_monotonic_monotonic( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic monotonic + ret void +} + +; GCN-LABEL: {{^}}wavefront_one_as_acquire_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @wavefront_one_as_acquire_monotonic( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire monotonic + ret void +} + +; GCN-LABEL: {{^}}wavefront_one_as_release_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @wavefront_one_as_release_monotonic( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release monotonic + ret void +} + +; GCN-LABEL: {{^}}wavefront_one_as_acq_rel_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @wavefront_one_as_acq_rel_monotonic( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel monotonic + ret void +} + +; GCN-LABEL: {{^}}wavefront_one_as_seq_cst_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol +define 
amdgpu_kernel void @wavefront_one_as_seq_cst_monotonic( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst monotonic + ret void +} + +; GCN-LABEL: {{^}}wavefront_one_as_acquire_acquire: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @wavefront_one_as_acquire_acquire( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire acquire + ret void +} + +; GCN-LABEL: {{^}}wavefront_one_as_release_acquire: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @wavefront_one_as_release_acquire( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release acquire + ret void +} + +; GCN-LABEL: {{^}}wavefront_one_as_acq_rel_acquire: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @wavefront_one_as_acq_rel_acquire( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel acquire + ret void +} + +; GCN-LABEL: {{^}}wavefront_one_as_seq_cst_acquire: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @wavefront_one_as_seq_cst_acquire( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst acquire + ret void +} + +; GCN-LABEL: {{^}}wavefront_one_as_seq_cst_seq_cst: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @wavefront_one_as_seq_cst_seq_cst( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst seq_cst + ret void +} diff --git a/test/CodeGen/AMDGPU/memory-legalizer-atomic-fence.ll b/test/CodeGen/AMDGPU/memory-legalizer-atomic-fence.ll index 02104826e6a1..12ed53fd21fa 100644 --- a/test/CodeGen/AMDGPU/memory-legalizer-atomic-fence.ll +++ b/test/CodeGen/AMDGPU/memory-legalizer-atomic-fence.ll @@ -2,7 +2,7 @@ ; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,GFX8,GFX68 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,GFX8,GFX68 %s -; FUNC-LABEL: {{^}}system_acquire: +; FUNC-LABEL: {{^}}system_one_as_acquire: ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE ; GFX6: 
s_waitcnt vmcnt(0){{$}} @@ -10,6 +10,232 @@ ; GFX8: s_waitcnt vmcnt(0){{$}} ; GFX8-NEXT: buffer_wbinvl1_vol{{$}} ; GCN: s_endpgm +define amdgpu_kernel void @system_one_as_acquire() { +entry: + fence syncscope("one-as") acquire + ret void +} + +; FUNC-LABEL: {{^}}system_one_as_release: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_endpgm +define amdgpu_kernel void @system_one_as_release() { +entry: + fence syncscope("one-as") release + ret void +} + +; FUNC-LABEL: {{^}}system_one_as_acq_rel: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_waitcnt vmcnt(0){{$}} +; GFX6: buffer_wbinvl1{{$}} +; GFX8: buffer_wbinvl1_vol{{$}} +; GCN: s_endpgm +define amdgpu_kernel void @system_one_as_acq_rel() { +entry: + fence syncscope("one-as") acq_rel + ret void +} + +; FUNC-LABEL: {{^}}system_one_as_seq_cst: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_waitcnt vmcnt(0){{$}} +; GFX6: buffer_wbinvl1{{$}} +; GFX8: buffer_wbinvl1_vol{{$}} +; GCN: s_endpgm +define amdgpu_kernel void @system_one_as_seq_cst() { +entry: + fence syncscope("one-as") seq_cst + ret void +} + +; FUNC-LABEL: {{^}}singlethread_one_as_acquire: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @singlethread_one_as_acquire() { +entry: + fence syncscope("singlethread-one-as") acquire + ret void +} + +; FUNC-LABEL: {{^}}singlethread_one_as_release: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @singlethread_one_as_release() { +entry: + fence syncscope("singlethread-one-as") release + ret void +} + +; FUNC-LABEL: {{^}}singlethread_one_as_acq_rel: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @singlethread_one_as_acq_rel() { +entry: + fence syncscope("singlethread-one-as") acq_rel + ret void +} + +; FUNC-LABEL: {{^}}singlethread_one_as_seq_cst: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @singlethread_one_as_seq_cst() { +entry: + fence syncscope("singlethread-one-as") seq_cst + ret void +} + +; FUNC-LABEL: {{^}}agent_one_as_acquire: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GFX6: s_waitcnt vmcnt(0){{$}} +; GFX6-NEXT: buffer_wbinvl1{{$}} +; GFX8: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol{{$}} +; GCN: s_endpgm +define amdgpu_kernel void @agent_one_as_acquire() { +entry: + fence syncscope("agent-one-as") acquire + ret void +} + +; FUNC-LABEL: {{^}}agent_one_as_release: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_endpgm +define amdgpu_kernel void @agent_one_as_release() { +entry: + fence syncscope("agent-one-as") release + ret void +} + +; FUNC-LABEL: {{^}}agent_one_as_acq_rel: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_waitcnt vmcnt(0){{$}} +; GFX6: buffer_wbinvl1{{$}} +; GFX8: buffer_wbinvl1_vol{{$}} +; GCN: s_endpgm +define amdgpu_kernel void @agent_one_as_acq_rel() { +entry: + fence syncscope("agent-one-as") acq_rel + ret void +} + +; FUNC-LABEL: {{^}}agent_one_as_seq_cst: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_waitcnt vmcnt(0){{$}} +; GFX6: buffer_wbinvl1{{$}} +; GFX8: buffer_wbinvl1_vol{{$}} +; GCN: s_endpgm +define amdgpu_kernel void @agent_one_as_seq_cst() { +entry: + fence syncscope("agent-one-as") seq_cst + ret void +} + +; FUNC-LABEL: {{^}}workgroup_one_as_acquire: +; GCN: %bb.0 +; GFX68-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @workgroup_one_as_acquire() { +entry: + fence syncscope("workgroup-one-as") acquire + ret void +} + +; 
FUNC-LABEL: {{^}}workgroup_one_as_release: +; GCN: %bb.0 +; GFX68-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @workgroup_one_as_release() { +entry: + fence syncscope("workgroup-one-as") release + ret void +} + +; FUNC-LABEL: {{^}}workgroup_one_as_acq_rel: +; GCN: %bb.0 +; GFX68-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @workgroup_one_as_acq_rel() { +entry: + fence syncscope("workgroup-one-as") acq_rel + ret void +} + +; FUNC-LABEL: {{^}}workgroup_one_as_seq_cst: +; GCN: %bb.0 +; GFX68-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @workgroup_one_as_seq_cst() { +entry: + fence syncscope("workgroup-one-as") seq_cst + ret void +} + +; FUNC-LABEL: {{^}}wavefront_one_as_acquire: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @wavefront_one_as_acquire() { +entry: + fence syncscope("wavefront-one-as") acquire + ret void +} + +; FUNC-LABEL: {{^}}wavefront_one_as_release: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @wavefront_one_as_release() { +entry: + fence syncscope("wavefront-one-as") release + ret void +} + +; FUNC-LABEL: {{^}}wavefront_one_as_acq_rel: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @wavefront_one_as_acq_rel() { +entry: + fence syncscope("wavefront-one-as") acq_rel + ret void +} + +; FUNC-LABEL: {{^}}wavefront_one_as_seq_cst: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @wavefront_one_as_seq_cst() { +entry: + fence syncscope("wavefront-one-as") seq_cst + ret void +} + +; FUNC-LABEL: {{^}}system_acquire: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GFX6: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX6-NEXT: buffer_wbinvl1{{$}} +; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol{{$}} +; GCN: s_endpgm define amdgpu_kernel void @system_acquire() { entry: fence acquire @@ -19,7 +245,7 @@ entry: ; FUNC-LABEL: {{^}}system_release: ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: s_endpgm define amdgpu_kernel void @system_release() { entry: @@ -30,7 +256,7 @@ entry: ; FUNC-LABEL: {{^}}system_acq_rel: ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX6: buffer_wbinvl1{{$}} ; GFX8: buffer_wbinvl1_vol{{$}} ; GCN: s_endpgm @@ -43,7 +269,7 @@ entry: ; FUNC-LABEL: {{^}}system_seq_cst: ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX6: buffer_wbinvl1{{$}} ; GFX8: buffer_wbinvl1_vol{{$}} ; GCN: s_endpgm @@ -96,9 +322,9 @@ entry: ; FUNC-LABEL: {{^}}agent_acquire: ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE -; GFX6: s_waitcnt vmcnt(0){{$}} +; GFX6: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX6-NEXT: buffer_wbinvl1{{$}} -; GFX8: s_waitcnt vmcnt(0){{$}} +; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NEXT: buffer_wbinvl1_vol{{$}} ; GCN: s_endpgm define amdgpu_kernel void @agent_acquire() { @@ -110,7 +336,7 @@ entry: ; FUNC-LABEL: {{^}}agent_release: ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: s_endpgm define amdgpu_kernel void @agent_release() { entry: @@ -121,7 +347,7 @@ entry: ; FUNC-LABEL: {{^}}agent_acq_rel: ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt vmcnt(0) 
lgkmcnt(0){{$}} ; GFX6: buffer_wbinvl1{{$}} ; GFX8: buffer_wbinvl1_vol{{$}} ; GCN: s_endpgm @@ -134,7 +360,7 @@ entry: ; FUNC-LABEL: {{^}}agent_seq_cst: ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX6: buffer_wbinvl1{{$}} ; GFX8: buffer_wbinvl1_vol{{$}} ; GCN: s_endpgm @@ -146,7 +372,7 @@ entry: ; FUNC-LABEL: {{^}}workgroup_acquire: ; GCN: %bb.0 -; GFX68-NOT: s_waitcnt vmcnt(0){{$}} +; GFX68-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: ATOMIC_FENCE ; GCN: s_endpgm define amdgpu_kernel void @workgroup_acquire() { @@ -157,7 +383,7 @@ entry: ; FUNC-LABEL: {{^}}workgroup_release: ; GCN: %bb.0 -; GFX68-NOT: s_waitcnt vmcnt(0){{$}} +; GFX68-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: ATOMIC_FENCE ; GCN: s_endpgm define amdgpu_kernel void @workgroup_release() { @@ -168,7 +394,7 @@ entry: ; FUNC-LABEL: {{^}}workgroup_acq_rel: ; GCN: %bb.0 -; GFX68-NOT: s_waitcnt vmcnt(0){{$}} +; GFX68-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: ATOMIC_FENCE ; GCN: s_endpgm define amdgpu_kernel void @workgroup_acq_rel() { @@ -179,7 +405,7 @@ entry: ; FUNC-LABEL: {{^}}workgroup_seq_cst: ; GCN: %bb.0 -; GFX68-NOT: s_waitcnt vmcnt(0){{$}} +; GFX68-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: ATOMIC_FENCE ; GCN: s_endpgm define amdgpu_kernel void @workgroup_seq_cst() { diff --git a/test/CodeGen/AMDGPU/memory-legalizer-atomic-insert-end.mir b/test/CodeGen/AMDGPU/memory-legalizer-atomic-insert-end.mir index 5582a4f93fd4..60ca3166356d 100644 --- a/test/CodeGen/AMDGPU/memory-legalizer-atomic-insert-end.mir +++ b/test/CodeGen/AMDGPU/memory-legalizer-atomic-insert-end.mir @@ -104,7 +104,7 @@ body: | S_WAITCNT 127 $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec S_WAITCNT 3952 - BUFFER_ATOMIC_SMAX_ADDR64 killed $vgpr0, killed $vgpr1_vgpr2, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 400, 0, implicit $exec :: (volatile load seq_cst 4 from %ir.gep) + BUFFER_ATOMIC_SMAX_ADDR64 killed $vgpr0, killed $vgpr1_vgpr2, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 400, 0, implicit $exec :: (volatile load syncscope("one-as") seq_cst 4 from %ir.gep) bb.2.exit: liveins: $sgpr2_sgpr3 diff --git a/test/CodeGen/AMDGPU/memory-legalizer-atomic-rmw.ll b/test/CodeGen/AMDGPU/memory-legalizer-atomic-rmw.ll index 7bf4b93ec843..ad1f0587036f 100644 --- a/test/CodeGen/AMDGPU/memory-legalizer-atomic-rmw.ll +++ b/test/CodeGen/AMDGPU/memory-legalizer-atomic-rmw.ll @@ -1,11 +1,311 @@ ; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s -; GCN-LABEL: {{^}}system_monotonic: +; GCN-LABEL: {{^}}system_one_as_monotonic: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @system_one_as_monotonic( + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") monotonic + ret void +} + +; GCN-LABEL: {{^}}system_one_as_acquire: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +define amdgpu_kernel void @system_one_as_acquire( + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") acquire + ret void +} + +; GCN-LABEL: 
{{^}}system_one_as_release: +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @system_one_as_release( + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") release + ret void +} + +; GCN-LABEL: {{^}}system_one_as_acq_rel: +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +define amdgpu_kernel void @system_one_as_acq_rel( + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") acq_rel + ret void +} + +; GCN-LABEL: {{^}}system_one_as_seq_cst: +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +define amdgpu_kernel void @system_one_as_seq_cst( + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") seq_cst + ret void +} + +; GCN-LABEL: {{^}}singlethread_one_as_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @singlethread_one_as_monotonic( + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") monotonic + ret void +} + +; GCN-LABEL: {{^}}singlethread_one_as_acquire: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @singlethread_one_as_acquire( + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") acquire + ret void +} + +; GCN-LABEL: {{^}}singlethread_one_as_release: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @singlethread_one_as_release( + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") release + ret void +} + +; GCN-LABEL: {{^}}singlethread_one_as_acq_rel: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @singlethread_one_as_acq_rel( + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") acq_rel + ret void +} + +; GCN-LABEL: {{^}}singlethread_one_as_seq_cst: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @singlethread_one_as_seq_cst( + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") seq_cst + ret void +} + +; GCN-LABEL: {{^}}agent_one_as_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @agent_one_as_monotonic( + i32* %out, i32 %in) { 
+entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") monotonic + ret void +} + +; GCN-LABEL: {{^}}agent_one_as_acquire: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +define amdgpu_kernel void @agent_one_as_acquire( + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") acquire + ret void +} + +; GCN-LABEL: {{^}}agent_one_as_release: +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @agent_one_as_release( + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") release + ret void +} + +; GCN-LABEL: {{^}}agent_one_as_acq_rel: +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +define amdgpu_kernel void @agent_one_as_acq_rel( + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") acq_rel + ret void +} + +; GCN-LABEL: {{^}}agent_one_as_seq_cst: +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +define amdgpu_kernel void @agent_one_as_seq_cst( + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") seq_cst + ret void +} + +; GCN-LABEL: {{^}}workgroup_one_as_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @workgroup_one_as_monotonic( + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") monotonic + ret void +} + +; GCN-LABEL: {{^}}workgroup_one_as_acquire: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @workgroup_one_as_acquire( + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") acquire + ret void +} + +; GCN-LABEL: {{^}}workgroup_one_as_release: +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @workgroup_one_as_release( + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") release + ret void +} + +; GCN-LABEL: {{^}}workgroup_one_as_acq_rel: +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @workgroup_one_as_acq_rel( + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") acq_rel + ret void +} + +; GCN-LABEL: {{^}}workgroup_one_as_seq_cst: +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: 
buffer_wbinvl1_vol +define amdgpu_kernel void @workgroup_one_as_seq_cst( + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") seq_cst + ret void +} + +; GCN-LABEL: {{^}}wavefront_one_as_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @wavefront_one_as_monotonic( + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") monotonic + ret void +} + +; GCN-LABEL: {{^}}wavefront_one_as_acquire: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @wavefront_one_as_acquire( + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") acquire + ret void +} + +; GCN-LABEL: {{^}}wavefront_one_as_release: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @wavefront_one_as_release( + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") release + ret void +} + +; GCN-LABEL: {{^}}wavefront_one_as_acq_rel: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @wavefront_one_as_acq_rel( + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") acq_rel + ret void +} + +; GCN-LABEL: {{^}}wavefront_one_as_seq_cst: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @wavefront_one_as_seq_cst( + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") seq_cst + ret void +} + +; GCN-LABEL: {{^}}system_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @system_monotonic( i32* %out, i32 %in) { entry: @@ -14,9 +314,9 @@ entry: } ; GCN-LABEL: {{^}}system_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NEXT: buffer_wbinvl1_vol define amdgpu_kernel void @system_acquire( i32* %out, i32 %in) { @@ -26,9 +326,9 @@ entry: } ; GCN-LABEL: {{^}}system_release: -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NEXT: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @system_release( i32* %out, i32 %in) { @@ -38,9 +338,9 @@ entry: } ; GCN-LABEL: {{^}}system_acq_rel: -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NEXT: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; 
GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NEXT: buffer_wbinvl1_vol define amdgpu_kernel void @system_acq_rel( i32* %out, i32 %in) { @@ -50,9 +350,9 @@ entry: } ; GCN-LABEL: {{^}}system_seq_cst: -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NEXT: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NEXT: buffer_wbinvl1_vol define amdgpu_kernel void @system_seq_cst( i32* %out, i32 %in) { @@ -62,9 +362,9 @@ entry: } ; GCN-LABEL: {{^}}singlethread_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @singlethread_monotonic( i32* %out, i32 %in) { @@ -74,9 +374,9 @@ entry: } ; GCN-LABEL: {{^}}singlethread_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @singlethread_acquire( i32* %out, i32 %in) { @@ -86,9 +386,9 @@ entry: } ; GCN-LABEL: {{^}}singlethread_release: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @singlethread_release( i32* %out, i32 %in) { @@ -98,9 +398,9 @@ entry: } ; GCN-LABEL: {{^}}singlethread_acq_rel: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @singlethread_acq_rel( i32* %out, i32 %in) { @@ -110,9 +410,9 @@ entry: } ; GCN-LABEL: {{^}}singlethread_seq_cst: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @singlethread_seq_cst( i32* %out, i32 %in) { @@ -122,9 +422,9 @@ entry: } ; GCN-LABEL: {{^}}agent_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @agent_monotonic( i32* %out, i32 %in) { @@ -134,9 +434,9 @@ entry: } ; GCN-LABEL: {{^}}agent_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NEXT: buffer_wbinvl1_vol define amdgpu_kernel void @agent_acquire( i32* %out, i32 %in) { @@ -146,9 +446,9 @@ entry: } ; GCN-LABEL: {{^}}agent_release: -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NEXT: flat_atomic_swap 
v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @agent_release( i32* %out, i32 %in) { @@ -158,9 +458,9 @@ entry: } ; GCN-LABEL: {{^}}agent_acq_rel: -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NEXT: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NEXT: buffer_wbinvl1_vol define amdgpu_kernel void @agent_acq_rel( i32* %out, i32 %in) { @@ -170,9 +470,9 @@ entry: } ; GCN-LABEL: {{^}}agent_seq_cst: -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NEXT: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NEXT: buffer_wbinvl1_vol define amdgpu_kernel void @agent_seq_cst( i32* %out, i32 %in) { @@ -182,9 +482,9 @@ entry: } ; GCN-LABEL: {{^}}workgroup_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @workgroup_monotonic( i32* %out, i32 %in) { @@ -194,9 +494,9 @@ entry: } ; GCN-LABEL: {{^}}workgroup_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @workgroup_acquire( i32* %out, i32 %in) { @@ -206,9 +506,9 @@ entry: } ; GCN-LABEL: {{^}}workgroup_release: -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @workgroup_release( i32* %out, i32 %in) { @@ -218,9 +518,9 @@ entry: } ; GCN-LABEL: {{^}}workgroup_acq_rel: -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @workgroup_acq_rel( i32* %out, i32 %in) { @@ -230,9 +530,9 @@ entry: } ; GCN-LABEL: {{^}}workgroup_seq_cst: -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @workgroup_seq_cst( i32* %out, i32 %in) { @@ -242,9 +542,9 @@ entry: } ; GCN-LABEL: {{^}}wavefront_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @wavefront_monotonic( i32* %out, i32 %in) { @@ -254,9 +554,9 @@ entry: } ; GCN-LABEL: {{^}}wavefront_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; 
GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @wavefront_acquire( i32* %out, i32 %in) { @@ -266,9 +566,9 @@ entry: } ; GCN-LABEL: {{^}}wavefront_release: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @wavefront_release( i32* %out, i32 %in) { @@ -278,9 +578,9 @@ entry: } ; GCN-LABEL: {{^}}wavefront_acq_rel: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @wavefront_acq_rel( i32* %out, i32 %in) { @@ -290,9 +590,9 @@ entry: } ; GCN-LABEL: {{^}}wavefront_seq_cst: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @wavefront_seq_cst( i32* %out, i32 %in) { diff --git a/test/CodeGen/AMDGPU/memory-legalizer-invalid-addrspace.mir b/test/CodeGen/AMDGPU/memory-legalizer-invalid-addrspace.mir index 44af0f65ab3f..de13190d9834 100644 --- a/test/CodeGen/AMDGPU/memory-legalizer-invalid-addrspace.mir +++ b/test/CodeGen/AMDGPU/memory-legalizer-invalid-addrspace.mir @@ -11,7 +11,7 @@ body: | $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr2_sgpr3 $vgpr1 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $sgpr2_sgpr3, implicit $exec - renamable $vgpr2 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load seq_cst 4 from `i32 addrspace(42)* undef`) + renamable $vgpr2 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load syncscope("one-as") seq_cst 4 from `i32 addrspace(42)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -30,7 +30,7 @@ body: | $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile store syncscope("agent") seq_cst 4 into `i32 addrspace(42)* undef`) + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile store syncscope("agent-one-as") seq_cst 4 into `i32 addrspace(42)* undef`) S_ENDPGM 0 ... 
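; Aside -- a minimal IR-level sketch, not part of the patch: the "-one-as"
; suffix seen in the machine memory operands above composes an existing
; sync scope with the restriction that it only orders accesses to the
; address space the instruction itself touches, e.g. "agent-one-as"
; synchronizes only same-address-space operations across an agent. The
; kernel below is hypothetical (the name @one_as_sketch and its arguments
; are invented for illustration) and shows the source-level form that such
; memory operands lower from:

define amdgpu_kernel void @one_as_sketch(i32* %p, i32 %v) {
entry:
  %old = atomicrmw volatile xchg i32* %p, i32 %v syncscope("agent-one-as") acq_rel
  ret void
}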
@@ -47,7 +47,7 @@ body: | $vgpr0 = V_MOV_B32_e32 killed $sgpr4, implicit $exec, implicit $exec $vgpr1 = V_MOV_B32_e32 killed $sgpr5, implicit $exec, implicit $exec $vgpr2 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_ATOMIC_CMPSWAP killed renamable $vgpr2_vgpr3, killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store syncscope("workgroup") seq_cst seq_cst 4 on `i32 addrspace(42)* undef`) + FLAT_ATOMIC_CMPSWAP killed renamable $vgpr2_vgpr3, killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store syncscope("workgroup-one-as") seq_cst seq_cst 4 on `i32 addrspace(42)* undef`) S_ENDPGM 0 ... @@ -63,7 +63,7 @@ body: | $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr2_sgpr3 $vgpr1 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $sgpr2_sgpr3, implicit $exec $vgpr2 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - FLAT_ATOMIC_SWAP killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store syncscope("wavefront") seq_cst 4 on `i32 addrspace(42)* undef`) + FLAT_ATOMIC_SWAP killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store syncscope("wavefront-one-as") seq_cst 4 on `i32 addrspace(42)* undef`) S_ENDPGM 0 ... diff --git a/test/CodeGen/AMDGPU/memory-legalizer-load.ll b/test/CodeGen/AMDGPU/memory-legalizer-load.ll index 179cb3f625d9..9f7a8ef53e60 100644 --- a/test/CodeGen/AMDGPU/memory-legalizer-load.ll +++ b/test/CodeGen/AMDGPU/memory-legalizer-load.ll @@ -5,282 +5,282 @@ declare i32 @llvm.amdgcn.workitem.id.x() -; GCN-LABEL: {{^}}system_unordered: +; GCN-LABEL: {{^}}system_one_as_unordered: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GFX89-NOT: buffer_wbinvl1_vol ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @system_unordered( +define amdgpu_kernel void @system_one_as_unordered( i32* %in, i32* %out) { entry: - %val = load atomic i32, i32* %in unordered, align 4 + %val = load atomic i32, i32* %in syncscope("one-as") unordered, align 4 store i32 %val, i32* %out ret void } -; GCN-LABEL: {{^}}system_monotonic: +; GCN-LABEL: {{^}}system_one_as_monotonic: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GFX89: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GFX89-NOT: buffer_wbinvl1_vol ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @system_monotonic( +define amdgpu_kernel void @system_one_as_monotonic( i32* %in, i32* %out) { entry: - %val = load atomic i32, i32* %in monotonic, align 4 + %val = load atomic i32, i32* %in syncscope("one-as") monotonic, align 4 store i32 %val, i32* %out ret void } -; GCN-LABEL: {{^}}system_acquire: +; GCN-LABEL: {{^}}system_one_as_acquire: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} ; GCN-NEXT: s_waitcnt vmcnt(0){{$}} ; GFX89-NEXT: buffer_wbinvl1_vol ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @system_acquire( +define amdgpu_kernel void @system_one_as_acquire( i32* %in, i32* %out) { entry: - %val = load atomic i32, i32* %in acquire, align 4 + %val = load atomic i32, i32* %in syncscope("one-as") acquire, align 4 store i32 %val, i32* %out ret void } -; 
GCN-LABEL: {{^}}system_seq_cst: +; GCN-LABEL: {{^}}system_one_as_seq_cst: ; GCN: s_waitcnt vmcnt(0){{$}} ; GCN-NEXT: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} ; GCN-NEXT: s_waitcnt vmcnt(0){{$}} ; GFX89-NEXT: buffer_wbinvl1_vol ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @system_seq_cst( +define amdgpu_kernel void @system_one_as_seq_cst( i32* %in, i32* %out) { entry: - %val = load atomic i32, i32* %in seq_cst, align 4 + %val = load atomic i32, i32* %in syncscope("one-as") seq_cst, align 4 store i32 %val, i32* %out ret void } -; GCN-LABEL: {{^}}singlethread_unordered: +; GCN-LABEL: {{^}}singlethread_one_as_unordered: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GFX89-NOT: buffer_wbinvl1_vol ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @singlethread_unordered( +define amdgpu_kernel void @singlethread_one_as_unordered( i32* %in, i32* %out) { entry: - %val = load atomic i32, i32* %in syncscope("singlethread") unordered, align 4 + %val = load atomic i32, i32* %in syncscope("singlethread-one-as") unordered, align 4 store i32 %val, i32* %out ret void } -; GCN-LABEL: {{^}}singlethread_monotonic: +; GCN-LABEL: {{^}}singlethread_one_as_monotonic: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GFX89-NOT: buffer_wbinvl1_vol ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @singlethread_monotonic( +define amdgpu_kernel void @singlethread_one_as_monotonic( i32* %in, i32* %out) { entry: - %val = load atomic i32, i32* %in syncscope("singlethread") monotonic, align 4 + %val = load atomic i32, i32* %in syncscope("singlethread-one-as") monotonic, align 4 store i32 %val, i32* %out ret void } -; GCN-LABEL: {{^}}singlethread_acquire: +; GCN-LABEL: {{^}}singlethread_one_as_acquire: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GFX89-NOT: buffer_wbinvl1_vol ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @singlethread_acquire( +define amdgpu_kernel void @singlethread_one_as_acquire( i32* %in, i32* %out) { entry: - %val = load atomic i32, i32* %in syncscope("singlethread") acquire, align 4 + %val = load atomic i32, i32* %in syncscope("singlethread-one-as") acquire, align 4 store i32 %val, i32* %out ret void } -; GCN-LABEL: {{^}}singlethread_seq_cst: +; GCN-LABEL: {{^}}singlethread_one_as_seq_cst: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GFX89-NOT: buffer_wbinvl1_vol ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @singlethread_seq_cst( +define amdgpu_kernel void @singlethread_one_as_seq_cst( i32* %in, i32* %out) { entry: - %val = load atomic i32, i32* %in syncscope("singlethread") seq_cst, align 4 + %val = load atomic i32, i32* %in syncscope("singlethread-one-as") seq_cst, align 4 store i32 %val, i32* %out ret void } -; GCN-LABEL: {{^}}agent_unordered: +; GCN-LABEL: {{^}}agent_one_as_unordered: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GFX89-NOT: buffer_wbinvl1_vol ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define 
amdgpu_kernel void @agent_unordered( +define amdgpu_kernel void @agent_one_as_unordered( i32* %in, i32* %out) { entry: - %val = load atomic i32, i32* %in syncscope("agent") unordered, align 4 + %val = load atomic i32, i32* %in syncscope("agent-one-as") unordered, align 4 store i32 %val, i32* %out ret void } -; GCN-LABEL: {{^}}agent_monotonic: +; GCN-LABEL: {{^}}agent_one_as_monotonic: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GFX89: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GFX89-NOT: buffer_wbinvl1_vol ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @agent_monotonic( +define amdgpu_kernel void @agent_one_as_monotonic( i32* %in, i32* %out) { entry: - %val = load atomic i32, i32* %in syncscope("agent") monotonic, align 4 + %val = load atomic i32, i32* %in syncscope("agent-one-as") monotonic, align 4 store i32 %val, i32* %out ret void } -; GCN-LABEL: {{^}}agent_acquire: +; GCN-LABEL: {{^}}agent_one_as_acquire: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} ; GCN-NEXT: s_waitcnt vmcnt(0){{$}} ; GFX89-NEXT: buffer_wbinvl1_vol ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @agent_acquire( +define amdgpu_kernel void @agent_one_as_acquire( i32* %in, i32* %out) { entry: - %val = load atomic i32, i32* %in syncscope("agent") acquire, align 4 + %val = load atomic i32, i32* %in syncscope("agent-one-as") acquire, align 4 store i32 %val, i32* %out ret void } -; GCN-LABEL: {{^}}agent_seq_cst: +; GCN-LABEL: {{^}}agent_one_as_seq_cst: ; GCN: s_waitcnt vmcnt(0){{$}} ; GCN-NEXT: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} ; GCN-NEXT: s_waitcnt vmcnt(0){{$}} ; GFX89-NEXT: buffer_wbinvl1_vol ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @agent_seq_cst( +define amdgpu_kernel void @agent_one_as_seq_cst( i32* %in, i32* %out) { entry: - %val = load atomic i32, i32* %in syncscope("agent") seq_cst, align 4 + %val = load atomic i32, i32* %in syncscope("agent-one-as") seq_cst, align 4 store i32 %val, i32* %out ret void } -; GCN-LABEL: {{^}}workgroup_unordered: +; GCN-LABEL: {{^}}workgroup_one_as_unordered: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GFX89-NOT: buffer_wbinvl1_vol ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @workgroup_unordered( +define amdgpu_kernel void @workgroup_one_as_unordered( i32* %in, i32* %out) { entry: - %val = load atomic i32, i32* %in syncscope("workgroup") unordered, align 4 + %val = load atomic i32, i32* %in syncscope("workgroup-one-as") unordered, align 4 store i32 %val, i32* %out ret void } -; GCN-LABEL: {{^}}workgroup_monotonic: +; GCN-LABEL: {{^}}workgroup_one_as_monotonic: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GFX89: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GFX89-NOT: buffer_wbinvl1_vol ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @workgroup_monotonic( +define amdgpu_kernel void @workgroup_one_as_monotonic( i32* %in, i32* %out) { entry: - %val = load atomic i32, i32* %in syncscope("workgroup") monotonic, align 4 + %val = load atomic i32, i32* %in syncscope("workgroup-one-as") monotonic, align 4 store i32 %val, i32* %out ret void } -; GCN-LABEL: {{^}}workgroup_acquire: +; GCN-LABEL: 
{{^}}workgroup_one_as_acquire: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GFX89: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} ; GFX89-NOT: s_waitcnt vmcnt(0){{$}} ; GFX89-NOT: buffer_wbinvl1_vol ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @workgroup_acquire( +define amdgpu_kernel void @workgroup_one_as_acquire( i32* %in, i32* %out) { entry: - %val = load atomic i32, i32* %in syncscope("workgroup") acquire, align 4 + %val = load atomic i32, i32* %in syncscope("workgroup-one-as") acquire, align 4 store i32 %val, i32* %out ret void } -; GCN-LABEL: {{^}}workgroup_seq_cst: +; GCN-LABEL: {{^}}workgroup_one_as_seq_cst: ; GFX89-NOT: s_waitcnt vmcnt(0){{$}} ; GFX89: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} ; GFX89-NOT: s_waitcnt vmcnt(0){{$}} ; GFX89-NOT: buffer_wbinvl1_vol ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @workgroup_seq_cst( +define amdgpu_kernel void @workgroup_one_as_seq_cst( i32* %in, i32* %out) { entry: - %val = load atomic i32, i32* %in syncscope("workgroup") seq_cst, align 4 + %val = load atomic i32, i32* %in syncscope("workgroup-one-as") seq_cst, align 4 store i32 %val, i32* %out ret void } -; GCN-LABEL: {{^}}wavefront_unordered: +; GCN-LABEL: {{^}}wavefront_one_as_unordered: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GFX89-NOT: buffer_wbinvl1_vol ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @wavefront_unordered( +define amdgpu_kernel void @wavefront_one_as_unordered( i32* %in, i32* %out) { entry: - %val = load atomic i32, i32* %in syncscope("wavefront") unordered, align 4 + %val = load atomic i32, i32* %in syncscope("wavefront-one-as") unordered, align 4 store i32 %val, i32* %out ret void } -; GCN-LABEL: {{^}}wavefront_monotonic: +; GCN-LABEL: {{^}}wavefront_one_as_monotonic: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GFX89-NOT: buffer_wbinvl1_vol ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @wavefront_monotonic( +define amdgpu_kernel void @wavefront_one_as_monotonic( i32* %in, i32* %out) { entry: - %val = load atomic i32, i32* %in syncscope("wavefront") monotonic, align 4 + %val = load atomic i32, i32* %in syncscope("wavefront-one-as") monotonic, align 4 store i32 %val, i32* %out ret void } -; GCN-LABEL: {{^}}wavefront_acquire: +; GCN-LABEL: {{^}}wavefront_one_as_acquire: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GFX89-NOT: buffer_wbinvl1_vol ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @wavefront_acquire( +define amdgpu_kernel void @wavefront_one_as_acquire( i32* %in, i32* %out) { entry: - %val = load atomic i32, i32* %in syncscope("wavefront") acquire, align 4 + %val = load atomic i32, i32* %in syncscope("wavefront-one-as") acquire, align 4 store i32 %val, i32* %out ret void } -; GCN-LABEL: {{^}}wavefront_seq_cst: +; GCN-LABEL: {{^}}wavefront_one_as_seq_cst: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GFX89-NOT: buffer_wbinvl1_vol ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @wavefront_seq_cst( +define amdgpu_kernel void 
@wavefront_one_as_seq_cst( i32* %in, i32* %out) { entry: - %val = load atomic i32, i32* %in syncscope("wavefront") seq_cst, align 4 + %val = load atomic i32, i32* %in syncscope("wavefront-one-as") seq_cst, align 4 store i32 %val, i32* %out ret void } @@ -374,4 +374,284 @@ entry: ret void } +; GCN-LABEL: {{^}}system_unordered: +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX89-NOT: buffer_wbinvl1_vol +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +define amdgpu_kernel void @system_unordered( + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in unordered, align 4 + store i32 %val, i32* %out + ret void +} + +; GCN-LABEL: {{^}}system_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX89: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX89-NOT: buffer_wbinvl1_vol +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +define amdgpu_kernel void @system_monotonic( + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in monotonic, align 4 + store i32 %val, i32* %out + ret void +} + +; GCN-LABEL: {{^}}system_acquire: +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX89-NEXT: buffer_wbinvl1_vol +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +define amdgpu_kernel void @system_acquire( + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in acquire, align 4 + store i32 %val, i32* %out + ret void +} + +; GCN-LABEL: {{^}}system_seq_cst: +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NEXT: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX89-NEXT: buffer_wbinvl1_vol +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +define amdgpu_kernel void @system_seq_cst( + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in seq_cst, align 4 + store i32 %val, i32* %out + ret void +} + +; GCN-LABEL: {{^}}singlethread_unordered: +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX89-NOT: buffer_wbinvl1_vol +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +define amdgpu_kernel void @singlethread_unordered( + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("singlethread") unordered, align 4 + store i32 %val, i32* %out + ret void +} + +; GCN-LABEL: {{^}}singlethread_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX89-NOT: buffer_wbinvl1_vol +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +define amdgpu_kernel void @singlethread_monotonic( + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("singlethread") monotonic, align 4 + store i32 %val, i32* %out + ret void +} + +; GCN-LABEL: {{^}}singlethread_acquire: +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX89-NOT: buffer_wbinvl1_vol +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +define amdgpu_kernel void @singlethread_acquire( + i32* %in, i32* %out) { +entry: + %val 
= load atomic i32, i32* %in syncscope("singlethread") acquire, align 4 + store i32 %val, i32* %out + ret void +} + +; GCN-LABEL: {{^}}singlethread_seq_cst: +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX89-NOT: buffer_wbinvl1_vol +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +define amdgpu_kernel void @singlethread_seq_cst( + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("singlethread") seq_cst, align 4 + store i32 %val, i32* %out + ret void +} + +; GCN-LABEL: {{^}}agent_unordered: +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX89-NOT: buffer_wbinvl1_vol +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +define amdgpu_kernel void @agent_unordered( + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("agent") unordered, align 4 + store i32 %val, i32* %out + ret void +} + +; GCN-LABEL: {{^}}agent_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX89: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX89-NOT: buffer_wbinvl1_vol +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +define amdgpu_kernel void @agent_monotonic( + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("agent") monotonic, align 4 + store i32 %val, i32* %out + ret void +} + +; GCN-LABEL: {{^}}agent_acquire: +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX89-NEXT: buffer_wbinvl1_vol +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +define amdgpu_kernel void @agent_acquire( + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("agent") acquire, align 4 + store i32 %val, i32* %out + ret void +} + +; GCN-LABEL: {{^}}agent_seq_cst: +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NEXT: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX89-NEXT: buffer_wbinvl1_vol +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +define amdgpu_kernel void @agent_seq_cst( + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("agent") seq_cst, align 4 + store i32 %val, i32* %out + ret void +} + +; GCN-LABEL: {{^}}workgroup_unordered: +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX89-NOT: buffer_wbinvl1_vol +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +define amdgpu_kernel void @workgroup_unordered( + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("workgroup") unordered, align 4 + store i32 %val, i32* %out + ret void +} + +; GCN-LABEL: {{^}}workgroup_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX89: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX89-NOT: buffer_wbinvl1_vol +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +define amdgpu_kernel void @workgroup_monotonic( + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("workgroup") monotonic, align 4 + store i32 %val, i32* %out + ret void +} + +; 
GCN-LABEL: {{^}}workgroup_acquire: +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX89: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; GFX89: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX89-NOT: buffer_wbinvl1_vol +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +define amdgpu_kernel void @workgroup_acquire( + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("workgroup") acquire, align 4 + store i32 %val, i32* %out + ret void +} + +; GCN-LABEL: {{^}}workgroup_seq_cst: +; GFX89-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX89: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; GFX89: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX89-NOT: buffer_wbinvl1_vol +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +define amdgpu_kernel void @workgroup_seq_cst( + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("workgroup") seq_cst, align 4 + store i32 %val, i32* %out + ret void +} + +; GCN-LABEL: {{^}}wavefront_unordered: +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX89-NOT: buffer_wbinvl1_vol +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +define amdgpu_kernel void @wavefront_unordered( + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("wavefront") unordered, align 4 + store i32 %val, i32* %out + ret void +} + +; GCN-LABEL: {{^}}wavefront_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX89-NOT: buffer_wbinvl1_vol +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +define amdgpu_kernel void @wavefront_monotonic( + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("wavefront") monotonic, align 4 + store i32 %val, i32* %out + ret void +} + +; GCN-LABEL: {{^}}wavefront_acquire: +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX89-NOT: buffer_wbinvl1_vol +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +define amdgpu_kernel void @wavefront_acquire( + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("wavefront") acquire, align 4 + store i32 %val, i32* %out + ret void +} + +; GCN-LABEL: {{^}}wavefront_seq_cst: +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX89-NOT: buffer_wbinvl1_vol +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +define amdgpu_kernel void @wavefront_seq_cst( + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("wavefront") seq_cst, align 4 + store i32 %val, i32* %out + ret void +} + !0 = !{i32 1} diff --git a/test/CodeGen/AMDGPU/memory-legalizer-local.mir b/test/CodeGen/AMDGPU/memory-legalizer-local.mir index ffaab97174fd..721c2844e91f 100644 --- a/test/CodeGen/AMDGPU/memory-legalizer-local.mir +++ b/test/CodeGen/AMDGPU/memory-legalizer-local.mir @@ -17,7 +17,7 @@ body: | $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit 
$m0, implicit $exec :: (volatile load syncscope("singlethread") unordered 4 from `i32 addrspace(3)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") unordered 4 from `i32 addrspace(3)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -41,7 +41,7 @@ body: | $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread") monotonic 4 from `i32 addrspace(3)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") monotonic 4 from `i32 addrspace(3)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -65,7 +65,7 @@ body: | $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread") acquire 4 from `i32 addrspace(3)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") acquire 4 from `i32 addrspace(3)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -89,7 +89,7 @@ body: | $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread") seq_cst 4 from `i32 addrspace(3)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") seq_cst 4 from `i32 addrspace(3)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -113,7 +113,7 
@@ body: | $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront") unordered 4 from `i32 addrspace(3)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") unordered 4 from `i32 addrspace(3)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -137,7 +137,7 @@ body: | $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront") monotonic 4 from `i32 addrspace(3)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") monotonic 4 from `i32 addrspace(3)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -161,7 +161,7 @@ body: | $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront") acquire 4 from `i32 addrspace(3)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") acquire 4 from `i32 addrspace(3)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -185,7 +185,7 @@ body: | $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront") seq_cst 4 from `i32 addrspace(3)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") seq_cst 4 from `i32 addrspace(3)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, 
implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -209,7 +209,7 @@ body: | $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup") unordered 4 from `i32 addrspace(3)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") unordered 4 from `i32 addrspace(3)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -233,7 +233,7 @@ body: | $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup") monotonic 4 from `i32 addrspace(3)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") monotonic 4 from `i32 addrspace(3)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -257,7 +257,7 @@ body: | $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup") acquire 4 from `i32 addrspace(3)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") acquire 4 from `i32 addrspace(3)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -281,7 +281,7 @@ body: | $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: 
(volatile load syncscope("workgroup") seq_cst 4 from `i32 addrspace(3)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") seq_cst 4 from `i32 addrspace(3)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -305,7 +305,7 @@ body: | $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("agent") unordered 4 from `i32 addrspace(3)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") unordered 4 from `i32 addrspace(3)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -329,7 +329,7 @@ body: | $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("agent") monotonic 4 from `i32 addrspace(3)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") monotonic 4 from `i32 addrspace(3)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -353,7 +353,7 @@ body: | $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("agent") acquire 4 from `i32 addrspace(3)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") acquire 4 from `i32 addrspace(3)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -377,7 +377,7 @@ body: | $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 
44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("agent") seq_cst 4 from `i32 addrspace(3)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") seq_cst 4 from `i32 addrspace(3)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -401,7 +401,7 @@ body: | $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load unordered 4 from `i32 addrspace(3)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") unordered 4 from `i32 addrspace(3)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -425,7 +425,7 @@ body: | $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load monotonic 4 from `i32 addrspace(3)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") monotonic 4 from `i32 addrspace(3)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -449,7 +449,7 @@ body: | $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load acquire 4 from `i32 addrspace(3)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") acquire 4 from `i32 addrspace(3)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed 
renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -473,7 +473,7 @@ body: | $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load seq_cst 4 from `i32 addrspace(3)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") seq_cst 4 from `i32 addrspace(3)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -498,7 +498,7 @@ body: | $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") unordered 4 into `i32 addrspace(3)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") unordered 4 into `i32 addrspace(3)* undef`) S_ENDPGM 0 ... @@ -520,7 +520,7 @@ body: | $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") monotonic 4 into `i32 addrspace(3)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") monotonic 4 into `i32 addrspace(3)* undef`) S_ENDPGM 0 ... @@ -542,7 +542,7 @@ body: | $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") release 4 into `i32 addrspace(3)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") release 4 into `i32 addrspace(3)* undef`) S_ENDPGM 0 ... @@ -564,7 +564,7 @@ body: | $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") seq_cst 4 into `i32 addrspace(3)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") seq_cst 4 into `i32 addrspace(3)* undef`) S_ENDPGM 0 ... 
@@ -586,7 +586,7 @@ body: | $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront") unordered 4 into `i32 addrspace(3)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") unordered 4 into `i32 addrspace(3)* undef`) S_ENDPGM 0 ... @@ -608,7 +608,7 @@ body: | $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront") monotonic 4 into `i32 addrspace(3)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") monotonic 4 into `i32 addrspace(3)* undef`) S_ENDPGM 0 ... @@ -630,7 +630,7 @@ body: | $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront") release 4 into `i32 addrspace(3)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") release 4 into `i32 addrspace(3)* undef`) S_ENDPGM 0 ... @@ -652,7 +652,7 @@ body: | $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront") seq_cst 4 into `i32 addrspace(3)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") seq_cst 4 into `i32 addrspace(3)* undef`) S_ENDPGM 0 ... @@ -674,7 +674,7 @@ body: | $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup") unordered 4 into `i32 addrspace(3)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") unordered 4 into `i32 addrspace(3)* undef`) S_ENDPGM 0 ... @@ -696,7 +696,7 @@ body: | $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup") monotonic 4 into `i32 addrspace(3)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") monotonic 4 into `i32 addrspace(3)* undef`) S_ENDPGM 0 ... 
@@ -718,7 +718,7 @@ body: | $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup") release 4 into `i32 addrspace(3)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") release 4 into `i32 addrspace(3)* undef`) S_ENDPGM 0 ... @@ -740,7 +740,7 @@ body: | $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup") seq_cst 4 into `i32 addrspace(3)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") seq_cst 4 into `i32 addrspace(3)* undef`) S_ENDPGM 0 ... @@ -762,7 +762,7 @@ body: | $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("agent") unordered 4 into `i32 addrspace(3)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") unordered 4 into `i32 addrspace(3)* undef`) S_ENDPGM 0 ... @@ -784,7 +784,7 @@ body: | $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("agent") monotonic 4 into `i32 addrspace(3)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") monotonic 4 into `i32 addrspace(3)* undef`) S_ENDPGM 0 ... @@ -806,7 +806,7 @@ body: | $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("agent") release 4 into `i32 addrspace(3)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") release 4 into `i32 addrspace(3)* undef`) S_ENDPGM 0 ... @@ -828,7 +828,7 @@ body: | $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("agent") seq_cst 4 into `i32 addrspace(3)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") seq_cst 4 into `i32 addrspace(3)* undef`) S_ENDPGM 0 ... 
@@ -850,7 +850,7 @@ body: | $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store unordered 4 into `i32 addrspace(3)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("one-as") unordered 4 into `i32 addrspace(3)* undef`) S_ENDPGM 0 ... @@ -872,7 +872,7 @@ body: | $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store monotonic 4 into `i32 addrspace(3)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("one-as") monotonic 4 into `i32 addrspace(3)* undef`) S_ENDPGM 0 ... @@ -894,7 +894,7 @@ body: | $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store release 4 into `i32 addrspace(3)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("one-as") release 4 into `i32 addrspace(3)* undef`) S_ENDPGM 0 ... @@ -916,7 +916,7 @@ body: | $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store seq_cst 4 into `i32 addrspace(3)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("one-as") seq_cst 4 into `i32 addrspace(3)* undef`) S_ENDPGM 0 ... @@ -938,7 +938,7 @@ body: | $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") unordered 4 into `i32 addrspace(3)* undef`) + $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") unordered 4 into `i32 addrspace(3)* undef`) S_ENDPGM 0 ... @@ -960,7 +960,7 @@ body: | $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") monotonic 4 into `i32 addrspace(3)* undef`) + $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") monotonic 4 into `i32 addrspace(3)* undef`) S_ENDPGM 0 ... 
@@ -982,7 +982,7 @@ body: | $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") acquire 4 into `i32 addrspace(3)* undef`) + $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") acquire 4 into `i32 addrspace(3)* undef`) S_ENDPGM 0 ... @@ -1004,7 +1004,7 @@ body: | $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") release 4 into `i32 addrspace(3)* undef`) + $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") release 4 into `i32 addrspace(3)* undef`) S_ENDPGM 0 ... @@ -1026,7 +1026,7 @@ body: | $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") acq_rel 4 into `i32 addrspace(3)* undef`) + $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") acq_rel 4 into `i32 addrspace(3)* undef`) S_ENDPGM 0 ... @@ -1048,7 +1048,7 @@ body: | $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") seq_cst 4 into `i32 addrspace(3)* undef`) + $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") seq_cst 4 into `i32 addrspace(3)* undef`) S_ENDPGM 0 ... 
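The fence tests in the next file exercise the new "-one-as" (one address space) variants of the AMDGPU synchronization scopes alongside the original scopes. As a minimal sketch of IR that drives this path (illustrative only, not part of the patch: the kernel name @one_as_fence_example and its argument are invented for this note, and the comments assume the AMDGPUUsage description that a "-one-as" scope only orders memory operations on the address space of the associated access):

; Illustrative sketch, not from the patch. A release fence at the
; "agent-one-as" scope orders the prior store for threads on the same
; agent, but only within one address space, which is why the memory
; legalizer can emit cheaper waitcnt/invalidate sequences than for the
; plain "agent" scope.
define amdgpu_kernel void @one_as_fence_example(i32 addrspace(1)* %p) {
entry:
  store i32 1, i32 addrspace(1)* %p
  fence syncscope("agent-one-as") release
  ret void
}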
diff --git a/test/CodeGen/AMDGPU/memory-legalizer-mesa3d.ll b/test/CodeGen/AMDGPU/memory-legalizer-mesa3d.ll
index 44b4d19dbb1f..fc598a66849a 100644
--- a/test/CodeGen/AMDGPU/memory-legalizer-mesa3d.ll
+++ b/test/CodeGen/AMDGPU/memory-legalizer-mesa3d.ll
@@ -3,10 +3,228 @@
 ; RUN: llc -mtriple=amdgcn--mesa3d -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN %s
 ; RUN: llc -mtriple=amdgcn--mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN %s
 
-; FUNC-LABEL: {{^}}system_acquire:
+; FUNC-LABEL: {{^}}system_one_as_acquire:
+; GCN: %bb.0
+; GCN-NOT: ATOMIC_FENCE
+; GCN: s_waitcnt vmcnt(0){{$}}
+; GCN-NEXT: buffer_wbinvl1{{$}}
+; GCN: s_endpgm
+define amdgpu_kernel void @system_one_as_acquire() {
+entry:
+  fence syncscope("one-as") acquire
+  ret void
+}
+
+; FUNC-LABEL: {{^}}system_one_as_release:
+; GCN: %bb.0
+; GCN-NOT: ATOMIC_FENCE
+; GCN: s_waitcnt vmcnt(0){{$}}
+; GCN: s_endpgm
+define amdgpu_kernel void @system_one_as_release() {
+entry:
+  fence syncscope("one-as") release
+  ret void
+}
+
+; FUNC-LABEL: {{^}}system_one_as_acq_rel:
+; GCN: %bb.0
+; GCN-NOT: ATOMIC_FENCE
+; GCN: s_waitcnt vmcnt(0){{$}}
+; GCN: buffer_wbinvl1{{$}}
+; GCN: s_endpgm
+define amdgpu_kernel void @system_one_as_acq_rel() {
+entry:
+  fence syncscope("one-as") acq_rel
+  ret void
+}
+
+; FUNC-LABEL: {{^}}system_one_as_seq_cst:
+; GCN: %bb.0
+; GCN-NOT: ATOMIC_FENCE
+; GCN: s_waitcnt vmcnt(0){{$}}
+; GCN: buffer_wbinvl1{{$}}
+; GCN: s_endpgm
+define amdgpu_kernel void @system_one_as_seq_cst() {
+entry:
+  fence syncscope("one-as") seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}singlethread_one_as_acquire:
+; GCN: %bb.0
+; GCN-NOT: ATOMIC_FENCE
+; GCN: s_endpgm
+define amdgpu_kernel void @singlethread_one_as_acquire() {
+entry:
+  fence syncscope("singlethread-one-as") acquire
+  ret void
+}
+
+; FUNC-LABEL: {{^}}singlethread_one_as_release:
+; GCN: %bb.0
+; GCN-NOT: ATOMIC_FENCE
+; GCN: s_endpgm
+define amdgpu_kernel void @singlethread_one_as_release() {
+entry:
+  fence syncscope("singlethread-one-as") release
+  ret void
+}
+
+; FUNC-LABEL: {{^}}singlethread_one_as_acq_rel:
+; GCN: %bb.0
+; GCN-NOT: ATOMIC_FENCE
+; GCN: s_endpgm
+define amdgpu_kernel void @singlethread_one_as_acq_rel() {
+entry:
+  fence syncscope("singlethread-one-as") acq_rel
+  ret void
+}
+
+; FUNC-LABEL: {{^}}singlethread_one_as_seq_cst:
+; GCN: %bb.0
+; GCN-NOT: ATOMIC_FENCE
+; GCN: s_endpgm
+define amdgpu_kernel void @singlethread_one_as_seq_cst() {
+entry:
+  fence syncscope("singlethread-one-as") seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}agent_one_as_acquire:
+; GCN: %bb.0
+; GCN-NOT: ATOMIC_FENCE
+; GCN: s_waitcnt vmcnt(0){{$}}
+; GCN-NEXT: buffer_wbinvl1{{$}}
+; GCN: s_endpgm
+define amdgpu_kernel void @agent_one_as_acquire() {
+entry:
+  fence syncscope("agent-one-as") acquire
+  ret void
+}
+
+; FUNC-LABEL: {{^}}agent_one_as_release:
+; GCN: %bb.0
+; GCN-NOT: ATOMIC_FENCE
+; GCN: s_waitcnt vmcnt(0){{$}}
+; GCN: s_endpgm
+define amdgpu_kernel void @agent_one_as_release() {
+entry:
+  fence syncscope("agent-one-as") release
+  ret void
+}
+
+; FUNC-LABEL: {{^}}agent_one_as_acq_rel:
+; GCN: %bb.0
+; GCN-NOT: ATOMIC_FENCE
+; GCN: s_waitcnt vmcnt(0){{$}}
+; GCN: buffer_wbinvl1{{$}}
+; GCN: s_endpgm
+define amdgpu_kernel void @agent_one_as_acq_rel() {
+entry:
+  fence syncscope("agent-one-as") acq_rel
+  ret void
+}
+
+; FUNC-LABEL: {{^}}agent_one_as_seq_cst:
 ; GCN: %bb.0
 ; GCN-NOT: ATOMIC_FENCE
 ; GCN: s_waitcnt vmcnt(0){{$}}
+; GCN: buffer_wbinvl1{{$}}
+; GCN: s_endpgm
+define amdgpu_kernel void @agent_one_as_seq_cst() {
+entry:
+  fence syncscope("agent-one-as") seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}workgroup_one_as_acquire:
+; GCN: %bb.0
+; GCN-NOT: s_waitcnt vmcnt(0){{$}}
+; GCN-NOT: ATOMIC_FENCE
+; GCN: s_endpgm
+define amdgpu_kernel void @workgroup_one_as_acquire() {
+entry:
+  fence syncscope("workgroup-one-as") acquire
+  ret void
+}
+
+; FUNC-LABEL: {{^}}workgroup_one_as_release:
+; GCN: %bb.0
+; GCN-NOT: s_waitcnt vmcnt(0){{$}}
+; GCN-NOT: ATOMIC_FENCE
+; GCN: s_endpgm
+define amdgpu_kernel void @workgroup_one_as_release() {
+entry:
+  fence syncscope("workgroup-one-as") release
+  ret void
+}
+
+; FUNC-LABEL: {{^}}workgroup_one_as_acq_rel:
+; GCN: %bb.0
+; GCN-NOT: s_waitcnt vmcnt(0){{$}}
+; GCN-NOT: ATOMIC_FENCE
+; GCN: s_endpgm
+define amdgpu_kernel void @workgroup_one_as_acq_rel() {
+entry:
+  fence syncscope("workgroup-one-as") acq_rel
+  ret void
+}
+
+; FUNC-LABEL: {{^}}workgroup_one_as_seq_cst:
+; GCN: %bb.0
+; GCN-NOT: s_waitcnt vmcnt(0){{$}}
+; GCN-NOT: ATOMIC_FENCE
+; GCN: s_endpgm
+define amdgpu_kernel void @workgroup_one_as_seq_cst() {
+entry:
+  fence syncscope("workgroup-one-as") seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}wavefront_one_as_acquire:
+; GCN: %bb.0
+; GCN-NOT: ATOMIC_FENCE
+; GCN: s_endpgm
+define amdgpu_kernel void @wavefront_one_as_acquire() {
+entry:
+  fence syncscope("wavefront-one-as") acquire
+  ret void
+}
+
+; FUNC-LABEL: {{^}}wavefront_one_as_release:
+; GCN: %bb.0
+; GCN-NOT: ATOMIC_FENCE
+; GCN: s_endpgm
+define amdgpu_kernel void @wavefront_one_as_release() {
+entry:
+  fence syncscope("wavefront-one-as") release
+  ret void
+}
+
+; FUNC-LABEL: {{^}}wavefront_one_as_acq_rel:
+; GCN: %bb.0
+; GCN-NOT: ATOMIC_FENCE
+; GCN: s_endpgm
+define amdgpu_kernel void @wavefront_one_as_acq_rel() {
+entry:
+  fence syncscope("wavefront-one-as") acq_rel
+  ret void
+}
+
+; FUNC-LABEL: {{^}}wavefront_one_as_seq_cst:
+; GCN: %bb.0
+; GCN-NOT: ATOMIC_FENCE
+; GCN: s_endpgm
+define amdgpu_kernel void @wavefront_one_as_seq_cst() {
+entry:
+  fence syncscope("wavefront-one-as") seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}system_acquire:
+; GCN: %bb.0
+; GCN-NOT: ATOMIC_FENCE
+; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NEXT: buffer_wbinvl1{{$}}
 ; GCN: s_endpgm
 define amdgpu_kernel void @system_acquire() {
@@ -18,7 +236,7 @@ entry:
 ; FUNC-LABEL: {{^}}system_release:
 ; GCN: %bb.0
 ; GCN-NOT: ATOMIC_FENCE
-; GCN: s_waitcnt vmcnt(0){{$}}
+; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN: s_endpgm
 define amdgpu_kernel void @system_release() {
 entry:
@@ -29,7 +247,7 @@ entry:
 ; FUNC-LABEL: {{^}}system_acq_rel:
 ; GCN: %bb.0
 ; GCN-NOT: ATOMIC_FENCE
-; GCN: s_waitcnt vmcnt(0){{$}}
+; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN: buffer_wbinvl1{{$}}
 ; GCN: s_endpgm
 define amdgpu_kernel void @system_acq_rel() {
@@ -41,7 +259,7 @@ entry:
 ; FUNC-LABEL: {{^}}system_seq_cst:
 ; GCN: %bb.0
 ; GCN-NOT: ATOMIC_FENCE
-; GCN: s_waitcnt vmcnt(0){{$}}
+; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN: buffer_wbinvl1{{$}}
 ; GCN: s_endpgm
 define amdgpu_kernel void @system_seq_cst() {
@@ -93,7 +311,7 @@ entry:
 ; FUNC-LABEL: {{^}}agent_acquire:
 ; GCN: %bb.0
 ; GCN-NOT: ATOMIC_FENCE
-; GCN: s_waitcnt vmcnt(0){{$}}
+; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NEXT: buffer_wbinvl1{{$}}
 ; GCN: s_endpgm
 define amdgpu_kernel void @agent_acquire() {
@@ -105,7 +323,7 @@ entry:
 ; FUNC-LABEL: {{^}}agent_release:
 ; GCN: %bb.0
 ; GCN-NOT: ATOMIC_FENCE
-; GCN: s_waitcnt vmcnt(0){{$}}
+; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN: s_endpgm
 define amdgpu_kernel void @agent_release() {
 entry:
@@ -116,7 +334,7 @@ entry:
 ; FUNC-LABEL: {{^}}agent_acq_rel:
 ; GCN: %bb.0
 ; GCN-NOT: ATOMIC_FENCE
-; GCN: s_waitcnt vmcnt(0){{$}}
+; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN: buffer_wbinvl1{{$}}
 ; GCN: s_endpgm
 define amdgpu_kernel void @agent_acq_rel() {
@@ -128,7 +346,7 @@ entry:
 ; FUNC-LABEL: {{^}}agent_seq_cst:
 ; GCN: %bb.0
 ; GCN-NOT: ATOMIC_FENCE
-; GCN: s_waitcnt vmcnt(0){{$}}
+; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN: buffer_wbinvl1{{$}}
 ; GCN: s_endpgm
 define amdgpu_kernel void @agent_seq_cst() {
@@ -139,7 +357,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}workgroup_acquire:
 ; GCN: %bb.0
-; GCN-NOT: s_waitcnt vmcnt(0){{$}}
+; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NOT: ATOMIC_FENCE
 ; GCN: s_endpgm
 define amdgpu_kernel void @workgroup_acquire() {
@@ -150,7 +368,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}workgroup_release:
 ; GCN: %bb.0
-; GCN-NOT: s_waitcnt vmcnt(0){{$}}
+; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NOT: ATOMIC_FENCE
 ; GCN: s_endpgm
 define amdgpu_kernel void @workgroup_release() {
@@ -161,7 +379,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}workgroup_acq_rel:
 ; GCN: %bb.0
-; GCN-NOT: s_waitcnt vmcnt(0){{$}}
+; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NOT: ATOMIC_FENCE
 ; GCN: s_endpgm
 define amdgpu_kernel void @workgroup_acq_rel() {
@@ -172,7 +390,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}workgroup_seq_cst:
 ; GCN: %bb.0
-; GCN-NOT: s_waitcnt vmcnt(0){{$}}
+; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NOT: ATOMIC_FENCE
 ; GCN: s_endpgm
 define amdgpu_kernel void @workgroup_seq_cst() {
diff --git a/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-atomics.mir b/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-atomics.mir
index 5d19c6a38cd2..c2b84d12188b 100644
--- a/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-atomics.mir
+++ b/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-atomics.mir
@@ -55,7 +55,7 @@ body: |
     S_WAITCNT 127
     $sgpr0 = S_LSHL_B32 killed $sgpr0, 2, implicit-def dead $scc
     $vgpr0 = V_ADD_I32_e32 killed $sgpr0, killed $vgpr0, implicit-def dead $vcc, implicit $exec
-    $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, killed $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, implicit $exec :: (load syncscope("agent") unordered 4 from `i32 addrspace(1)* undef`), (load syncscope("workgroup") seq_cst 4 from `[8192 x i32] addrspace(5)* undef`)
+    $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, killed $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, implicit $exec :: (load syncscope("agent-one-as") unordered 4 from `i32 addrspace(1)* undef`), (load syncscope("workgroup-one-as") seq_cst 4 from `[8192 x i32] addrspace(5)* undef`)
     $vgpr1 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $sgpr4_sgpr5
     $vgpr2 = V_MOV_B32_e32 killed $sgpr5, implicit $exec, implicit $sgpr4_sgpr5, implicit $exec
     S_WAITCNT 3952
diff --git a/test/CodeGen/AMDGPU/memory-legalizer-region.mir b/test/CodeGen/AMDGPU/memory-legalizer-region.mir
index 2033a99804a4..91a12c8dd147 100644
--- a/test/CodeGen/AMDGPU/memory-legalizer-region.mir
+++ b/test/CodeGen/AMDGPU/memory-legalizer-region.mir
@@ -17,7 +17,7 @@ body: |
     $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
    $m0 = S_MOV_B32 -1
     $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
-    renamable $vgpr2 =
DS_READ_B32 killed renamable $vgpr0, 1, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") unordered 4 from `i32 addrspace(2)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -41,7 +41,7 @@ body: | $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread") monotonic 4 from `i32 addrspace(2)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") monotonic 4 from `i32 addrspace(2)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -65,7 +65,7 @@ body: | $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread") acquire 4 from `i32 addrspace(2)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") acquire 4 from `i32 addrspace(2)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -89,7 +89,7 @@ body: | $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread") seq_cst 4 from `i32 addrspace(2)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") seq_cst 4 from `i32 addrspace(2)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -113,7 +113,7 @@ body: | $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 
addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront") unordered 4 from `i32 addrspace(2)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") unordered 4 from `i32 addrspace(2)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -137,7 +137,7 @@ body: | $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront") monotonic 4 from `i32 addrspace(2)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") monotonic 4 from `i32 addrspace(2)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -161,7 +161,7 @@ body: | $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront") acquire 4 from `i32 addrspace(2)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") acquire 4 from `i32 addrspace(2)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -185,7 +185,7 @@ body: | $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront") seq_cst 4 from `i32 addrspace(2)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") seq_cst 4 from `i32 addrspace(2)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit 
$sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -209,7 +209,7 @@ body: | $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup") unordered 4 from `i32 addrspace(2)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") unordered 4 from `i32 addrspace(2)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -233,7 +233,7 @@ body: | $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup") monotonic 4 from `i32 addrspace(2)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") monotonic 4 from `i32 addrspace(2)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -257,7 +257,7 @@ body: | $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup") acquire 4 from `i32 addrspace(2)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") acquire 4 from `i32 addrspace(2)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -281,7 +281,7 @@ body: | $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup") seq_cst 4 from `i32 addrspace(2)* undef`) + renamable $vgpr2 = DS_READ_B32 
killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") seq_cst 4 from `i32 addrspace(2)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -305,7 +305,7 @@ body: | $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("agent") unordered 4 from `i32 addrspace(2)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") unordered 4 from `i32 addrspace(2)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -329,7 +329,7 @@ body: | $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("agent") monotonic 4 from `i32 addrspace(2)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") monotonic 4 from `i32 addrspace(2)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -353,7 +353,7 @@ body: | $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("agent") acquire 4 from `i32 addrspace(2)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") acquire 4 from `i32 addrspace(2)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -377,7 +377,7 @@ body: | $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 
-1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("agent") seq_cst 4 from `i32 addrspace(2)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") seq_cst 4 from `i32 addrspace(2)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -401,7 +401,7 @@ body: | $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load unordered 4 from `i32 addrspace(2)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") unordered 4 from `i32 addrspace(2)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -425,7 +425,7 @@ body: | $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load monotonic 4 from `i32 addrspace(2)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") monotonic 4 from `i32 addrspace(2)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -449,7 +449,7 @@ body: | $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load acquire 4 from `i32 addrspace(2)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") acquire 4 from `i32 addrspace(2)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* 
undef`) @@ -473,7 +473,7 @@ body: | $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load seq_cst 4 from `i32 addrspace(2)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") seq_cst 4 from `i32 addrspace(2)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -498,7 +498,7 @@ body: | $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") unordered 4 into `i32 addrspace(2)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") unordered 4 into `i32 addrspace(2)* undef`) S_ENDPGM 0 ... @@ -520,7 +520,7 @@ body: | $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") monotonic 4 into `i32 addrspace(2)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") monotonic 4 into `i32 addrspace(2)* undef`) S_ENDPGM 0 ... @@ -542,7 +542,7 @@ body: | $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") release 4 into `i32 addrspace(2)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") release 4 into `i32 addrspace(2)* undef`) S_ENDPGM 0 ... @@ -564,7 +564,7 @@ body: | $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") seq_cst 4 into `i32 addrspace(2)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") seq_cst 4 into `i32 addrspace(2)* undef`) S_ENDPGM 0 ... 
@@ -586,7 +586,7 @@ body: | $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront") unordered 4 into `i32 addrspace(2)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") unordered 4 into `i32 addrspace(2)* undef`) S_ENDPGM 0 ... @@ -608,7 +608,7 @@ body: | $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront") monotonic 4 into `i32 addrspace(2)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") monotonic 4 into `i32 addrspace(2)* undef`) S_ENDPGM 0 ... @@ -630,7 +630,7 @@ body: | $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront") release 4 into `i32 addrspace(2)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") release 4 into `i32 addrspace(2)* undef`) S_ENDPGM 0 ... @@ -652,7 +652,7 @@ body: | $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront") seq_cst 4 into `i32 addrspace(2)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") seq_cst 4 into `i32 addrspace(2)* undef`) S_ENDPGM 0 ... @@ -674,7 +674,7 @@ body: | $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup") unordered 4 into `i32 addrspace(2)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") unordered 4 into `i32 addrspace(2)* undef`) S_ENDPGM 0 ... @@ -696,7 +696,7 @@ body: | $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup") monotonic 4 into `i32 addrspace(2)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") monotonic 4 into `i32 addrspace(2)* undef`) S_ENDPGM 0 ... 
@@ -718,7 +718,7 @@ body: | $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup") release 4 into `i32 addrspace(2)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") release 4 into `i32 addrspace(2)* undef`) S_ENDPGM 0 ... @@ -740,7 +740,7 @@ body: | $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup") seq_cst 4 into `i32 addrspace(2)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") seq_cst 4 into `i32 addrspace(2)* undef`) S_ENDPGM 0 ... @@ -762,7 +762,7 @@ body: | $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("agent") unordered 4 into `i32 addrspace(2)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") unordered 4 into `i32 addrspace(2)* undef`) S_ENDPGM 0 ... --- @@ -783,7 +783,7 @@ body: | $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("agent") monotonic 4 into `i32 addrspace(2)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") monotonic 4 into `i32 addrspace(2)* undef`) S_ENDPGM 0 ... @@ -805,7 +805,7 @@ body: | $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("agent") release 4 into `i32 addrspace(2)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") release 4 into `i32 addrspace(2)* undef`) S_ENDPGM 0 ... @@ -827,7 +827,7 @@ body: | $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("agent") seq_cst 4 into `i32 addrspace(2)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") seq_cst 4 into `i32 addrspace(2)* undef`) S_ENDPGM 0 ... 
@@ -893,7 +893,7 @@ body: | $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store release 4 into `i32 addrspace(2)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("one-as") release 4 into `i32 addrspace(2)* undef`) S_ENDPGM 0 ... @@ -915,7 +915,7 @@ body: | $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store seq_cst 4 into `i32 addrspace(2)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("one-as") seq_cst 4 into `i32 addrspace(2)* undef`) S_ENDPGM 0 ... @@ -937,7 +937,7 @@ body: | $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") unordered 4 into `i32 addrspace(2)* undef`) + $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") unordered 4 into `i32 addrspace(2)* undef`) S_ENDPGM 0 ... @@ -959,7 +959,7 @@ body: | $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") monotonic 4 into `i32 addrspace(2)* undef`) + $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") monotonic 4 into `i32 addrspace(2)* undef`) S_ENDPGM 0 ... @@ -981,7 +981,7 @@ body: | $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") acquire 4 into `i32 addrspace(2)* undef`) + $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") acquire 4 into `i32 addrspace(2)* undef`) S_ENDPGM 0 ... @@ -1003,7 +1003,7 @@ body: | $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") release 4 into `i32 addrspace(2)* undef`) + $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") release 4 into `i32 addrspace(2)* undef`) S_ENDPGM 0 ... 
@@ -1025,7 +1025,7 @@ body: | $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") acq_rel 4 into `i32 addrspace(2)* undef`) + $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") acq_rel 4 into `i32 addrspace(2)* undef`) S_ENDPGM 0 ... @@ -1047,7 +1047,7 @@ body: | $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") seq_cst 4 into `i32 addrspace(2)* undef`) + $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") seq_cst 4 into `i32 addrspace(2)* undef`) S_ENDPGM 0 ... diff --git a/test/CodeGen/AMDGPU/memory-legalizer-store.ll b/test/CodeGen/AMDGPU/memory-legalizer-store.ll index 87c43949df6a..2501a606a134 100644 --- a/test/CodeGen/AMDGPU/memory-legalizer-store.ll +++ b/test/CodeGen/AMDGPU/memory-legalizer-store.ll @@ -5,203 +5,203 @@ declare i32 @llvm.amdgcn.workitem.id.x() -; GCN-LABEL: {{^}}system_unordered: +; GCN-LABEL: {{^}}system_one_as_unordered: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -define amdgpu_kernel void @system_unordered( +define amdgpu_kernel void @system_one_as_unordered( i32 %in, i32* %out) { entry: - store atomic i32 %in, i32* %out unordered, align 4 + store atomic i32 %in, i32* %out syncscope("one-as") unordered, align 4 ret void } -; GCN-LABEL: {{^}}system_monotonic: +; GCN-LABEL: {{^}}system_one_as_monotonic: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -define amdgpu_kernel void @system_monotonic( +define amdgpu_kernel void @system_one_as_monotonic( i32 %in, i32* %out) { entry: - store atomic i32 %in, i32* %out monotonic, align 4 + store atomic i32 %in, i32* %out syncscope("one-as") monotonic, align 4 ret void } -; GCN-LABEL: {{^}}system_release: +; GCN-LABEL: {{^}}system_one_as_release: ; GCN: s_waitcnt vmcnt(0){{$}} ; GCN-NEXT: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -define amdgpu_kernel void @system_release( +define amdgpu_kernel void @system_one_as_release( i32 %in, i32* %out) { entry: - store atomic i32 %in, i32* %out release, align 4 + store atomic i32 %in, i32* %out syncscope("one-as") release, align 4 ret void } -; GCN-LABEL: {{^}}system_seq_cst: +; GCN-LABEL: {{^}}system_one_as_seq_cst: ; GCN: s_waitcnt vmcnt(0){{$}} ; GCN-NEXT: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -define amdgpu_kernel void @system_seq_cst( +define amdgpu_kernel void @system_one_as_seq_cst( i32 %in, i32* %out) { entry: - store atomic i32 %in, i32* %out seq_cst, align 4 + store atomic i32 %in, i32* %out syncscope("one-as") seq_cst, align 4 ret void } -; GCN-LABEL: {{^}}singlethread_unordered: +; GCN-LABEL: {{^}}singlethread_one_as_unordered: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -define amdgpu_kernel void @singlethread_unordered( 
+define amdgpu_kernel void @singlethread_one_as_unordered( i32 %in, i32* %out) { entry: - store atomic i32 %in, i32* %out syncscope("singlethread") unordered, align 4 + store atomic i32 %in, i32* %out syncscope("singlethread-one-as") unordered, align 4 ret void } -; GCN-LABEL: {{^}}singlethread_monotonic: +; GCN-LABEL: {{^}}singlethread_one_as_monotonic: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -define amdgpu_kernel void @singlethread_monotonic( +define amdgpu_kernel void @singlethread_one_as_monotonic( i32 %in, i32* %out) { entry: - store atomic i32 %in, i32* %out syncscope("singlethread") monotonic, align 4 + store atomic i32 %in, i32* %out syncscope("singlethread-one-as") monotonic, align 4 ret void } -; GCN-LABEL: {{^}}singlethread_release: +; GCN-LABEL: {{^}}singlethread_one_as_release: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -define amdgpu_kernel void @singlethread_release( +define amdgpu_kernel void @singlethread_one_as_release( i32 %in, i32* %out) { entry: - store atomic i32 %in, i32* %out syncscope("singlethread") release, align 4 + store atomic i32 %in, i32* %out syncscope("singlethread-one-as") release, align 4 ret void } -; GCN-LABEL: {{^}}singlethread_seq_cst: +; GCN-LABEL: {{^}}singlethread_one_as_seq_cst: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -define amdgpu_kernel void @singlethread_seq_cst( +define amdgpu_kernel void @singlethread_one_as_seq_cst( i32 %in, i32* %out) { entry: - store atomic i32 %in, i32* %out syncscope("singlethread") seq_cst, align 4 + store atomic i32 %in, i32* %out syncscope("singlethread-one-as") seq_cst, align 4 ret void } -; GCN-LABEL: {{^}}agent_unordered: +; GCN-LABEL: {{^}}agent_one_as_unordered: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -define amdgpu_kernel void @agent_unordered( +define amdgpu_kernel void @agent_one_as_unordered( i32 %in, i32* %out) { entry: - store atomic i32 %in, i32* %out syncscope("agent") unordered, align 4 + store atomic i32 %in, i32* %out syncscope("agent-one-as") unordered, align 4 ret void } -; GCN-LABEL: {{^}}agent_monotonic: +; GCN-LABEL: {{^}}agent_one_as_monotonic: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -define amdgpu_kernel void @agent_monotonic( +define amdgpu_kernel void @agent_one_as_monotonic( i32 %in, i32* %out) { entry: - store atomic i32 %in, i32* %out syncscope("agent") monotonic, align 4 + store atomic i32 %in, i32* %out syncscope("agent-one-as") monotonic, align 4 ret void } -; GCN-LABEL: {{^}}agent_release: +; GCN-LABEL: {{^}}agent_one_as_release: ; GCN: s_waitcnt vmcnt(0){{$}} ; GCN-NEXT: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -define amdgpu_kernel void @agent_release( +define amdgpu_kernel void @agent_one_as_release( i32 %in, i32* %out) { entry: - store atomic i32 %in, i32* %out syncscope("agent") release, align 4 + store atomic i32 %in, i32* %out syncscope("agent-one-as") release, align 4 ret void } -; GCN-LABEL: {{^}}agent_seq_cst: +; GCN-LABEL: {{^}}agent_one_as_seq_cst: ; GCN: s_waitcnt vmcnt(0){{$}} ; GCN-NEXT: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -define amdgpu_kernel void @agent_seq_cst( +define amdgpu_kernel void @agent_one_as_seq_cst( i32 %in, i32* %out) { entry: - store atomic i32 %in, i32* %out syncscope("agent") seq_cst, align 4 + store atomic 
i32 %in, i32* %out syncscope("agent-one-as") seq_cst, align 4 ret void } -; GCN-LABEL: {{^}}workgroup_unordered: +; GCN-LABEL: {{^}}workgroup_one_as_unordered: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -define amdgpu_kernel void @workgroup_unordered( +define amdgpu_kernel void @workgroup_one_as_unordered( i32 %in, i32* %out) { entry: - store atomic i32 %in, i32* %out syncscope("workgroup") unordered, align 4 + store atomic i32 %in, i32* %out syncscope("workgroup-one-as") unordered, align 4 ret void } -; GCN-LABEL: {{^}}workgroup_monotonic: +; GCN-LABEL: {{^}}workgroup_one_as_monotonic: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -define amdgpu_kernel void @workgroup_monotonic( +define amdgpu_kernel void @workgroup_one_as_monotonic( i32 %in, i32* %out) { entry: - store atomic i32 %in, i32* %out syncscope("workgroup") monotonic, align 4 + store atomic i32 %in, i32* %out syncscope("workgroup-one-as") monotonic, align 4 ret void } -; GCN-LABEL: {{^}}workgroup_release: +; GCN-LABEL: {{^}}workgroup_one_as_release: ; GFX89-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -define amdgpu_kernel void @workgroup_release( +define amdgpu_kernel void @workgroup_one_as_release( i32 %in, i32* %out) { entry: - store atomic i32 %in, i32* %out syncscope("workgroup") release, align 4 + store atomic i32 %in, i32* %out syncscope("workgroup-one-as") release, align 4 ret void } -; GCN-LABEL: {{^}}workgroup_seq_cst: +; GCN-LABEL: {{^}}workgroup_one_as_seq_cst: ; GFX89-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -define amdgpu_kernel void @workgroup_seq_cst( +define amdgpu_kernel void @workgroup_one_as_seq_cst( i32 %in, i32* %out) { entry: - store atomic i32 %in, i32* %out syncscope("workgroup") seq_cst, align 4 + store atomic i32 %in, i32* %out syncscope("workgroup-one-as") seq_cst, align 4 ret void } -; GCN-LABEL: {{^}}wavefront_unordered: +; GCN-LABEL: {{^}}wavefront_one_as_unordered: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -define amdgpu_kernel void @wavefront_unordered( +define amdgpu_kernel void @wavefront_one_as_unordered( i32 %in, i32* %out) { entry: - store atomic i32 %in, i32* %out syncscope("wavefront") unordered, align 4 + store atomic i32 %in, i32* %out syncscope("wavefront-one-as") unordered, align 4 ret void } -; GCN-LABEL: {{^}}wavefront_monotonic: +; GCN-LABEL: {{^}}wavefront_one_as_monotonic: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -define amdgpu_kernel void @wavefront_monotonic( +define amdgpu_kernel void @wavefront_one_as_monotonic( i32 %in, i32* %out) { entry: - store atomic i32 %in, i32* %out syncscope("wavefront") monotonic, align 4 + store atomic i32 %in, i32* %out syncscope("wavefront-one-as") monotonic, align 4 ret void } -; GCN-LABEL: {{^}}wavefront_release: +; GCN-LABEL: {{^}}wavefront_one_as_release: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -define amdgpu_kernel void @wavefront_release( +define amdgpu_kernel void @wavefront_one_as_release( i32 %in, i32* %out) { entry: - store atomic i32 %in, i32* %out syncscope("wavefront") release, align 4 + store atomic i32 %in, i32* %out syncscope("wavefront-one-as") release, align 4 ret void } -; GCN-LABEL: {{^}}wavefront_seq_cst: +; GCN-LABEL: 
{{^}}wavefront_one_as_seq_cst: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -define amdgpu_kernel void @wavefront_seq_cst( +define amdgpu_kernel void @wavefront_one_as_seq_cst( i32 %in, i32* %out) { entry: - store atomic i32 %in, i32* %out syncscope("wavefront") seq_cst, align 4 + store atomic i32 %in, i32* %out syncscope("wavefront-one-as") seq_cst, align 4 ret void } @@ -295,4 +295,204 @@ entry: ret void } +; GCN-LABEL: {{^}}system_unordered: +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +define amdgpu_kernel void @system_unordered( + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out unordered, align 4 + ret void +} + +; GCN-LABEL: {{^}}system_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +define amdgpu_kernel void @system_monotonic( + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out monotonic, align 4 + ret void +} + +; GCN-LABEL: {{^}}system_release: +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NEXT: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +define amdgpu_kernel void @system_release( + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out release, align 4 + ret void +} + +; GCN-LABEL: {{^}}system_seq_cst: +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NEXT: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +define amdgpu_kernel void @system_seq_cst( + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out seq_cst, align 4 + ret void +} + +; GCN-LABEL: {{^}}singlethread_unordered: +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +define amdgpu_kernel void @singlethread_unordered( + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("singlethread") unordered, align 4 + ret void +} + +; GCN-LABEL: {{^}}singlethread_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +define amdgpu_kernel void @singlethread_monotonic( + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("singlethread") monotonic, align 4 + ret void +} + +; GCN-LABEL: {{^}}singlethread_release: +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +define amdgpu_kernel void @singlethread_release( + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("singlethread") release, align 4 + ret void +} + +; GCN-LABEL: {{^}}singlethread_seq_cst: +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +define amdgpu_kernel void @singlethread_seq_cst( + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("singlethread") seq_cst, align 4 + ret void +} + +; GCN-LABEL: {{^}}agent_unordered: +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +define amdgpu_kernel void @agent_unordered( + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("agent") unordered, align 4 + ret void +} + +; GCN-LABEL: {{^}}agent_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +define amdgpu_kernel void @agent_monotonic( + i32 %in, i32* %out) { +entry: + store atomic i32 
%in, i32* %out syncscope("agent") monotonic, align 4 + ret void +} + +; GCN-LABEL: {{^}}agent_release: +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NEXT: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +define amdgpu_kernel void @agent_release( + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("agent") release, align 4 + ret void +} + +; GCN-LABEL: {{^}}agent_seq_cst: +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NEXT: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +define amdgpu_kernel void @agent_seq_cst( + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("agent") seq_cst, align 4 + ret void +} + +; GCN-LABEL: {{^}}workgroup_unordered: +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +define amdgpu_kernel void @workgroup_unordered( + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("workgroup") unordered, align 4 + ret void +} + +; GCN-LABEL: {{^}}workgroup_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +define amdgpu_kernel void @workgroup_monotonic( + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("workgroup") monotonic, align 4 + ret void +} + +; GCN-LABEL: {{^}}workgroup_release: +; GFX89-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +define amdgpu_kernel void @workgroup_release( + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("workgroup") release, align 4 + ret void +} + +; GCN-LABEL: {{^}}workgroup_seq_cst: +; GFX89-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +define amdgpu_kernel void @workgroup_seq_cst( + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("workgroup") seq_cst, align 4 + ret void +} + +; GCN-LABEL: {{^}}wavefront_unordered: +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +define amdgpu_kernel void @wavefront_unordered( + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("wavefront") unordered, align 4 + ret void +} + +; GCN-LABEL: {{^}}wavefront_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +define amdgpu_kernel void @wavefront_monotonic( + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("wavefront") monotonic, align 4 + ret void +} + +; GCN-LABEL: {{^}}wavefront_release: +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +define amdgpu_kernel void @wavefront_release( + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("wavefront") release, align 4 + ret void +} + +; GCN-LABEL: {{^}}wavefront_seq_cst: +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +define amdgpu_kernel void @wavefront_seq_cst( + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("wavefront") seq_cst, align 4 + ret void +} + !0 = !{i32 1} diff --git a/test/CodeGen/SPARC/missinglabel.ll b/test/CodeGen/SPARC/missinglabel.ll index dd73a5650924..60a3641ccea0 100644 --- a/test/CodeGen/SPARC/missinglabel.ll +++ b/test/CodeGen/SPARC/missinglabel.ll @@ -4,7 +4,7 @@ target datalayout = 
"E-m:e-i64:64-n32:64-S128" target triple = "sparc64-unknown-linux-gnu" -define void @f() align 2 { +define void @f(i64 %a0) align 2 { ; CHECK-LABEL: f: ; CHECK: .cfi_startproc ; CHECK-NEXT: ! %bb.0: ! %entry @@ -22,7 +22,7 @@ define void @f() align 2 { ; CHECK-NEXT: .LBB0_1: ! %cond.false ; CHECK-NEXT: .LBB0_4: ! %exit.i85 entry: - %cmp = icmp eq i64 undef, 0 + %cmp = icmp eq i64 %a0, 0 br i1 %cmp, label %targetblock, label %cond.false cond.false: diff --git a/test/CodeGen/Thumb2/umulo-128-legalisation-lowering.ll b/test/CodeGen/Thumb2/umulo-128-legalisation-lowering.ll index 5300bed0de88..ac1c814b838e 100644 --- a/test/CodeGen/Thumb2/umulo-128-legalisation-lowering.ll +++ b/test/CodeGen/Thumb2/umulo-128-legalisation-lowering.ll @@ -8,10 +8,10 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; THUMBV7-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; THUMBV7-NEXT: .pad #44 ; THUMBV7-NEXT: sub sp, #44 -; THUMBV7-NEXT: str r0, [sp, #40] @ 4-byte Spill -; THUMBV7-NEXT: movs r0, #0 ; THUMBV7-NEXT: ldrd r4, r7, [sp, #88] ; THUMBV7-NEXT: mov r5, r3 +; THUMBV7-NEXT: str r0, [sp, #40] @ 4-byte Spill +; THUMBV7-NEXT: movs r0, #0 ; THUMBV7-NEXT: strd r4, r7, [sp] ; THUMBV7-NEXT: mov r1, r3 ; THUMBV7-NEXT: strd r0, r0, [sp, #8] @@ -20,8 +20,8 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; THUMBV7-NEXT: movs r2, #0 ; THUMBV7-NEXT: movs r3, #0 ; THUMBV7-NEXT: bl __multi3 -; THUMBV7-NEXT: strd r1, r0, [sp, #32] -; THUMBV7-NEXT: strd r3, r2, [sp, #24] +; THUMBV7-NEXT: strd r1, r0, [sp, #32] @ 8-byte Folded Spill +; THUMBV7-NEXT: strd r3, r2, [sp, #24] @ 8-byte Folded Spill ; THUMBV7-NEXT: ldrd r2, r0, [sp, #96] ; THUMBV7-NEXT: ldr.w r9, [sp, #80] ; THUMBV7-NEXT: umull lr, r0, r0, r6 @@ -47,7 +47,7 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; THUMBV7-NEXT: adds r3, r3, r6 ; THUMBV7-NEXT: ldr r6, [sp, #24] @ 4-byte Reload ; THUMBV7-NEXT: adcs r2, r6 -; THUMBV7-NEXT: ldrd r6, lr, [sp, #36] +; THUMBV7-NEXT: ldrd r6, lr, [sp, #36] @ 8-byte Folded Reload ; THUMBV7-NEXT: str.w r6, [lr] ; THUMBV7-NEXT: adc r8, r8, #0 ; THUMBV7-NEXT: ldr r6, [sp, #32] @ 4-byte Reload diff --git a/test/CodeGen/X86/2006-11-17-IllegalMove.ll b/test/CodeGen/X86/2006-11-17-IllegalMove.ll index 8b71633edb8c..e0b14f2a6b4c 100644 --- a/test/CodeGen/X86/2006-11-17-IllegalMove.ll +++ b/test/CodeGen/X86/2006-11-17-IllegalMove.ll @@ -10,11 +10,9 @@ define void @handle_vector_size_attribute() nounwind { ; CHECK-NEXT: ja .LBB0_2 ; CHECK-NEXT: # %bb.1: # %bb77 ; CHECK-NEXT: movb 0, %al -; CHECK-NEXT: movzbl %al, %eax -; CHECK-NEXT: # kill: def $eax killed $eax def $ax -; CHECK-NEXT: divb 0 -; CHECK-NEXT: movzbl %al, %eax -; CHECK-NEXT: cmpq %rax, %rax +; CHECK-NEXT: movb 0, %al +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: testb %al, %al ; CHECK-NEXT: .LBB0_2: # %bb84 ; CHECK-NEXT: retq entry: diff --git a/test/CodeGen/X86/avx512-insert-extract.ll b/test/CodeGen/X86/avx512-insert-extract.ll index 3d194477f9a3..3acba30bfd33 100644 --- a/test/CodeGen/X86/avx512-insert-extract.ll +++ b/test/CodeGen/X86/avx512-insert-extract.ll @@ -1844,70 +1844,70 @@ define i96 @test_insertelement_variable_v96i1(<96 x i8> %a, i8 %b, i32 %index) { ; KNL-NEXT: andq $-128, %rsp ; KNL-NEXT: subq $256, %rsp ## imm = 0x100 ; KNL-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; KNL-NEXT: vpinsrb $1, 488(%rbp), %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $2, 496(%rbp), %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $3, 504(%rbp), %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $4, 512(%rbp), %xmm0, %xmm0 -; KNL-NEXT: 
vpinsrb $5, 520(%rbp), %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $6, 528(%rbp), %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $7, 536(%rbp), %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $8, 544(%rbp), %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $9, 552(%rbp), %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $10, 560(%rbp), %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $11, 568(%rbp), %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $12, 576(%rbp), %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $13, 584(%rbp), %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $14, 592(%rbp), %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $15, 600(%rbp), %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $1, 232(%rbp), %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $2, 240(%rbp), %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $3, 248(%rbp), %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $4, 256(%rbp), %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $5, 264(%rbp), %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $6, 272(%rbp), %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $7, 280(%rbp), %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $8, 288(%rbp), %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $9, 296(%rbp), %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $10, 304(%rbp), %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $11, 312(%rbp), %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $12, 320(%rbp), %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $13, 328(%rbp), %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $14, 336(%rbp), %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $15, 344(%rbp), %xmm0, %xmm0 ; KNL-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; KNL-NEXT: vpinsrb $1, 616(%rbp), %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $2, 624(%rbp), %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $3, 632(%rbp), %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $4, 640(%rbp), %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $5, 648(%rbp), %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $6, 656(%rbp), %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $7, 664(%rbp), %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $8, 672(%rbp), %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $9, 680(%rbp), %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $10, 688(%rbp), %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $11, 696(%rbp), %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $12, 704(%rbp), %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $13, 712(%rbp), %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $14, 720(%rbp), %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $15, 728(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $1, 360(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $2, 368(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $3, 376(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $4, 384(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $5, 392(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $6, 400(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $7, 408(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $8, 416(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $9, 424(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $10, 432(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $11, 440(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $12, 448(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $13, 456(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $14, 464(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $15, 472(%rbp), %xmm1, %xmm1 ; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; KNL-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; KNL-NEXT: vpinsrb $1, 232(%rbp), %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $2, 240(%rbp), %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $3, 248(%rbp), %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $4, 256(%rbp), %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $5, 264(%rbp), %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $6, 272(%rbp), %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $7, 280(%rbp), %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $8, 288(%rbp), %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $9, 296(%rbp), %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $10, 304(%rbp), %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $11, 312(%rbp), %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $12, 320(%rbp), %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $13, 328(%rbp), %xmm1, 
%xmm1 -; KNL-NEXT: vpinsrb $14, 336(%rbp), %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $15, 344(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $1, 488(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $2, 496(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $3, 504(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $4, 512(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $5, 520(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $6, 528(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $7, 536(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $8, 544(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $9, 552(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $10, 560(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $11, 568(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $12, 576(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $13, 584(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $14, 592(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $15, 600(%rbp), %xmm1, %xmm1 ; KNL-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; KNL-NEXT: vpinsrb $1, 360(%rbp), %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $2, 368(%rbp), %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $3, 376(%rbp), %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $4, 384(%rbp), %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $5, 392(%rbp), %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $6, 400(%rbp), %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $7, 408(%rbp), %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $8, 416(%rbp), %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $9, 424(%rbp), %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $10, 432(%rbp), %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $11, 440(%rbp), %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $12, 448(%rbp), %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $13, 456(%rbp), %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $14, 464(%rbp), %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $15, 472(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $1, 616(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $2, 624(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $3, 632(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $4, 640(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $5, 648(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $6, 656(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $7, 664(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $8, 672(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $9, 680(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $10, 688(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $11, 696(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $12, 704(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $13, 712(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $14, 720(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $15, 728(%rbp), %xmm2, %xmm2 ; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; KNL-NEXT: vmovd %edi, %xmm2 ; KNL-NEXT: vpinsrb $1, %esi, %xmm2, %xmm2 @@ -1952,7 +1952,6 @@ define i96 @test_insertelement_variable_v96i1(<96 x i8> %a, i8 %b, i32 %index) { ; KNL-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm0 ; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; KNL-NEXT: cmpb $0, 736(%rbp) -; KNL-NEXT: vmovdqa %ymm3, {{[0-9]+}}(%rsp) ; KNL-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp) ; KNL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp) ; KNL-NEXT: vmovdqa %ymm2, (%rsp) diff --git a/test/CodeGen/X86/select.ll b/test/CodeGen/X86/select.ll index 6064917c7a26..11c531f9b775 100644 --- a/test/CodeGen/X86/select.ll +++ b/test/CodeGen/X86/select.ll @@ -1252,106 +1252,6 @@ define void @clamp(i32 %src, i16* %dst) { ret void } -define void @test19() { -; This is a massive reduction of an llvm-stress test case that generates -; interesting chains feeding setcc and eventually a f32 select operation. This -; is intended to exercise the SELECT formation in the DAG combine simplifying -; a simplified select_cc node. If it it regresses and is no longer triggering -; that code path, it can be deleted. 
-; -; CHECK-LABEL: test19: -; CHECK: ## %bb.0: ## %BB -; CHECK-NEXT: movl $-1, %eax -; CHECK-NEXT: movb $1, %cl -; CHECK-NEXT: .p2align 4, 0x90 -; CHECK-NEXT: LBB23_1: ## %CF -; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: testb %cl, %cl -; CHECK-NEXT: jne LBB23_1 -; CHECK-NEXT: ## %bb.2: ## %CF250 -; CHECK-NEXT: ## in Loop: Header=BB23_1 Depth=1 -; CHECK-NEXT: jne LBB23_1 -; CHECK-NEXT: .p2align 4, 0x90 -; CHECK-NEXT: LBB23_3: ## %CF242 -; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: cmpl %eax, %eax -; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: jp LBB23_3 -; CHECK-NEXT: ## %bb.4: ## %CF244 -; CHECK-NEXT: retq -; -; ATHLON-LABEL: test19: -; ATHLON: ## %bb.0: ## %BB -; ATHLON-NEXT: movb $1, %al -; ATHLON-NEXT: .p2align 4, 0x90 -; ATHLON-NEXT: LBB23_1: ## %CF -; ATHLON-NEXT: ## =>This Inner Loop Header: Depth=1 -; ATHLON-NEXT: testb %al, %al -; ATHLON-NEXT: jne LBB23_1 -; ATHLON-NEXT: ## %bb.2: ## %CF250 -; ATHLON-NEXT: ## in Loop: Header=BB23_1 Depth=1 -; ATHLON-NEXT: jne LBB23_1 -; ATHLON-NEXT: ## %bb.3: ## %CF242.preheader -; ATHLON-NEXT: fldz -; ATHLON-NEXT: .p2align 4, 0x90 -; ATHLON-NEXT: LBB23_4: ## %CF242 -; ATHLON-NEXT: ## =>This Inner Loop Header: Depth=1 -; ATHLON-NEXT: fucomi %st(0), %st -; ATHLON-NEXT: jp LBB23_4 -; ATHLON-NEXT: ## %bb.5: ## %CF244 -; ATHLON-NEXT: fstp %st(0) -; ATHLON-NEXT: retl -; -; MCU-LABEL: test19: -; MCU: # %bb.0: # %BB -; MCU-NEXT: movl $-1, %ecx -; MCU-NEXT: movb $1, %al -; MCU-NEXT: .p2align 4, 0x90 -; MCU-NEXT: .LBB23_1: # %CF -; MCU-NEXT: # =>This Inner Loop Header: Depth=1 -; MCU-NEXT: testb %al, %al -; MCU-NEXT: jne .LBB23_1 -; MCU-NEXT: # %bb.2: # %CF250 -; MCU-NEXT: # in Loop: Header=BB23_1 Depth=1 -; MCU-NEXT: jne .LBB23_1 -; MCU-NEXT: # %bb.3: # %CF242.preheader -; MCU-NEXT: fldz -; MCU-NEXT: .p2align 4, 0x90 -; MCU-NEXT: .LBB23_4: # %CF242 -; MCU-NEXT: # =>This Inner Loop Header: Depth=1 -; MCU-NEXT: cmpl %eax, %ecx -; MCU-NEXT: fucom %st(0) -; MCU-NEXT: fnstsw %ax -; MCU-NEXT: # kill: def $ah killed $ah killed $ax -; MCU-NEXT: sahf -; MCU-NEXT: jp .LBB23_4 -; MCU-NEXT: # %bb.5: # %CF244 -; MCU-NEXT: fstp %st(0) -; MCU-NEXT: retl -BB: - br label %CF - -CF: - %Cmp10 = icmp ule i8 undef, undef - br i1 %Cmp10, label %CF, label %CF250 - -CF250: - %E12 = extractelement <4 x i32> , i32 2 - %Cmp32 = icmp ugt i1 %Cmp10, false - br i1 %Cmp32, label %CF, label %CF242 - -CF242: - %Cmp38 = icmp uge i32 %E12, undef - %FC = uitofp i1 %Cmp38 to float - %Sl59 = select i1 %Cmp32, float %FC, float undef - %Cmp60 = fcmp ugt float undef, undef - br i1 %Cmp60, label %CF242, label %CF244 - -CF244: - %B122 = fadd float %Sl59, undef - ret void -} - define i16 @select_xor_1(i16 %A, i8 %cond) { ; CHECK-LABEL: select_xor_1: ; CHECK: ## %bb.0: ## %entry @@ -1413,10 +1313,10 @@ define i16 @select_xor_1b(i16 %A, i8 %cond) { ; MCU-LABEL: select_xor_1b: ; MCU: # %bb.0: # %entry ; MCU-NEXT: testb $1, %dl -; MCU-NEXT: je .LBB25_2 +; MCU-NEXT: je .LBB24_2 ; MCU-NEXT: # %bb.1: ; MCU-NEXT: xorl $43, %eax -; MCU-NEXT: .LBB25_2: # %entry +; MCU-NEXT: .LBB24_2: # %entry ; MCU-NEXT: # kill: def $ax killed $ax killed $eax ; MCU-NEXT: retl entry: @@ -1483,10 +1383,10 @@ define i32 @select_xor_2b(i32 %A, i32 %B, i8 %cond) { ; MCU-LABEL: select_xor_2b: ; MCU: # %bb.0: # %entry ; MCU-NEXT: testb $1, %cl -; MCU-NEXT: je .LBB27_2 +; MCU-NEXT: je .LBB26_2 ; MCU-NEXT: # %bb.1: ; MCU-NEXT: xorl %edx, %eax -; MCU-NEXT: .LBB27_2: # %entry +; MCU-NEXT: .LBB26_2: # %entry ; MCU-NEXT: retl entry: %and = and i8 %cond, 1 @@ -1552,10 +1452,10 @@ define i32 
@select_or_b(i32 %A, i32 %B, i8 %cond) { ; MCU-LABEL: select_or_b: ; MCU: # %bb.0: # %entry ; MCU-NEXT: testb $1, %cl -; MCU-NEXT: je .LBB29_2 +; MCU-NEXT: je .LBB28_2 ; MCU-NEXT: # %bb.1: ; MCU-NEXT: orl %edx, %eax -; MCU-NEXT: .LBB29_2: # %entry +; MCU-NEXT: .LBB28_2: # %entry ; MCU-NEXT: retl entry: %and = and i8 %cond, 1 @@ -1621,10 +1521,10 @@ define i32 @select_or_1b(i32 %A, i32 %B, i32 %cond) { ; MCU-LABEL: select_or_1b: ; MCU: # %bb.0: # %entry ; MCU-NEXT: testb $1, %cl -; MCU-NEXT: je .LBB31_2 +; MCU-NEXT: je .LBB30_2 ; MCU-NEXT: # %bb.1: ; MCU-NEXT: orl %edx, %eax -; MCU-NEXT: .LBB31_2: # %entry +; MCU-NEXT: .LBB30_2: # %entry ; MCU-NEXT: retl entry: %and = and i32 %cond, 1 diff --git a/test/CodeGen/X86/undef-ops.ll b/test/CodeGen/X86/undef-ops.ll index 257238fe2497..2cc9212d83c4 100644 --- a/test/CodeGen/X86/undef-ops.ll +++ b/test/CodeGen/X86/undef-ops.ll @@ -450,8 +450,6 @@ define <4 x i32> @xor_undef_lhs_vec(<4 x i32> %x) { define i1 @undef_operand_size_not_same_as_result() { ; CHECK-LABEL: undef_operand_size_not_same_as_result: ; CHECK: # %bb.0: -; CHECK-NEXT: testl %eax, %eax -; CHECK-NEXT: sete %al ; CHECK-NEXT: retq %sh = shl i32 7, undef %cmp = icmp eq i32 0, %sh diff --git a/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll b/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll index 4364d3d9363d..b8b17c49e803 100644 --- a/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll +++ b/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll @@ -683,16 +683,12 @@ define <4 x i32> @test_urem_both(<4 x i32> %X) nounwind readnone { define <4 x i32> @test_urem_div_undef(<4 x i32> %X) nounwind readnone { ; CHECK-SSE-LABEL: test_urem_div_undef: ; CHECK-SSE: # %bb.0: -; CHECK-SSE-NEXT: pxor %xmm0, %xmm0 -; CHECK-SSE-NEXT: pcmpeqd %xmm0, %xmm0 -; CHECK-SSE-NEXT: psrld $31, %xmm0 +; CHECK-SSE-NEXT: xorps %xmm0, %xmm0 ; CHECK-SSE-NEXT: retq ; ; CHECK-AVX-LABEL: test_urem_div_undef: ; CHECK-AVX: # %bb.0: -; CHECK-AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; CHECK-AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; CHECK-AVX-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; CHECK-AVX-NEXT: retq %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, @@ -794,16 +790,12 @@ define <4 x i32> @test_urem_comp_undef(<4 x i32> %X) nounwind readnone { define <4 x i32> @test_urem_both_undef(<4 x i32> %X) nounwind readnone { ; CHECK-SSE-LABEL: test_urem_both_undef: ; CHECK-SSE: # %bb.0: -; CHECK-SSE-NEXT: pxor %xmm0, %xmm0 -; CHECK-SSE-NEXT: pcmpeqd %xmm0, %xmm0 -; CHECK-SSE-NEXT: psrld $31, %xmm0 +; CHECK-SSE-NEXT: xorps %xmm0, %xmm0 ; CHECK-SSE-NEXT: retq ; ; CHECK-AVX-LABEL: test_urem_both_undef: ; CHECK-AVX: # %bb.0: -; CHECK-AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; CHECK-AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; CHECK-AVX-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; CHECK-AVX-NEXT: retq %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, diff --git a/test/CodeGen/X86/vector-compare-all_of.ll b/test/CodeGen/X86/vector-compare-all_of.ll index 9e6b22340c4f..8b947b7f475a 100644 --- a/test/CodeGen/X86/vector-compare-all_of.ll +++ b/test/CodeGen/X86/vector-compare-all_of.ll @@ -917,3 +917,707 @@ define i8 @test_v32i8_sext(<32 x i8> %a0, <32 x i8> %a1) { %11 = extractelement <32 x i8> %10, i32 0 ret i8 %11 } + +define i1 @bool_reduction_v2f64(<2 x double> %x, <2 x double> %y) { +; SSE-LABEL: bool_reduction_v2f64: +; SSE: # %bb.0: +; SSE-NEXT: cmpltpd %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: pextrb $0, %xmm0, %eax +; 
SSE-NEXT: # kill: def $al killed $al killed $eax +; SSE-NEXT: retq +; +; AVX-LABEL: bool_reduction_v2f64: +; AVX: # %bb.0: +; AVX-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vandpd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpextrb $0, %xmm0, %eax +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: retq +; +; AVX512-LABEL: bool_reduction_v2f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vcmpltpd %xmm0, %xmm1, %k1 +; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX512-NEXT: vpsllq $63, %xmm0, %xmm0 +; AVX512-NEXT: vptestmq %xmm0, %xmm0, %k0 {%k1} +; AVX512-NEXT: kmovd %k0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: retq + %a = fcmp ogt <2 x double> %x, %y + %b = shufflevector <2 x i1> %a, <2 x i1> undef, <2 x i32> + %c = and <2 x i1> %a, %b + %d = extractelement <2 x i1> %c, i32 0 + ret i1 %d +} + +define i1 @bool_reduction_v4f32(<4 x float> %x, <4 x float> %y) { +; SSE-LABEL: bool_reduction_v4f32: +; SSE: # %bb.0: +; SSE-NEXT: cmpeqps %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: pextrb $0, %xmm0, %eax +; SSE-NEXT: # kill: def $al killed $al killed $eax +; SSE-NEXT: retq +; +; AVX-LABEL: bool_reduction_v4f32: +; AVX: # %bb.0: +; AVX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vandpd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-NEXT: vandpd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpextrb $0, %xmm0, %eax +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: retq +; +; AVX512-LABEL: bool_reduction_v4f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vcmpeqps %xmm1, %xmm0, %k1 +; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} {z} +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX512-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX512-NEXT: vptestmd %xmm1, %xmm1, %k1 {%k1} +; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX512-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k0 {%k1} +; AVX512-NEXT: kmovd %k0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: retq + %a = fcmp oeq <4 x float> %x, %y + %s1 = shufflevector <4 x i1> %a, <4 x i1> undef, <4 x i32> + %b = and <4 x i1> %s1, %a + %s2 = shufflevector <4 x i1> %b, <4 x i1> undef, <4 x i32> + %c = and <4 x i1> %s2, %b + %d = extractelement <4 x i1> %c, i32 0 + ret i1 %d +} + +define i1 @bool_reduction_v4f64(<4 x double> %x, <4 x double> %y) { +; SSE-LABEL: bool_reduction_v4f64: +; SSE: # %bb.0: +; SSE-NEXT: cmplepd %xmm1, %xmm3 +; SSE-NEXT: cmplepd %xmm0, %xmm2 +; SSE-NEXT: movapd %xmm2, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm2[0,2] +; SSE-NEXT: andps %xmm0, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3] +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: pextrb $0, %xmm0, %eax +; SSE-NEXT: # kill: def $al killed $al killed $eax +; SSE-NEXT: retq +; +; AVX-LABEL: bool_reduction_v4f64: +; AVX: # %bb.0: +; AVX-NEXT: vcmplepd %ymm0, %ymm1, %ymm0 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-NEXT: vpackssdw %xmm1, %xmm0, %xmm2 +; AVX-NEXT: vpackssdw %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX-NEXT: 
vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpextrb $0, %xmm0, %eax +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX512-LABEL: bool_reduction_v4f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vcmplepd %ymm0, %ymm1, %k1 +; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} {z} +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX512-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX512-NEXT: vptestmd %xmm1, %xmm1, %k1 {%k1} +; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX512-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k0 {%k1} +; AVX512-NEXT: kmovd %k0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %a = fcmp oge <4 x double> %x, %y + %s1 = shufflevector <4 x i1> %a, <4 x i1> undef, <4 x i32> + %b = and <4 x i1> %s1, %a + %s2 = shufflevector <4 x i1> %b, <4 x i1> undef, <4 x i32> + %c = and <4 x i1> %s2, %b + %d = extractelement <4 x i1> %c, i32 0 + ret i1 %d +} + +define i1 @bool_reduction_v8f32(<8 x float> %x, <8 x float> %y) { +; SSE-LABEL: bool_reduction_v8f32: +; SSE: # %bb.0: +; SSE-NEXT: cmpneqps %xmm3, %xmm1 +; SSE-NEXT: cmpneqps %xmm2, %xmm0 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: pextrb $0, %xmm1, %eax +; SSE-NEXT: # kill: def $al killed $al killed $eax +; SSE-NEXT: retq +; +; AVX-LABEL: bool_reduction_v8f32: +; AVX: # %bb.0: +; AVX-NEXT: vcmpneqps %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpextrb $0, %xmm0, %eax +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX512-LABEL: bool_reduction_v8f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vcmpneqps %ymm1, %ymm0, %k1 +; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z} +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512-NEXT: vpslld $31, %ymm1, %ymm1 +; AVX512-NEXT: vptestmd %ymm1, %ymm1, %k1 {%k1} +; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX512-NEXT: vpslld $31, %ymm0, %ymm0 +; AVX512-NEXT: vptestmd %ymm0, %ymm0, %k1 {%k1} +; AVX512-NEXT: vptestmd %ymm0, %ymm0, %k0 {%k1} +; AVX512-NEXT: kmovd %k0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %a = fcmp une <8 x float> %x, %y + %s1 = shufflevector <8 x i1> %a, <8 x i1> undef, <8 x i32> + %b = and <8 x i1> %s1, %a + %s2 = shufflevector <8 x i1> %b, <8 x i1> undef, <8 x i32> + %c = and <8 x i1> %s2, %b + %s3 = shufflevector <8 x i1> %c, <8 x i1> undef, <8 x i32> + %d = and <8 x i1> %s2, %c + %e = extractelement <8 x i1> %d, i32 0 + ret i1 %e +} + +define i1 @bool_reduction_v2i64(<2 x i64> %x, <2 x i64> %y) { +; SSE-LABEL: bool_reduction_v2i64: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; SSE-NEXT: pxor %xmm2, %xmm1 +; SSE-NEXT: pxor %xmm2, %xmm0 +; SSE-NEXT: 
pcmpgtq %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: pextrb $0, %xmm1, %eax +; SSE-NEXT: # kill: def $al killed $al killed $eax +; SSE-NEXT: retq +; +; AVX-LABEL: bool_reduction_v2i64: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpextrb $0, %xmm0, %eax +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: retq +; +; AVX512-LABEL: bool_reduction_v2i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpcmpnleuq %xmm1, %xmm0, %k1 +; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX512-NEXT: vpsllq $63, %xmm0, %xmm0 +; AVX512-NEXT: vptestmq %xmm0, %xmm0, %k0 {%k1} +; AVX512-NEXT: kmovd %k0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: retq + %a = icmp ugt <2 x i64> %x, %y + %b = shufflevector <2 x i1> %a, <2 x i1> undef, <2 x i32> + %c = and <2 x i1> %a, %b + %d = extractelement <2 x i1> %c, i32 0 + ret i1 %d +} + +define i1 @bool_reduction_v4i32(<4 x i32> %x, <4 x i32> %y) { +; SSE-LABEL: bool_reduction_v4i32: +; SSE: # %bb.0: +; SSE-NEXT: pcmpeqd %xmm1, %xmm0 +; SSE-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE-NEXT: pxor %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: pextrb $0, %xmm1, %eax +; SSE-NEXT: # kill: def $al killed $al killed $eax +; SSE-NEXT: retq +; +; AVX-LABEL: bool_reduction_v4i32: +; AVX: # %bb.0: +; AVX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpextrb $0, %xmm0, %eax +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: retq +; +; AVX512-LABEL: bool_reduction_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpcmpneqd %xmm1, %xmm0, %k1 +; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} {z} +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX512-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX512-NEXT: vptestmd %xmm1, %xmm1, %k1 {%k1} +; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX512-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k0 {%k1} +; AVX512-NEXT: kmovd %k0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: retq + %a = icmp ne <4 x i32> %x, %y + %s1 = shufflevector <4 x i1> %a, <4 x i1> undef, <4 x i32> + %b = and <4 x i1> %s1, %a + %s2 = shufflevector <4 x i1> %b, <4 x i1> undef, <4 x i32> + %c = and <4 x i1> %s2, %b + %d = extractelement <4 x i1> %c, i32 0 + ret i1 %d +} + +define i1 @bool_reduction_v8i16(<8 x i16> %x, <8 x i16> %y) { +; SSE-LABEL: bool_reduction_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: pcmpgtw %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: pand %xmm1, 
%xmm0 +; SSE-NEXT: pextrb $0, %xmm0, %eax +; SSE-NEXT: # kill: def $al killed $al killed $eax +; SSE-NEXT: retq +; +; AVX-LABEL: bool_reduction_v8i16: +; AVX: # %bb.0: +; AVX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpextrb $0, %xmm0, %eax +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: retq +; +; AVX512-LABEL: bool_reduction_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpcmpgtw %xmm0, %xmm1, %k1 +; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z} +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512-NEXT: vpslld $31, %ymm1, %ymm1 +; AVX512-NEXT: vptestmd %ymm1, %ymm1, %k1 {%k1} +; AVX512-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z} +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX512-NEXT: vpslld $31, %ymm1, %ymm1 +; AVX512-NEXT: vptestmd %ymm1, %ymm1, %k1 {%k1} +; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX512-NEXT: vpslld $31, %ymm0, %ymm0 +; AVX512-NEXT: vptestmd %ymm0, %ymm0, %k0 {%k1} +; AVX512-NEXT: kmovd %k0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %a = icmp slt <8 x i16> %x, %y + %s1 = shufflevector <8 x i1> %a, <8 x i1> undef, <8 x i32> + %b = and <8 x i1> %s1, %a + %s2 = shufflevector <8 x i1> %b, <8 x i1> undef, <8 x i32> + %c = and <8 x i1> %s2, %b + %s3 = shufflevector <8 x i1> %c, <8 x i1> undef, <8 x i32> + %d = and <8 x i1> %s3, %c + %e = extractelement <8 x i1> %d, i32 0 + ret i1 %e +} + +define i1 @bool_reduction_v16i8(<16 x i8> %x, <16 x i8> %y) { +; SSE-LABEL: bool_reduction_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: pcmpgtb %xmm1, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: xorl %ecx, %ecx +; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE-NEXT: movl $-1, %eax +; SSE-NEXT: cmovnel %ecx, %eax +; SSE-NEXT: # kill: def $al killed $al killed $eax +; SSE-NEXT: retq +; +; AVX-LABEL: bool_reduction_v16i8: +; AVX: # %bb.0: +; AVX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpmovmskb %xmm0, %eax +; AVX-NEXT: xorl %ecx, %ecx +; AVX-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; AVX-NEXT: movl $-1, %eax +; AVX-NEXT: cmovnel %ecx, %eax +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: retq +; +; AVX512-LABEL: bool_reduction_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 +; AVX512-NEXT: kshiftrw $8, %k0, %k1 +; AVX512-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 {%k1} +; AVX512-NEXT: kshiftrw $4, %k0, %k1 +; AVX512-NEXT: kandw %k0, %k1, %k0 +; AVX512-NEXT: kshiftrw $2, %k0, %k1 +; AVX512-NEXT: kandw %k0, %k1, %k0 +; AVX512-NEXT: kshiftrw $1, %k0, %k1 +; AVX512-NEXT: kandw %k0, %k1, %k0 +; AVX512-NEXT: kmovd %k0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: retq + %a = icmp sgt <16 x i8> %x, %y + %s1 = shufflevector <16 x i1> %a, <16 x i1> undef, <16 x i32> + %b = and <16 x i1> %s1, %a + %s2 = shufflevector <16 x i1> %b, <16 x i1> undef, <16 x i32> + %c = and <16 x i1> %s2, %b + %s3 = shufflevector <16 x i1> %c, <16 x i1> undef, <16 x i32> + %d = and <16 x i1> %s3, %c + %s4 = shufflevector <16 x i1> %d, <16 x i1> undef, <16 x i32> + %e = and <16 x i1> %s4, %d + %f = extractelement <16 x i1> %e, i32 0 + ret i1 %f +} + +define i1 @bool_reduction_v4i64(<4 x i64> %x, <4 x 
i64> %y) { +; SSE-LABEL: bool_reduction_v4i64: +; SSE: # %bb.0: +; SSE-NEXT: pcmpgtq %xmm1, %xmm3 +; SSE-NEXT: pcmpgtq %xmm0, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: pextrb $0, %xmm1, %eax +; SSE-NEXT: # kill: def $al killed $al killed $eax +; SSE-NEXT: retq +; +; AVX1-LABEL: bool_reduction_v4i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm1 +; AVX1-NEXT: vpackssdw %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpextrb $0, %xmm0, %eax +; AVX1-NEXT: # kill: def $al killed $al killed $eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: bool_reduction_v4i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpackssdw %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpextrb $0, %xmm0, %eax +; AVX2-NEXT: # kill: def $al killed $al killed $eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: bool_reduction_v4i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpcmpgtq %ymm0, %ymm1, %k1 +; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} {z} +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX512-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX512-NEXT: vptestmd %xmm1, %xmm1, %k1 {%k1} +; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX512-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k0 {%k1} +; AVX512-NEXT: kmovd %k0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %a = icmp slt <4 x i64> %x, %y + %s1 = shufflevector <4 x i1> %a, <4 x i1> undef, <4 x i32> + %b = and <4 x i1> %s1, %a + %s2 = shufflevector <4 x i1> %b, <4 x i1> undef, <4 x i32> + %c = and <4 x i1> %s2, %b + %d = extractelement <4 x i1> %c, i32 0 + ret i1 %d +} + +define i1 @bool_reduction_v8i32(<8 x i32> %x, <8 x i32> %y) { +; SSE-LABEL: bool_reduction_v8i32: +; SSE: # %bb.0: +; SSE-NEXT: pminud %xmm1, %xmm3 +; SSE-NEXT: pcmpeqd %xmm1, %xmm3 +; SSE-NEXT: pminud %xmm0, %xmm2 +; SSE-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE-NEXT: packssdw %xmm3, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: pextrb $0, %xmm0, %eax +; SSE-NEXT: # kill: def $al killed $al killed $eax +; SSE-NEXT: retq +; +; AVX1-LABEL: bool_reduction_v8i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpminud %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; 
AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpextrb $0, %xmm0, %eax +; AVX1-NEXT: # kill: def $al killed $al killed $eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: bool_reduction_v8i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpextrb $0, %xmm0, %eax +; AVX2-NEXT: # kill: def $al killed $al killed $eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: bool_reduction_v8i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpcmpleud %ymm1, %ymm0, %k1 +; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z} +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512-NEXT: vpslld $31, %ymm1, %ymm1 +; AVX512-NEXT: vptestmd %ymm1, %ymm1, %k1 {%k1} +; AVX512-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z} +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX512-NEXT: vpslld $31, %ymm1, %ymm1 +; AVX512-NEXT: vptestmd %ymm1, %ymm1, %k1 {%k1} +; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX512-NEXT: vpslld $31, %ymm0, %ymm0 +; AVX512-NEXT: vptestmd %ymm0, %ymm0, %k0 {%k1} +; AVX512-NEXT: kmovd %k0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %a = icmp ule <8 x i32> %x, %y + %s1 = shufflevector <8 x i1> %a, <8 x i1> undef, <8 x i32> + %b = and <8 x i1> %s1, %a + %s2 = shufflevector <8 x i1> %b, <8 x i1> undef, <8 x i32> + %c = and <8 x i1> %s2, %b + %s3 = shufflevector <8 x i1> %c, <8 x i1> undef, <8 x i32> + %d = and <8 x i1> %s3, %c + %e = extractelement <8 x i1> %d, i32 0 + ret i1 %e +} + +define i1 @bool_reduction_v16i16(<16 x i16> %x, <16 x i16> %y) { +; SSE-LABEL: bool_reduction_v16i16: +; SSE: # %bb.0: +; SSE-NEXT: pcmpeqw %xmm3, %xmm1 +; SSE-NEXT: pcmpeqw %xmm2, %xmm0 +; SSE-NEXT: packsswb %xmm1, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: xorl %ecx, %ecx +; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE-NEXT: movl $-1, %eax +; SSE-NEXT: cmovnel %ecx, %eax +; SSE-NEXT: # kill: def $al killed $al killed $eax +; SSE-NEXT: retq +; +; AVX1-LABEL: bool_reduction_v16i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpcmpeqw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; AVX1-NEXT: movl $-1, %eax +; AVX1-NEXT: cmovnel %ecx, %eax +; AVX1-NEXT: # kill: def $al killed $al killed $eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: bool_reduction_v16i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmovmskb %xmm0, %eax +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; AVX2-NEXT: movl $-1, %eax +; AVX2-NEXT: cmovnel 
%ecx, %eax +; AVX2-NEXT: # kill: def $al killed $al killed $eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: bool_reduction_v16i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 +; AVX512-NEXT: kshiftrw $8, %k0, %k1 +; AVX512-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} +; AVX512-NEXT: kshiftrw $4, %k0, %k1 +; AVX512-NEXT: kandw %k0, %k1, %k0 +; AVX512-NEXT: kshiftrw $2, %k0, %k1 +; AVX512-NEXT: kandw %k0, %k1, %k0 +; AVX512-NEXT: kshiftrw $1, %k0, %k1 +; AVX512-NEXT: kandw %k0, %k1, %k0 +; AVX512-NEXT: kmovd %k0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %a = icmp eq <16 x i16> %x, %y + %s1 = shufflevector <16 x i1> %a, <16 x i1> undef, <16 x i32> + %b = and <16 x i1> %s1, %a + %s2 = shufflevector <16 x i1> %b, <16 x i1> undef, <16 x i32> + %c = and <16 x i1> %s2, %b + %s3 = shufflevector <16 x i1> %c, <16 x i1> undef, <16 x i32> + %d = and <16 x i1> %s3, %c + %s4 = shufflevector <16 x i1> %d, <16 x i1> undef, <16 x i32> + %e = and <16 x i1> %s4, %d + %f = extractelement <16 x i1> %e, i32 0 + ret i1 %f +} + +define i1 @bool_reduction_v32i8(<32 x i8> %x, <32 x i8> %y) { +; SSE-LABEL: bool_reduction_v32i8: +; SSE: # %bb.0: +; SSE-NEXT: pcmpeqb %xmm2, %xmm0 +; SSE-NEXT: pcmpeqb %xmm3, %xmm1 +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: pmovmskb %xmm1, %eax +; SSE-NEXT: xorl %ecx, %ecx +; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE-NEXT: movl $-1, %eax +; SSE-NEXT: cmovnel %ecx, %eax +; SSE-NEXT: # kill: def $al killed $al killed $eax +; SSE-NEXT: retq +; +; AVX1-LABEL: bool_reduction_v32i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpextrb $0, %xmm0, %eax +; AVX1-NEXT: # kill: def $al killed $al killed $eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: bool_reduction_v32i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpmovmskb %ymm0, %ecx +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: cmpl $-1, %ecx +; AVX2-NEXT: cmovel %ecx, %eax +; AVX2-NEXT: # kill: def $al killed $al killed $eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: bool_reduction_v32i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 +; AVX512-NEXT: kshiftrd $16, %k0, %k1 +; AVX512-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 {%k1} +; AVX512-NEXT: kshiftrd $8, %k0, %k1 +; AVX512-NEXT: kandd %k0, %k1, %k0 +; AVX512-NEXT: kshiftrd $4, %k0, %k1 +; AVX512-NEXT: kandd %k0, %k1, %k0 +; AVX512-NEXT: kshiftrd $2, %k0, %k1 +; AVX512-NEXT: kandd %k0, %k1, %k0 +; AVX512-NEXT: kshiftrd $1, %k0, %k1 +; AVX512-NEXT: kandd %k0, %k1, %k0 +; AVX512-NEXT: kmovd %k0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %a = icmp eq <32 x i8> %x, %y + %s1 = shufflevector <32 x i1> %a, <32 x i1> undef, <32 x i32> + %b = and <32 x i1> %s1, %a + %s2 = shufflevector <32 x i1> %b, <32 x i1> undef, <32 x i32> + %c = and <32 x i1> %s2, %b + %s3 = shufflevector <32 x i1> %c, <32 x i1> undef, <32 x i32> 
+ %d = and <32 x i1> %s3, %c + %s4 = shufflevector <32 x i1> %d, <32 x i1> undef, <32 x i32> + %e = and <32 x i1> %s4, %d + %s5 = shufflevector <32 x i1> %e, <32 x i1> undef, <32 x i32> + %f = and <32 x i1> %s5, %e + %g = extractelement <32 x i1> %f, i32 0 + ret i1 %g +} diff --git a/test/CodeGen/X86/vector-compare-any_of.ll b/test/CodeGen/X86/vector-compare-any_of.ll index 1eb6e9edf136..65c3052b79eb 100644 --- a/test/CodeGen/X86/vector-compare-any_of.ll +++ b/test/CodeGen/X86/vector-compare-any_of.ll @@ -891,3 +891,719 @@ define i8 @test_v32i8_sext(<32 x i8> %a0, <32 x i8> %a1) { %11 = extractelement <32 x i8> %10, i32 0 ret i8 %11 } + +define i1 @bool_reduction_v2f64(<2 x double> %x, <2 x double> %y) { +; SSE-LABEL: bool_reduction_v2f64: +; SSE: # %bb.0: +; SSE-NEXT: cmpltpd %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pextrb $0, %xmm0, %eax +; SSE-NEXT: # kill: def $al killed $al killed $eax +; SSE-NEXT: retq +; +; AVX-LABEL: bool_reduction_v2f64: +; AVX: # %bb.0: +; AVX-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vorpd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpextrb $0, %xmm0, %eax +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: retq +; +; AVX512-LABEL: bool_reduction_v2f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vcmpltpd %xmm0, %xmm1, %k1 +; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX512-NEXT: vpsllq $63, %xmm0, %xmm0 +; AVX512-NEXT: vptestmq %xmm0, %xmm0, %k0 +; AVX512-NEXT: korw %k0, %k1, %k0 +; AVX512-NEXT: kmovd %k0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: retq + %a = fcmp ogt <2 x double> %x, %y + %b = shufflevector <2 x i1> %a, <2 x i1> undef, <2 x i32> + %c = or <2 x i1> %a, %b + %d = extractelement <2 x i1> %c, i32 0 + ret i1 %d +} + +define i1 @bool_reduction_v4f32(<4 x float> %x, <4 x float> %y) { +; SSE-LABEL: bool_reduction_v4f32: +; SSE: # %bb.0: +; SSE-NEXT: cmpeqps %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pextrb $0, %xmm0, %eax +; SSE-NEXT: # kill: def $al killed $al killed $eax +; SSE-NEXT: retq +; +; AVX-LABEL: bool_reduction_v4f32: +; AVX: # %bb.0: +; AVX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vorpd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-NEXT: vorpd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpextrb $0, %xmm0, %eax +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: retq +; +; AVX512-LABEL: bool_reduction_v4f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vcmpeqps %xmm1, %xmm0, %k1 +; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} {z} +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX512-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX512-NEXT: vptestmd %xmm1, %xmm1, %k0 +; AVX512-NEXT: korw %k1, %k0, %k1 +; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX512-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k0 +; AVX512-NEXT: korw %k1, %k0, %k0 +; AVX512-NEXT: kmovd %k0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: retq + %a = fcmp oeq <4 x float> %x, %y + %s1 = shufflevector <4 x i1> %a, <4 x i1> undef, <4 x i32> + %b = or 
<4 x i1> %s1, %a + %s2 = shufflevector <4 x i1> %b, <4 x i1> undef, <4 x i32> + %c = or <4 x i1> %s2, %b + %d = extractelement <4 x i1> %c, i32 0 + ret i1 %d +} + +define i1 @bool_reduction_v4f64(<4 x double> %x, <4 x double> %y) { +; SSE-LABEL: bool_reduction_v4f64: +; SSE: # %bb.0: +; SSE-NEXT: cmplepd %xmm1, %xmm3 +; SSE-NEXT: cmplepd %xmm0, %xmm2 +; SSE-NEXT: movapd %xmm2, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm2[0,2] +; SSE-NEXT: orps %xmm0, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3] +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: pextrb $0, %xmm0, %eax +; SSE-NEXT: # kill: def $al killed $al killed $eax +; SSE-NEXT: retq +; +; AVX-LABEL: bool_reduction_v4f64: +; AVX: # %bb.0: +; AVX-NEXT: vcmplepd %ymm0, %ymm1, %ymm0 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-NEXT: vpackssdw %xmm1, %xmm0, %xmm2 +; AVX-NEXT: vpackssdw %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpextrb $0, %xmm0, %eax +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX512-LABEL: bool_reduction_v4f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vcmplepd %ymm0, %ymm1, %k1 +; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} {z} +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX512-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX512-NEXT: vptestmd %xmm1, %xmm1, %k0 +; AVX512-NEXT: korw %k1, %k0, %k1 +; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX512-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k0 +; AVX512-NEXT: korw %k1, %k0, %k0 +; AVX512-NEXT: kmovd %k0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %a = fcmp oge <4 x double> %x, %y + %s1 = shufflevector <4 x i1> %a, <4 x i1> undef, <4 x i32> + %b = or <4 x i1> %s1, %a + %s2 = shufflevector <4 x i1> %b, <4 x i1> undef, <4 x i32> + %c = or <4 x i1> %s2, %b + %d = extractelement <4 x i1> %c, i32 0 + ret i1 %d +} + +define i1 @bool_reduction_v8f32(<8 x float> %x, <8 x float> %y) { +; SSE-LABEL: bool_reduction_v8f32: +; SSE: # %bb.0: +; SSE-NEXT: cmpneqps %xmm3, %xmm1 +; SSE-NEXT: cmpneqps %xmm2, %xmm0 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: pextrb $0, %xmm1, %eax +; SSE-NEXT: # kill: def $al killed $al killed $eax +; SSE-NEXT: retq +; +; AVX-LABEL: bool_reduction_v8f32: +; AVX: # %bb.0: +; AVX-NEXT: vcmpneqps %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpextrb $0, %xmm0, %eax +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX512-LABEL: bool_reduction_v8f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vcmpneqps %ymm1, %ymm0, %k1 +; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z} +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512-NEXT: vpslld $31, %ymm1, %ymm1 +; AVX512-NEXT: 
vptestmd %ymm1, %ymm1, %k0 +; AVX512-NEXT: korw %k1, %k0, %k1 +; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX512-NEXT: vpslld $31, %ymm0, %ymm0 +; AVX512-NEXT: vptestmd %ymm0, %ymm0, %k0 +; AVX512-NEXT: korw %k1, %k0, %k1 +; AVX512-NEXT: korw %k1, %k0, %k0 +; AVX512-NEXT: kmovd %k0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %a = fcmp une <8 x float> %x, %y + %s1 = shufflevector <8 x i1> %a, <8 x i1> undef, <8 x i32> + %b = or <8 x i1> %s1, %a + %s2 = shufflevector <8 x i1> %b, <8 x i1> undef, <8 x i32> + %c = or <8 x i1> %s2, %b + %s3 = shufflevector <8 x i1> %c, <8 x i1> undef, <8 x i32> + %d = or <8 x i1> %s2, %c + %e = extractelement <8 x i1> %d, i32 0 + ret i1 %e +} + +define i1 @bool_reduction_v2i64(<2 x i64> %x, <2 x i64> %y) { +; SSE-LABEL: bool_reduction_v2i64: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; SSE-NEXT: pxor %xmm2, %xmm1 +; SSE-NEXT: pxor %xmm2, %xmm0 +; SSE-NEXT: pcmpgtq %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: pextrb $0, %xmm1, %eax +; SSE-NEXT: # kill: def $al killed $al killed $eax +; SSE-NEXT: retq +; +; AVX-LABEL: bool_reduction_v2i64: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpextrb $0, %xmm0, %eax +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: retq +; +; AVX512-LABEL: bool_reduction_v2i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpcmpnleuq %xmm1, %xmm0, %k1 +; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX512-NEXT: vpsllq $63, %xmm0, %xmm0 +; AVX512-NEXT: vptestmq %xmm0, %xmm0, %k0 +; AVX512-NEXT: korw %k0, %k1, %k0 +; AVX512-NEXT: kmovd %k0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: retq + %a = icmp ugt <2 x i64> %x, %y + %b = shufflevector <2 x i1> %a, <2 x i1> undef, <2 x i32> + %c = or <2 x i1> %a, %b + %d = extractelement <2 x i1> %c, i32 0 + ret i1 %d +} + +define i1 @bool_reduction_v4i32(<4 x i32> %x, <4 x i32> %y) { +; SSE-LABEL: bool_reduction_v4i32: +; SSE: # %bb.0: +; SSE-NEXT: pcmpeqd %xmm1, %xmm0 +; SSE-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE-NEXT: pxor %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: pextrb $0, %xmm1, %eax +; SSE-NEXT: # kill: def $al killed $al killed $eax +; SSE-NEXT: retq +; +; AVX-LABEL: bool_reduction_v4i32: +; AVX: # %bb.0: +; AVX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpextrb $0, %xmm0, %eax +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: retq +; +; AVX512-LABEL: bool_reduction_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpcmpneqd %xmm1, %xmm0, %k1 +; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} {z} +; 
AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX512-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX512-NEXT: vptestmd %xmm1, %xmm1, %k0 +; AVX512-NEXT: korw %k1, %k0, %k1 +; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX512-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k0 +; AVX512-NEXT: korw %k1, %k0, %k0 +; AVX512-NEXT: kmovd %k0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: retq + %a = icmp ne <4 x i32> %x, %y + %s1 = shufflevector <4 x i1> %a, <4 x i1> undef, <4 x i32> + %b = or <4 x i1> %s1, %a + %s2 = shufflevector <4 x i1> %b, <4 x i1> undef, <4 x i32> + %c = or <4 x i1> %s2, %b + %d = extractelement <4 x i1> %c, i32 0 + ret i1 %d +} + +define i1 @bool_reduction_v8i16(<8 x i16> %x, <8 x i16> %y) { +; SSE-LABEL: bool_reduction_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: pcmpgtw %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pextrb $0, %xmm0, %eax +; SSE-NEXT: # kill: def $al killed $al killed $eax +; SSE-NEXT: retq +; +; AVX-LABEL: bool_reduction_v8i16: +; AVX: # %bb.0: +; AVX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpextrb $0, %xmm0, %eax +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: retq +; +; AVX512-LABEL: bool_reduction_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpcmpgtw %xmm0, %xmm1, %k1 +; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z} +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512-NEXT: vpslld $31, %ymm1, %ymm1 +; AVX512-NEXT: vptestmd %ymm1, %ymm1, %k0 +; AVX512-NEXT: korw %k1, %k0, %k1 +; AVX512-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z} +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX512-NEXT: vpslld $31, %ymm1, %ymm1 +; AVX512-NEXT: vptestmd %ymm1, %ymm1, %k0 +; AVX512-NEXT: korw %k1, %k0, %k1 +; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX512-NEXT: vpslld $31, %ymm0, %ymm0 +; AVX512-NEXT: vptestmd %ymm0, %ymm0, %k0 +; AVX512-NEXT: korw %k1, %k0, %k0 +; AVX512-NEXT: kmovd %k0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %a = icmp slt <8 x i16> %x, %y + %s1 = shufflevector <8 x i1> %a, <8 x i1> undef, <8 x i32> + %b = or <8 x i1> %s1, %a + %s2 = shufflevector <8 x i1> %b, <8 x i1> undef, <8 x i32> + %c = or <8 x i1> %s2, %b + %s3 = shufflevector <8 x i1> %c, <8 x i1> undef, <8 x i32> + %d = or <8 x i1> %s3, %c + %e = extractelement <8 x i1> %d, i32 0 + ret i1 %e +} + +define i1 @bool_reduction_v16i8(<16 x i8> %x, <16 x i8> %y) { +; SSE-LABEL: bool_reduction_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: pcmpgtb %xmm1, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %ecx +; SSE-NEXT: xorl %eax, %eax +; SSE-NEXT: cmpl %ecx, %eax +; SSE-NEXT: sbbl %eax, %eax +; SSE-NEXT: # kill: def $al killed $al killed $eax +; SSE-NEXT: retq +; +; AVX-LABEL: bool_reduction_v16i8: +; AVX: # %bb.0: +; AVX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpmovmskb %xmm0, %ecx +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: cmpl 
%ecx, %eax +; AVX-NEXT: sbbl %eax, %eax +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: retq +; +; AVX512-LABEL: bool_reduction_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 +; AVX512-NEXT: kshiftrw $8, %k0, %k1 +; AVX512-NEXT: korw %k0, %k1, %k0 +; AVX512-NEXT: kshiftrw $4, %k0, %k1 +; AVX512-NEXT: korw %k0, %k1, %k0 +; AVX512-NEXT: kshiftrw $2, %k0, %k1 +; AVX512-NEXT: korw %k0, %k1, %k0 +; AVX512-NEXT: kshiftrw $1, %k0, %k1 +; AVX512-NEXT: korw %k0, %k1, %k0 +; AVX512-NEXT: kmovd %k0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: retq + %a = icmp sgt <16 x i8> %x, %y + %s1 = shufflevector <16 x i1> %a, <16 x i1> undef, <16 x i32> + %b = or <16 x i1> %s1, %a + %s2 = shufflevector <16 x i1> %b, <16 x i1> undef, <16 x i32> + %c = or <16 x i1> %s2, %b + %s3 = shufflevector <16 x i1> %c, <16 x i1> undef, <16 x i32> + %d = or <16 x i1> %s3, %c + %s4 = shufflevector <16 x i1> %d, <16 x i1> undef, <16 x i32> + %e = or <16 x i1> %s4, %d + %f = extractelement <16 x i1> %e, i32 0 + ret i1 %f +} + +define i1 @bool_reduction_v4i64(<4 x i64> %x, <4 x i64> %y) { +; SSE-LABEL: bool_reduction_v4i64: +; SSE: # %bb.0: +; SSE-NEXT: pcmpgtq %xmm1, %xmm3 +; SSE-NEXT: pcmpgtq %xmm0, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: pextrb $0, %xmm1, %eax +; SSE-NEXT: # kill: def $al killed $al killed $eax +; SSE-NEXT: retq +; +; AVX1-LABEL: bool_reduction_v4i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm1 +; AVX1-NEXT: vpackssdw %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpextrb $0, %xmm0, %eax +; AVX1-NEXT: # kill: def $al killed $al killed $eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: bool_reduction_v4i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpackssdw %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpextrb $0, %xmm0, %eax +; AVX2-NEXT: # kill: def $al killed $al killed $eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: bool_reduction_v4i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpcmpgtq %ymm0, %ymm1, %k1 +; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} {z} +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX512-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX512-NEXT: vptestmd %xmm1, %xmm1, %k0 +; AVX512-NEXT: korw %k1, %k0, %k1 +; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX512-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k0 +; AVX512-NEXT: korw %k1, %k0, %k0 +; AVX512-NEXT: kmovd %k0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %a = icmp slt <4 x i64> %x, %y + %s1 = shufflevector <4 x i1> %a, <4 x i1> undef, <4 x i32> + %b = or <4 x i1> %s1, %a + %s2 = shufflevector <4 x i1> %b, <4 x 
i1> undef, <4 x i32> + %c = or <4 x i1> %s2, %b + %d = extractelement <4 x i1> %c, i32 0 + ret i1 %d +} + +define i1 @bool_reduction_v8i32(<8 x i32> %x, <8 x i32> %y) { +; SSE-LABEL: bool_reduction_v8i32: +; SSE: # %bb.0: +; SSE-NEXT: pminud %xmm1, %xmm3 +; SSE-NEXT: pcmpeqd %xmm1, %xmm3 +; SSE-NEXT: pminud %xmm0, %xmm2 +; SSE-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE-NEXT: packssdw %xmm3, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pextrb $0, %xmm0, %eax +; SSE-NEXT: # kill: def $al killed $al killed $eax +; SSE-NEXT: retq +; +; AVX1-LABEL: bool_reduction_v8i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpminud %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpextrb $0, %xmm0, %eax +; AVX1-NEXT: # kill: def $al killed $al killed $eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: bool_reduction_v8i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpextrb $0, %xmm0, %eax +; AVX2-NEXT: # kill: def $al killed $al killed $eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: bool_reduction_v8i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpcmpleud %ymm1, %ymm0, %k1 +; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z} +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512-NEXT: vpslld $31, %ymm1, %ymm1 +; AVX512-NEXT: vptestmd %ymm1, %ymm1, %k0 +; AVX512-NEXT: korw %k1, %k0, %k1 +; AVX512-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z} +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX512-NEXT: vpslld $31, %ymm1, %ymm1 +; AVX512-NEXT: vptestmd %ymm1, %ymm1, %k0 +; AVX512-NEXT: korw %k1, %k0, %k1 +; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX512-NEXT: vpslld $31, %ymm0, %ymm0 +; AVX512-NEXT: vptestmd %ymm0, %ymm0, %k0 +; AVX512-NEXT: korw %k1, %k0, %k0 +; AVX512-NEXT: kmovd %k0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %a = icmp ule <8 x i32> %x, %y + %s1 = shufflevector <8 x i1> %a, <8 x i1> undef, <8 x i32> + %b = or <8 x i1> %s1, %a + %s2 = shufflevector <8 x i1> %b, <8 x i1> undef, <8 x i32> + %c = or <8 x i1> %s2, %b + %s3 = shufflevector <8 x i1> %c, <8 x i1> undef, <8 x i32> + %d = or <8 x i1> %s3, %c + %e = extractelement <8 x i1> %d, i32 0 + ret i1 %e +} + +define i1 @bool_reduction_v16i16(<16 x i16> %x, <16 x i16> %y) { +; SSE-LABEL: bool_reduction_v16i16: +; SSE: # %bb.0: +; 
SSE-NEXT: pcmpeqw %xmm3, %xmm1 +; SSE-NEXT: pcmpeqw %xmm2, %xmm0 +; SSE-NEXT: packsswb %xmm1, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %ecx +; SSE-NEXT: xorl %eax, %eax +; SSE-NEXT: cmpl %ecx, %eax +; SSE-NEXT: sbbl %eax, %eax +; SSE-NEXT: # kill: def $al killed $al killed $eax +; SSE-NEXT: retq +; +; AVX1-LABEL: bool_reduction_v16i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpcmpeqw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %ecx +; AVX1-NEXT: xorl %eax, %eax +; AVX1-NEXT: cmpl %ecx, %eax +; AVX1-NEXT: sbbl %eax, %eax +; AVX1-NEXT: # kill: def $al killed $al killed $eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: bool_reduction_v16i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmovmskb %xmm0, %ecx +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: cmpl %ecx, %eax +; AVX2-NEXT: sbbl %eax, %eax +; AVX2-NEXT: # kill: def $al killed $al killed $eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: bool_reduction_v16i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 +; AVX512-NEXT: kshiftrw $8, %k0, %k1 +; AVX512-NEXT: korw %k0, %k1, %k0 +; AVX512-NEXT: kshiftrw $4, %k0, %k1 +; AVX512-NEXT: korw %k0, %k1, %k0 +; AVX512-NEXT: kshiftrw $2, %k0, %k1 +; AVX512-NEXT: korw %k0, %k1, %k0 +; AVX512-NEXT: kshiftrw $1, %k0, %k1 +; AVX512-NEXT: korw %k0, %k1, %k0 +; AVX512-NEXT: kmovd %k0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %a = icmp eq <16 x i16> %x, %y + %s1 = shufflevector <16 x i1> %a, <16 x i1> undef, <16 x i32> + %b = or <16 x i1> %s1, %a + %s2 = shufflevector <16 x i1> %b, <16 x i1> undef, <16 x i32> + %c = or <16 x i1> %s2, %b + %s3 = shufflevector <16 x i1> %c, <16 x i1> undef, <16 x i32> + %d = or <16 x i1> %s3, %c + %s4 = shufflevector <16 x i1> %d, <16 x i1> undef, <16 x i32> + %e = or <16 x i1> %s4, %d + %f = extractelement <16 x i1> %e, i32 0 + ret i1 %f +} + +define i1 @bool_reduction_v32i8(<32 x i8> %x, <32 x i8> %y) { +; SSE-LABEL: bool_reduction_v32i8: +; SSE: # %bb.0: +; SSE-NEXT: pcmpeqb %xmm2, %xmm0 +; SSE-NEXT: pcmpeqb %xmm3, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: pmovmskb %xmm1, %ecx +; SSE-NEXT: xorl %eax, %eax +; SSE-NEXT: cmpl %ecx, %eax +; SSE-NEXT: sbbl %eax, %eax +; SSE-NEXT: # kill: def $al killed $al killed $eax +; SSE-NEXT: retq +; +; AVX1-LABEL: bool_reduction_v32i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpextrb $0, %xmm0, %eax +; AVX1-NEXT: # kill: def $al killed $al killed $eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: bool_reduction_v32i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpmovmskb %ymm0, %ecx +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: cmpl %ecx, %eax +; 
AVX2-NEXT: sbbl %eax, %eax
+; AVX2-NEXT: # kill: def $al killed $al killed $eax
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: bool_reduction_v32i8:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpcmpeqb %ymm1, %ymm0, %k0
+; AVX512-NEXT: kshiftrd $16, %k0, %k1
+; AVX512-NEXT: kord %k0, %k1, %k0
+; AVX512-NEXT: kshiftrd $8, %k0, %k1
+; AVX512-NEXT: kord %k0, %k1, %k0
+; AVX512-NEXT: kshiftrd $4, %k0, %k1
+; AVX512-NEXT: kord %k0, %k1, %k0
+; AVX512-NEXT: kshiftrd $2, %k0, %k1
+; AVX512-NEXT: kord %k0, %k1, %k0
+; AVX512-NEXT: kshiftrd $1, %k0, %k1
+; AVX512-NEXT: kord %k0, %k1, %k0
+; AVX512-NEXT: kmovd %k0, %eax
+; AVX512-NEXT: # kill: def $al killed $al killed $eax
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+  %a = icmp eq <32 x i8> %x, %y
+  %s1 = shufflevector <32 x i1> %a, <32 x i1> undef, <32 x i32>
+  %b = or <32 x i1> %s1, %a
+  %s2 = shufflevector <32 x i1> %b, <32 x i1> undef, <32 x i32>
+  %c = or <32 x i1> %s2, %b
+  %s3 = shufflevector <32 x i1> %c, <32 x i1> undef, <32 x i32>
+  %d = or <32 x i1> %s3, %c
+  %s4 = shufflevector <32 x i1> %d, <32 x i1> undef, <32 x i32>
+  %e = or <32 x i1> %s4, %d
+  %s5 = shufflevector <32 x i1> %e, <32 x i1> undef, <32 x i32>
+  %f = or <32 x i1> %s5, %e
+  %g = extractelement <32 x i1> %f, i32 0
+  ret i1 %g
+}
diff --git a/test/CodeGen/X86/vector-shift-ashr-sub128-widen.ll b/test/CodeGen/X86/vector-shift-ashr-sub128-widen.ll
index e910c9c74dac..cd7902ea98d2 100644
--- a/test/CodeGen/X86/vector-shift-ashr-sub128-widen.ll
+++ b/test/CodeGen/X86/vector-shift-ashr-sub128-widen.ll
@@ -1806,44 +1806,20 @@ define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind {
 ;
 ; SSE41-LABEL: constant_shift_v4i16:
 ; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: pxor %xmm0, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <0,1,2,3,u,u,u,u>
-; SSE41-NEXT: pcmpeqw %xmm2, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm3 =
-; SSE41-NEXT: pmulhw %xmm1, %xmm3
-; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm3
-; SSE41-NEXT: pcmpeqw {{.*}}(%rip), %xmm2
-; SSE41-NEXT: psraw $1, %xmm1
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 =
+; SSE41-NEXT: pmulhw %xmm0, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; SSE41-NEXT: psraw $1, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7]
 ; SSE41-NEXT: retq
 ;
-; AVX1-LABEL: constant_shift_v4i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [4.1720559249406128E-309,4.1720559249406128E-309]
-; AVX1-NEXT: # xmm2 = mem[0,0]
-; AVX1-NEXT: vpcmpeqw %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm3
-; AVX1-NEXT: vpblendvb %xmm1, %xmm0, %xmm3, %xmm1
-; AVX1-NEXT: vpcmpeqw {{.*}}(%rip), %xmm2, %xmm2
-; AVX1-NEXT: vpsraw $1, %xmm0, %xmm0
-; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: constant_shift_v4i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [844433520132096,844433520132096]
-; AVX2-NEXT: vpcmpeqw %xmm1, %xmm2, %xmm1
-; AVX2-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm3
-; AVX2-NEXT: vpblendvb %xmm1, %xmm0, %xmm3, %xmm1
-; AVX2-NEXT: vpcmpeqw {{.*}}(%rip), %xmm2, %xmm2
-; AVX2-NEXT: vpsraw $1, %xmm0, %xmm0
-; AVX2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: retq
+; AVX-LABEL: constant_shift_v4i16:
+; AVX: # %bb.0:
+; AVX-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; AVX-NEXT: vpsraw $1, %xmm0, %xmm0
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7]
+; AVX-NEXT: retq
 ;
 ; XOP-LABEL: constant_shift_v4i16:
 ; XOP: # %bb.0:
diff --git a/test/CodeGen/X86/vector-shift-lshr-sub128-widen.ll b/test/CodeGen/X86/vector-shift-lshr-sub128-widen.ll
index 7a59f80f55e7..79f8cc6a05fe 100644
--- a/test/CodeGen/X86/vector-shift-lshr-sub128-widen.ll
+++ b/test/CodeGen/X86/vector-shift-lshr-sub128-widen.ll
@@ -1500,32 +1500,25 @@ define <2 x i32> @constant_shift_v2i32(<2 x i32> %a) nounwind {
 define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind {
 ; SSE2-LABEL: constant_shift_v4i16:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 =
-; SSE2-NEXT: pmulhuw %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pcmpeqw {{.*}}(%rip), %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: pmulhuw {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm0
 ; SSE2-NEXT: por %xmm2, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; SSE41-LABEL: constant_shift_v4i16:
 ; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: pxor %xmm0, %xmm0
-; SSE41-NEXT: pcmpeqw {{.*}}(%rip), %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 =
-; SSE41-NEXT: pmulhuw %xmm1, %xmm2
-; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 =
+; SSE41-NEXT: pmulhuw %xmm0, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
 ; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: constant_shift_v4i16:
 ; AVX: # %bb.0:
-; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpcmpeqw {{.*}}(%rip), %xmm1, %xmm1
-; AVX-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm2
-; AVX-NEXT: vpblendvb %xmm1, %xmm0, %xmm2, %xmm0
+; AVX-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
 ; AVX-NEXT: retq
 ;
 ; XOP-LABEL: constant_shift_v4i16:
@@ -1535,10 +1528,8 @@ define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind {
 ;
 ; AVX512DQ-LABEL: constant_shift_v4i16:
 ; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512DQ-NEXT: vpcmpeqw {{.*}}(%rip), %xmm1, %xmm1
-; AVX512DQ-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm2
-; AVX512DQ-NEXT: vpblendvb %xmm1, %xmm0, %xmm2, %xmm0
+; AVX512DQ-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
 ; AVX512DQ-NEXT: retq
 ;
 ; AVX512BW-LABEL: constant_shift_v4i16:
@@ -1552,10 +1543,8 @@ define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind {
 ;
 ; AVX512DQVL-LABEL: constant_shift_v4i16:
 ; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512DQVL-NEXT: vpcmpeqw {{.*}}(%rip), %xmm1, %xmm1
-; AVX512DQVL-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm2
-; AVX512DQVL-NEXT: vpblendvb %xmm1, %xmm0, %xmm2, %xmm0
+; AVX512DQVL-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1
+; AVX512DQVL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
 ; AVX512DQVL-NEXT: retq
 ;
 ; AVX512BWVL-LABEL: constant_shift_v4i16:
@@ -1565,12 +1554,11 @@ define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind {
 ;
 ; X32-SSE-LABEL: constant_shift_v4i16:
 ; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 =
-; X32-SSE-NEXT: pmulhuw %xmm0, %xmm1
-; X32-SSE-NEXT: pxor %xmm2, %xmm2
-; X32-SSE-NEXT: pcmpeqw {{\.LCPI.*}}, %xmm2
-; X32-SSE-NEXT: pand %xmm2, %xmm0
-; X32-SSE-NEXT: pandn %xmm1, %xmm2
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
+; X32-SSE-NEXT: movdqa %xmm1, %xmm2
+; X32-SSE-NEXT: pandn %xmm0, %xmm2
+; X32-SSE-NEXT: pmulhuw {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: pand %xmm1, %xmm0
 ; X32-SSE-NEXT: por %xmm2, %xmm0
 ; X32-SSE-NEXT: retl
   %shift = lshr <4 x i16> %a,
diff --git a/test/CodeGen/X86/vector-zext.ll b/test/CodeGen/X86/vector-zext.ll
index c21fc6a73a88..a11c9f6cb5d3 100644
--- a/test/CodeGen/X86/vector-zext.ll
+++ b/test/CodeGen/X86/vector-zext.ll
@@ -2614,8 +2614,64 @@ define <4 x i64> @splatshuf_zext_v4i64(<4 x i32> %x) {
   ret <4 x i64> %ext
 }
 
-define <8 x i32> @splatshuf_zext_v8i32(<8 x i16> %x) {
-; SSE2-LABEL: splatshuf_zext_v8i32:
+define <8 x i32> @splatshuf_zext_v8i32_matching_undefs(<8 x i16> %x) {
+; SSE2-LABEL: splatshuf_zext_v8i32_matching_undefs:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,0,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,1,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,5,7]
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: splatshuf_zext_v8i32_matching_undefs:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: movdqa %xmm0, %xmm1
+; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,6,7,6,7,14,15,0,1,6,7,6,7,14,15]
+; SSSE3-NEXT: pxor %xmm2, %xmm2
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: splatshuf_zext_v8i32_matching_undefs:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,6,7,6,7,14,15,0,1,6,7,6,7,14,15]
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: splatshuf_zext_v8i32_matching_undefs:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,6,7,14,15,0,1,6,7,6,7,14,15]
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splatshuf_zext_v8i32_matching_undefs:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,6,7,14,15,0,1,6,7,6,7,14,15]
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: splatshuf_zext_v8i32_matching_undefs:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,6,7,14,15,0,1,6,7,6,7,14,15]
+; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512-NEXT: retq
+  %shuf = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32>
+  %ext = zext <8 x i16> %shuf to <8 x i32>
+  ret <8 x i32> %ext
+}
+
+define <8 x i32> @splatshuf_zext_v8i32_unmatched_undef(<8 x i16> %x) {
+; SSE2-LABEL: splatshuf_zext_v8i32_unmatched_undef:
 ; SSE2: # %bb.0:
 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,5,7]
@@ -2626,7 +2682,7 @@ define <8 x i32> @splatshuf_zext_v8i32(<8 x i16> %x) {
 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
 ; SSE2-NEXT: retq
 ;
-; SSSE3-LABEL: splatshuf_zext_v8i32:
+; SSSE3-LABEL: splatshuf_zext_v8i32_unmatched_undef:
 ; SSSE3: # %bb.0:
 ; SSSE3-NEXT: movdqa %xmm0, %xmm1
 ; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,6,7,14,15,0,1,6,7,6,7,14,15]
@@ -2636,7 +2692,7 @@ define <8 x i32> @splatshuf_zext_v8i32(<8 x i16> %x) {
 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
 ; SSSE3-NEXT: retq
 ;
-; SSE41-LABEL: splatshuf_zext_v8i32:
+; SSE41-LABEL: splatshuf_zext_v8i32_unmatched_undef:
 ; SSE41: # %bb.0:
 ; SSE41-NEXT: movdqa %xmm0, %xmm1
 ; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,6,7,14,15,0,1,6,7,6,7,14,15]
@@ -2645,7 +2701,7 @@ define <8 x i32> @splatshuf_zext_v8i32(<8 x i16> %x) {
 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
 ; SSE41-NEXT: retq
 ;
-; AVX1-LABEL: splatshuf_zext_v8i32:
+; AVX1-LABEL: splatshuf_zext_v8i32_unmatched_undef:
 ; AVX1: # %bb.0:
 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,14,15,0,1,6,7,6,7,14,15]
 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
@@ -2654,13 +2710,13 @@ define <8 x i32> @splatshuf_zext_v8i32(<8 x i16> %x) {
 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT: retq
 ;
-; AVX2-LABEL: splatshuf_zext_v8i32:
+; AVX2-LABEL: splatshuf_zext_v8i32_unmatched_undef:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,14,15,0,1,6,7,6,7,14,15]
 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX2-NEXT: retq
 ;
-; AVX512-LABEL: splatshuf_zext_v8i32:
+; AVX512-LABEL: splatshuf_zext_v8i32_unmatched_undef:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,14,15,0,1,6,7,6,7,14,15]
 ; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
diff --git a/test/MC/AsmParser/unfinished-op.s b/test/MC/AsmParser/unfinished-op.s
new file mode 100644
index 000000000000..079e036b160d
--- /dev/null
+++ b/test/MC/AsmParser/unfinished-op.s
@@ -0,0 +1,4 @@
+# RUN: not llvm-mc -triple i386-unknown-unknown %s 2>&1 > /dev/null| FileCheck %s --check-prefix=CHECK-ERROR
+
+#CHECK-ERROR: error: instruction must have size higher than 0
+ .byte 64;""
diff --git a/test/MC/X86/x86_operands.s b/test/MC/X86/x86_operands.s
index 3aa1b8d7fb8f..70b11faf003e 100644
--- a/test/MC/X86/x86_operands.s
+++ b/test/MC/X86/x86_operands.s
@@ -9,7 +9,7 @@
   addl $a, %eax
 # CHECK: addl $3, %eax
   addl $1 + 2, %eax
-
+
 # Disambiguation
 
 # CHECK: addl $1, 8
@@ -26,7 +26,7 @@
   addl $1, (%eax)
 # CHECK: addl $1, 8(,%eax)
   addl $1, (4+4)(,%eax)
-
+
 # Indirect Memory Operands
 # CHECK: addl $1, 1(%eax)
   addl $1, 1(%eax)
diff --git a/test/Transforms/SLPVectorizer/X86/PR39774.ll b/test/Transforms/SLPVectorizer/X86/PR39774.ll
index ae4a6b88bd31..24f75b32c5d2 100644
--- a/test/Transforms/SLPVectorizer/X86/PR39774.ll
+++ b/test/Transforms/SLPVectorizer/X86/PR39774.ll
@@ -81,9 +81,9 @@ define void @Test(i32) {
 ; CHECK-NEXT: [[OP_EXTRA30:%.*]] = and i32 [[OP_EXTRA29]], [[TMP0]]
 ; CHECK-NEXT: [[VAL_42:%.*]] = and i32 [[VAL_40]], undef
 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> undef, i32 [[OP_EXTRA30]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP2]], i32 1
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 14910, i32 1
 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> [[TMP7]], i32 14910, i32 1
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP2]], i32 1
 ; CHECK-NEXT: [[TMP9:%.*]] = and <2 x i32> [[TMP6]], [[TMP8]]
 ; CHECK-NEXT: [[TMP10:%.*]] = add <2 x i32> [[TMP6]], [[TMP8]]
 ; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32>