diff --git a/CMakeLists.txt b/CMakeLists.txt index ec2744d..e8a8bc9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,61 +1,75 @@ cmake_minimum_required(VERSION 3.21) project(InstrumentAMDGPUKernel LANGUAGES C CXX) - -set(LLVM_INSTALL_DIR "" CACHE PATH "LLVM installation directory") - -set(LLVM_INCLUDE_DIR "${LLVM_INSTALL_DIR}/include/llvm") -if(NOT EXISTS "${LLVM_INCLUDE_DIR}") -message(FATAL_ERROR - " LLVM_INSTALL_DIR (${LLVM_INCLUDE_DIR}) is invalid.") -endif() - -# Check that the LLVMConfig.cmake file exists -set(- FALSE) - -# Ubuntu + Darwin -if(EXISTS "${LLVM_INSTALL_DIR}/lib/cmake/llvm/LLVMConfig.cmake") - set(VALID_INSTALLATION TRUE) +if(DEFINED ENV{ROCM_PATH}) + set(ROCM_PATH $ENV{ROCM_PATH}) +else() + set(ROCM_PATH "/opt/rocm") endif() +message("ROCM_PATH: ${ROCM_PATH}") -# Fedora -if(EXISTS "${LLVM_INSTALL_DIR}/lib64/cmake/llvm/LLVMConfig.cmake") - set(VALID_INSTALLATION TRUE) -endif() +# Define LLVM versions +set(TRITON_LLVM "" CACHE PATH "Path to the Triton LLVM directory") +set(ROCM_LLVM "${ROCM_PATH}/llvm" CACHE PATH "Path to the ROCm LLVM directory") -if(NOT ${VALID_INSTALLATION}) +# Function to find and configure LLVM +function(find_and_configure_llvm llvm_install_dir llvm_version) + # Confirm that the LLVM installation directory exists + set(LLVM_INCLUDE_DIR "${llvm_install_dir}/include/llvm") + if(NOT EXISTS "${LLVM_INCLUDE_DIR}") message(FATAL_ERROR - "LLVM installation directory, (${LLVM_INSTALL_DIR}), is invalid. Couldn't + " llvm_install_dir (${LLVM_INCLUDE_DIR}) is invalid.") + endif() + + # Check that the LLVMConfig.cmake file exists + set(- FALSE) + + # Ubuntu + Darwin + if(EXISTS "${llvm_install_dir}/lib/cmake/llvm/LLVMConfig.cmake") + set(VALID_INSTALLATION TRUE) + endif() + + # Fedora + if(EXISTS "${llvm_install_dir}/lib64/cmake/llvm/LLVMConfig.cmake") + set(VALID_INSTALLATION TRUE) + endif() + + if(NOT ${VALID_INSTALLATION}) + message(FATAL_ERROR + "LLVM installation directory, (${llvm_install_dir}), is invalid. Couldn't find LLVMConfig.cmake.") -endif() + endif() -list(APPEND CMAKE_PREFIX_PATH "${LLVM_INSTALL_DIR}/lib/cmake/llvm/") + include("${llvm_install_dir}/lib/cmake/llvm/LLVMConfig.cmake") + #include("${llvm_install_dir}/lib/cmake/llvm/LLVM-Config.cmake") -find_package(LLVM CONFIG) + if("${LLVM_VERSION_MAJOR}" VERSION_LESS 17) + message(FATAL_ERROR "Found LLVM ${LLVM_VERSION_MAJOR}, but need LLVM 17 or above") + endif() -if("${LLVM_VERSION_MAJOR}" VERSION_LESS 17) - message(FATAL_ERROR "Found LLVM ${LLVM_VERSION_MAJOR}, but need LLVM 17 or above") -endif() + message(STATUS "Found LLVM ${LLVM_PACKAGE_VERSION} in ${llvm_install_dir}") + message(STATUS "Using LLVMConfig.cmake in: ${llvm_install_dir}") -message(STATUS "Found LLVM ${LLVM_PACKAGE_VERSION}") -message(STATUS "Using LLVMConfig.cmake in: ${LLVM_INSTALL_DIR}") + message("${llvm_version} LLVM STATUS: + Definitions ${LLVM_DEFINITIONS} + Includes ${LLVM_INCLUDE_DIRS} + Libraries ${LLVM_LIBRARY_DIRS} + Targets ${LLVM_TARGETS_TO_BUILD}" + ) -message("LLVM STATUS: - Definitions ${LLVM_DEFINITIONS} - Includes ${LLVM_INCLUDE_DIRS} - Libraries ${LLVM_LIBRARY_DIRS} - Targets ${LLVM_TARGETS_TO_BUILD}" -) + set(LLVM_INCLUDE_DIRS_${llvm_version} ${LLVM_INCLUDE_DIRS} PARENT_SCOPE) + set(LLVM_LIBRARY_DIRS_${llvm_version} ${LLVM_LIBRARY_DIRS} PARENT_SCOPE) + set(LLVM_DEFINITIONS_${llvm_version} ${LLVM_DEFINITIONS} PARENT_SCOPE) -include_directories(SYSTEM ${LLVM_INCLUDE_DIRS}) -link_directories(${LLVM_LIBRARY_DIRS}) -add_definitions(${LLVM_DEFINITIONS}) +endfunction() + +find_and_configure_llvm(${ROCM_LLVM} "ROCM") +find_and_configure_llvm(${TRITON_LLVM} "TRITON") set(CMAKE_CXX_STANDARD 17 CACHE STRING "") if (NOT CMAKE_BUILD_TYPE) - set(CMAKE_BUILD_TYPE Debug CACHE - STRING "Build type (default Debug):" FORCE) + set(CMAKE_BUILD_TYPE Debug CACHE STRING "Build type (default Debug):" FORCE) endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall \ @@ -78,10 +92,9 @@ set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/lib") # Avoid warning about DOWNLOAD_EXTRACT_TIMESTAMP in CMake 3.24: if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0") - cmake_policy(SET CMP0135 NEW) + cmake_policy(SET CMP0135 NEW) endif() - # enable_testing() add_subdirectory(lib) @@ -107,4 +120,3 @@ endif() install(TARGETS ${AMDGCN_INSTRUMENTATION_PLUGINS} LIBRARY DESTINATION lib/${CMAKE_PROJECT_NAME} PUBLIC_HEADER DESTINATION include/${CMAKE_PROJECT_NAME}) - diff --git a/instrumentation/MemTraceInstrumentationKernel.cpp b/instrumentation/MemTraceInstrumentationKernel.cpp index 60d522b..4b2a04e 100644 --- a/instrumentation/MemTraceInstrumentationKernel.cpp +++ b/instrumentation/MemTraceInstrumentationKernel.cpp @@ -10,65 +10,66 @@ __device__ uint32_t getThreadIdInBlock() { return __builtin_amdgcn_workitem_id_x __attribute__((always_inline)) __device__ uint32_t getWaveId() { - return getThreadIdInBlock() / WaveFrontSize; + return getThreadIdInBlock() / WaveFrontSize; } + __attribute__((always_inline)) - __device__ bool isSharedMemPtr(const void *Ptr) { - return __builtin_amdgcn_is_shared( - (const __attribute__((address_space(0))) void *)Ptr); +__device__ bool isSharedMemPtr(const void *Ptr) { + return __builtin_amdgcn_is_shared( + (const __attribute__((address_space(0))) void *)Ptr); } __attribute__((used)) -__device__ void memTrace(void* addressPtr, uint32_t LocationId){ - if(isSharedMemPtr(addressPtr)) - return; - uint64_t address = reinterpret_cast(addressPtr); - //Mask of the active threads in the wave - int activeMask = __builtin_amdgcn_read_exec(); -// //Find first active thread in the wave by finding the position of the least significant bit set to 1 in the activeMask - const int firstActiveLane = __ffs(activeMask) - 1; - uint64_t addrArray[WaveFrontSize]; - for(int i = 0; i < WaveFrontSize; i++){ - addrArray[i] = __shfl(address, i, WaveFrontSize); - } - uint32_t Lane = __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u)); - - if(Lane == firstActiveLane){ - unsigned int hw_id = 0; - uint64_t Time = 0; -#if !defined(__gfx1100__) && !defined(__gfx1101__) - Time = __builtin_amdgcn_s_memrealtime(); - asm volatile("s_getreg_b32 %0, hwreg(HW_REG_HW_ID)" : "=s"(hw_id)); -#endif - char hex_str[]= "0123456789abcdef"; - char out[WaveFrontSize*HexLen + 1]; - (out)[WaveFrontSize*HexLen] = '\0'; - for (size_t i = 0; i < WaveFrontSize; i++) { - (out)[i * HexLen + 0] = '0'; - (out)[i * HexLen + 1] = 'x'; - (out)[i * HexLen + 2] = hex_str[(addrArray[i] >> 44) & 0x0F]; - (out)[i * HexLen + 3] = hex_str[(addrArray[i] >> 40) & 0x0F]; - (out)[i * HexLen + 4] = hex_str[(addrArray[i] >> 36) & 0x0F]; - (out)[i * HexLen + 5] = hex_str[(addrArray[i] >> 32) & 0x0F]; - (out)[i * HexLen + 6] = hex_str[(addrArray[i] >> 28) & 0x0F]; - (out)[i * HexLen + 7] = hex_str[(addrArray[i] >> 24) & 0x0F]; - (out)[i * HexLen + 8] = hex_str[(addrArray[i] >> 20) & 0x0F]; - (out)[i * HexLen + 9] = hex_str[(addrArray[i] >> 16) & 0x0F]; - (out)[i * HexLen + 10] = hex_str[(addrArray[i] >> 12) & 0x0F]; - (out)[i * HexLen + 11] = hex_str[(addrArray[i] >> 8) & 0x0F]; - (out)[i * HexLen + 12] = hex_str[(addrArray[i] >> 4) & 0x0F]; - (out)[i * HexLen + 13] = hex_str[(addrArray[i] ) & 0x0F]; - (out)[i * HexLen + 14] = ','; +__device__ void memTrace(void* addressPtr, uint32_t LocationId) { + printf("Hello! You are inside memTrace()\n"); + if (isSharedMemPtr(addressPtr)) + return; + uint64_t address = reinterpret_cast(addressPtr); + // Mask of the active threads in the wave + int activeMask = __builtin_amdgcn_read_exec(); + // Find first active thread in the wave by finding the position of the least significant bit set to 1 in the activeMask + const int firstActiveLane = __ffs(activeMask) - 1; + uint64_t addrArray[WaveFrontSize]; + for (int i = 0; i < WaveFrontSize; i++) { + addrArray[i] = __shfl(address, i, WaveFrontSize); } - (out)[WaveFrontSize * HexLen - 1] = '\n'; -#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) - unsigned int xcc_id; - asm volatile("s_getreg_b32 %0, hwreg(HW_REG_XCC_ID)" : "=s"(xcc_id)); - printf("%ld,%d,%d,%d,%d,%d,%d, %s", Time, LocationId, (hw_id & 0xf), ((hw_id & 0x30) >> 4), ((hw_id & 0xf00) >> 8), ((hw_id & 0xe000) >> 13), xcc_id, out); -#else - printf("%ld,%d,%d,%d,%d,%d,%s", Time, LocationId, (hw_id & 0xf), ((hw_id & 0x30) >> 4), ((hw_id & 0xf00) >> 8), ((hw_id & 0xe000) >> 13),out); -#endif + uint32_t Lane = __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u)); - } + if (Lane == firstActiveLane) { + unsigned int hw_id = 0; + uint64_t Time = 0; +#if !defined(__gfx1100__) && !defined(__gfx1101__) + Time = __builtin_amdgcn_s_memrealtime(); + asm volatile("s_getreg_b32 %0, hwreg(HW_REG_HW_ID)" : "=s"(hw_id)); +#endif + char hex_str[] = "0123456789abcdef"; + char out[WaveFrontSize * HexLen + 1]; + (out)[WaveFrontSize * HexLen] = '\0'; + for (size_t i = 0; i < WaveFrontSize; i++) { + (out)[i * HexLen + 0] = '0'; + (out)[i * HexLen + 1] = 'x'; + (out)[i * HexLen + 2] = hex_str[(addrArray[i] >> 44) & 0x0F]; + (out)[i * HexLen + 3] = hex_str[(addrArray[i] >> 40) & 0x0F]; + (out)[i * HexLen + 4] = hex_str[(addrArray[i] >> 36) & 0x0F]; + (out)[i * HexLen + 5] = hex_str[(addrArray[i] >> 32) & 0x0F]; + (out)[i * HexLen + 6] = hex_str[(addrArray[i] >> 28) & 0x0F]; + (out)[i * HexLen + 7] = hex_str[(addrArray[i] >> 24) & 0x0F]; + (out)[i * HexLen + 8] = hex_str[(addrArray[i] >> 20) & 0x0F]; + (out)[i * HexLen + 9] = hex_str[(addrArray[i] >> 16) & 0x0F]; + (out)[i * HexLen + 10] = hex_str[(addrArray[i] >> 12) & 0x0F]; + (out)[i * HexLen + 11] = hex_str[(addrArray[i] >> 8) & 0x0F]; + (out)[i * HexLen + 12] = hex_str[(addrArray[i] >> 4) & 0x0F]; + (out)[i * HexLen + 13] = hex_str[(addrArray[i]) & 0x0F]; + (out)[i * HexLen + 14] = ','; + } + (out)[WaveFrontSize * HexLen - 1] = '\n'; +#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) + unsigned int xcc_id; + asm volatile("s_getreg_b32 %0, hwreg(HW_REG_XCC_ID)" : "=s"(xcc_id)); + printf("%ld,%d,%d,%d,%d,%d,%d, %s", Time, LocationId, (hw_id & 0xf), ((hw_id & 0x30) >> 4), ((hw_id & 0xf00) >> 8), ((hw_id & 0xe000) >> 13), xcc_id, out); +#else + printf("Hello!\n"); + printf("%ld,%d,%d,%d,%d,%d,%s", Time, LocationId, (hw_id & 0xf), ((hw_id & 0x30) >> 4), ((hw_id & 0xf00) >> 8), ((hw_id & 0xe000) >> 13), out); +#endif + } } - diff --git a/lib/AMDGCNMemTrace.cpp b/lib/AMDGCNMemTrace.cpp index 3590415..76e1388 100644 --- a/lib/AMDGCNMemTrace.cpp +++ b/lib/AMDGCNMemTrace.cpp @@ -62,11 +62,13 @@ void InjectingInstrumentationFunction(const BasicBlock::iterator &I, const Funct bool AMDGCNMemTrace::runOnModule(Module &M) { + printf("Hello from AMDGCNMemTrace.cpp, runModule()\n"); bool ModifiedCodeGen = false; auto &CTX = M.getContext(); uint32_t LocationCounter = 0; std::string errorMsg; std::unique_ptr InstrumentationModule; + printf("Calling loadInstrumentationFile() with file %s\n", InstrumentationFunctionFile.c_str()); if (!loadInstrumentationFile(InstrumentationFunctionFile, CTX, InstrumentationModule, errorMsg)) { printf("error loading program '%s': %s", InstrumentationFunctionFile.c_str(), errorMsg.c_str()); @@ -94,6 +96,7 @@ bool AMDGCNMemTrace::runOnModule(Module &M) { } PassPluginLibraryInfo getPassPluginInfo() { + printf("Hello from AMDGCNMemTrace.cpp, getPassPluginInfo()\n"); const auto callback = [](PassBuilder &PB) { PB.registerOptimizerLastEPCallback([&](ModulePassManager &MPM, auto&&... args) { MPM.addPass(AMDGCNMemTrace()); diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index 92e1eb2..6de7c58 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -1,10 +1,10 @@ set(AMDGCN_INSTRUMENTATION_PASSES InjectAMDGCNFunction - InjectAMDGCNInlineASM - InjectAMDGCNSharedMemTtrace - AMDGCNNumCacheLines - AMDGCNMemTrace - ) + InjectAMDGCNInlineASM + InjectAMDGCNSharedMemTtrace + AMDGCNNumCacheLines + AMDGCNMemTrace +) set(InjectAMDGCNFunction_SOURCES InjectAMDGCNFunction.cpp) @@ -24,29 +24,53 @@ set(AMDGCNMemTrace_SOURCES set(AMDGCN_INSTRUMENTATION_PLUGINS "") -foreach( plugin ${AMDGCN_INSTRUMENTATION_PASSES} ) - add_library( - ${plugin} - SHARED - ${${plugin}_SOURCES} - ) - - target_include_directories( - ${plugin} - PRIVATE - "${CMAKE_CURRENT_SOURCE_DIR}/../include" - ) - - target_link_libraries( - ${plugin} - PRIVATE - LLVMCore - LLVMIRReader - LLVMLinker - "$<$:-undefined dynamic_lookup>" +function(link_against_llvm llvm_version) + string(TOLOWER ${llvm_version} install_suffix) + foreach( plugin ${AMDGCN_INSTRUMENTATION_PASSES} ) + add_library( + ${plugin}-${install_suffix} + SHARED + ${${plugin}_SOURCES} + ) + + target_include_directories( + ${plugin}-${install_suffix} + PRIVATE + "${CMAKE_CURRENT_SOURCE_DIR}/../include" + ${LLVM_INCLUDE_DIRS_${llvm_version}} ) - list(APPEND AMDGCN_INSTRUMENTATION_PLUGINS ${plugin}) -endforeach() + # target_link_directories( + # ${plugin}-${install_suffix} + # PRIVATE + # ${LLVM_LIBRARY_DIRS_${llvm_version}} + # ) + + # target_link_libraries( + # ${plugin}-${install_suffix} + # PRIVATE + # LLVMCore + # LLVMIRReader + # LLVMLinker + # "$<$:-undefined dynamic_lookup>" + # # ${LLVM_LIBRARY_DIRS_${llvm_version}}/libLLVMCore.a + # # ${LLVM_LIBRARY_DIRS_${llvm_version}}/libLLVMIRReader.a + # # ${LLVM_LIBRARY_DIRS_${llvm_version}}/libLLVMLinker.a + # # ${LLVM_LIBRARY_DIRS_${llvm_version}}/libLLVMSupport.a + # ) + + # set_target_properties( + # ${plugin}-${install_suffix} + # PROPERTIES + # LINK_SEARCH_START_STATIC ON + # LINK_SEARCH_END_STATIC ON + # ) + + list(APPEND AMDGCN_INSTRUMENTATION_PLUGINS ${plugin}-${install_suffix}) + endforeach() +endfunction() + +link_against_llvm("ROCM") +link_against_llvm("TRITON") set(AMDGCN_INSTRUMENTATION_PLUGINS "${AMDGCN_INSTRUMENTATION_PLUGINS}" PARENT_SCOPE)