diff --git a/.github/workflows/sst_integration.yml b/.github/workflows/sst_integration.yml new file mode 100644 index 000000000..03635db64 --- /dev/null +++ b/.github/workflows/sst_integration.yml @@ -0,0 +1,80 @@ +# Workflow with cmake build system +name: SST Integration Test + +# Controls when the workflow will run +on: + # Triggers the workflow on push or pull request events but only for the mydev branch + push: + branches-ignore: + - "gh-readonly-queue**" + pull_request: + + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: + +# A workflow run is made up of one or more jobs that can run sequentially or in parallel +jobs: + build-QV100: + runs-on: ubuntu-latest + defaults: + run: + shell: bash + strategy: + matrix: + # test_type: [simple, medium, long] + test_type: [simple, medium] + container: + image: tgrogers/accel-sim_regress:SST-Integration-Ubuntu-22.04-cuda-11.7-llvm-18.1.8-riscv-gnu-2024.08.06-nightly + env: + CONFIG: QV100 + GPU_ARCH: sm_70 + + # Steps represent a sequence of tasks that will be executed as part of the job + steps: + # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it + - uses: actions/checkout@v4 + - name: Build GPGPU-Sim in SST mode + run: | + source ./setup_environment + make -j4 + - name: Prepare SST dependencies + run: | + apt install -y openmpi-bin openmpi-common libtool libtool-bin autoconf python3 python3-dev automake build-essential git + # Use personal repo for now + - name: Build SST-Core + run: | + git clone https://github.com/William-An/sst-core.git + cd sst-core + git pull + git checkout devel + ./autogen.sh + ./configure --prefix=`realpath ../sstcore-install` --disable-mpi --disable-mem-pools + make -j4 + make install + cd .. + rm -rf ./sst-core + # Use personal repo for now + - name: Build SST-Elements + run: | + git clone https://github.com/William-An/sst-elements.git + source ./setup_environment + cd sst-elements + git pull + git checkout balar-mmio-vanadis-llvm + ./autogen.sh + ./configure --prefix=`realpath ../sstelements-install` --with-sst-core=`realpath ../sstcore-install` --with-cuda=$CUDA_INSTALL_PATH --with-gpgpusim=$GPGPUSIM_ROOT + make -j4 + make install + # Have to resource the gpu app + # Also fake a SDK since rodinia 2.0 does not need this, speed things up on github + - name: Balar Test + run: | + pip install testtools blessings pygments + source ./setup_environment + mkdir 4.2 + mkdir fake_sdk + export NVIDIA_COMPUTE_SDK_LOCATION=$(readlink -f ./fake_sdk) + source $GPUAPPS_ROOT/src/setup_environment sst + rm -rf 4.2 + rm -f gpucomputingsdk_4.2.9_linux.run + ./sstcore-install/bin/sst-test-elements -w "*balar*${{ matrix.test_type }}*" \ No newline at end of file diff --git a/Jenkinsfile b/Jenkinsfile index f6676bf14..4ef467bae 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -87,6 +87,41 @@ pipeline { ssh tgrogers@dynamo.ecn.purdue.edu "cd $PLOTDIR && rm -rf latest && cp -r ${BUILD_NUMBER} latest"' } } + stage('sst-core-build') { + steps { + sh 'rm -rf sstcore-install' + sh 'rm -rf sst-core && git clone git@github.com:sstsimulator/sst-core.git' + sh '''#!/bin/bash + cd sst-core + ./autogen.sh + ./configure --prefix=`realpath ../sstcore-install` --disable-mpi --disable-mem-pools + make -j 10 + make install''' + } + } + stage('sst-elements-build') { + steps { + sh 'rm -rf sstelements-install' + sh 'rm -rf sst-elements && git clone git@github.com:sstsimulator/sst-elements.git' + // First sourcing the env_setup and setup_environment script for env vars + sh '''#!/bin/bash + source ./env-setup/11.0_env_setup.sh + source `pwd`/setup_environment + cd sst-elements + ./autogen.sh + ./configure --prefix=`realpath ../sstelements-install` --with-sst-core=`realpath ../sstcore-install` --with-cuda=$CUDA_INSTALL_PATH --with-gpgpusim=$GPGPUSIM_ROOT + make -j 10 + make install''' + } + } + stage('sst balar test') { + steps { + sh '''#!/bin/bash + source ./env-setup/11.0_env_setup.sh + source `pwd`/setup_environment sst + ./sstcore-install/bin/sst-test-elements -p ./sst-elements/src/sst/elements/balar/tests''' + } + } } post { success { diff --git a/Makefile b/Makefile index 82ea39928..37dba0146 100644 --- a/Makefile +++ b/Makefile @@ -34,6 +34,7 @@ INTERSIM ?= intersim2 include version_detection.mk +# Check for debug ifeq ($(GPGPUSIM_CONFIG), gcc-$(CC_VERSION)/cuda-$(CUDART_VERSION)/debug) export DEBUG=1 else @@ -168,6 +169,7 @@ $(SIM_LIB_DIR)/libcudart.so: makedirs $(LIBS) cudalib if [ ! -f $(SIM_LIB_DIR)/libcudart.so.10.0 ]; then ln -s libcudart.so $(SIM_LIB_DIR)/libcudart.so.10.0; fi if [ ! -f $(SIM_LIB_DIR)/libcudart.so.10.1 ]; then ln -s libcudart.so $(SIM_LIB_DIR)/libcudart.so.10.1; fi if [ ! -f $(SIM_LIB_DIR)/libcudart.so.11.0 ]; then ln -s libcudart.so $(SIM_LIB_DIR)/libcudart.so.11.0; fi + if [ ! -f $(SIM_LIB_DIR)/libcudart_mod.so ]; then ln -s libcudart.so $(SIM_LIB_DIR)/libcudart_mod.so; fi $(SIM_LIB_DIR)/libcudart.dylib: makedirs $(LIBS) cudalib g++ -dynamiclib -Wl,-headerpad_max_install_names,-undefined,dynamic_lookup,-compatibility_version,1.1,-current_version,1.1\ diff --git a/libcuda/cuda_api_object.h b/libcuda/cuda_api_object.h index d292e224e..e620e5728 100644 --- a/libcuda/cuda_api_object.h +++ b/libcuda/cuda_api_object.h @@ -1,6 +1,7 @@ #ifndef __cuda_api_object_h__ #define __cuda_api_object_h__ +#include #include #include #include @@ -193,9 +194,25 @@ class cuda_runtime_api { // backward pointer class gpgpu_context *gpgpu_ctx; // member function list + + // For SST and other potential simulator interface + void cuobjdumpInit(const char *fn); + void extract_code_using_cuobjdump(const char *fn); + void extract_ptx_files_using_cuobjdump(CUctx_st *context, const char *fn); + + // For running GPGPUSim alone void cuobjdumpInit(); void extract_code_using_cuobjdump(); void extract_ptx_files_using_cuobjdump(CUctx_st *context); + + // Internal functions for the above public methods + void cuobjdumpInit_internal(std::function ctx_extract_code_func); + void extract_code_using_cuobjdump_internal( + CUctx_st *context, std::string &app_binary, + std::function ctx_extract_ptx_func); + void extract_ptx_files_using_cuobjdump_internal(CUctx_st *context, + std::string &app_binary); + std::list pruneSectionList(CUctx_st *context); std::list mergeMatchingSections(std::string identifier); std::list mergeSections(); diff --git a/libcuda/cuda_runtime_api.cc b/libcuda/cuda_runtime_api.cc index b540ffd91..5dfd3fc38 100644 --- a/libcuda/cuda_runtime_api.cc +++ b/libcuda/cuda_runtime_api.cc @@ -109,6 +109,7 @@ #include #include #include +#include #include #include #include @@ -151,6 +152,9 @@ #include #endif +// SST cycle +extern bool SST_Cycle(); + /*DEVICE_BUILTIN*/ struct cudaArray { void *devPtr; @@ -412,6 +416,13 @@ void setCuobjdumpsassfilename( //! processes (e.g. cuobjdump) reading /proc//exe will see the emulator //! executable instead of the application binary. //! +// In SST need the string to pass the binary information +// as we cannot get it from /proc/self/exe +std::string get_app_binary(const char *fn) { + printf("self exe links to: %s\n", fn); + return fn; +} + std::string get_app_binary() { char self_exe_path[1025]; #ifdef __APPLE__ @@ -453,19 +464,25 @@ char *get_app_binary_name(std::string abs_path) { return self_exe_path; } -static int get_app_cuda_version() { +static int get_app_cuda_version_internal(std::string app_binary) { int app_cuda_version = 0; char fname[1024]; snprintf(fname, 1024, "_app_cuda_version_XXXXXX"); int fd = mkstemp(fname); close(fd); + // Weili: Add way to extract CUDA version information from Balar Vanadis + // binary (stored as a const string) std::string app_cuda_version_command = - "ldd " + get_app_binary() + + "ldd " + app_binary + " | grep libcudart.so | sed 's/.*libcudart.so.\\(.*\\) =>.*/\\1/' > " + + fname + " && strings " + app_binary + + " | grep libcudart_vanadis.a | sed " + "'s/.*libcudart_vanadis.a.\\(.*\\)/\\1/' >> " + fname; int res = system(app_cuda_version_command.c_str()); if (res == -1) { - printf("Error - Cannot detect the app's CUDA version.\n"); + printf("Error - Cannot detect the app's CUDA version. Command: %s\n", + app_cuda_version_command.c_str()); exit(1); } FILE *cmd = fopen(fname, "r"); @@ -476,12 +493,24 @@ static int get_app_cuda_version() { } fclose(cmd); if (app_cuda_version == 0) { - printf("Error - Cannot detect the app's CUDA version.\n"); + printf("Error - Cannot detect the app's CUDA version. Command: %s\n", + app_cuda_version_command.c_str()); exit(1); } return app_cuda_version; } +static int get_app_cuda_version(const char *fn) { + // Use for other simulator integration + std::string app_binary = get_app_binary(fn); + return get_app_cuda_version_internal(app_binary); +} + +static int get_app_cuda_version() { + std::string app_binary = get_app_binary(); + return get_app_cuda_version_internal(app_binary); +} + //! Keep track of the association between filename and cubin handle void cuda_runtime_api::cuobjdumpRegisterFatBinary(unsigned int handle, const char *filename, @@ -574,8 +603,11 @@ __host__ cudaError_t CUDARTAPI cudaDeviceGetLimitInternal( return g_last_cudaError = cudaSuccess; } -void **cudaRegisterFatBinaryInternal(void *fatCubin, - gpgpu_context *gpgpu_ctx = NULL) { +// Internal implementation for cudaRegisterFatBiaryInternal +void **cudaRegisterFatBiaryInternal_impl( + void *fatCubin, gpgpu_context *gpgpu_ctx, std::string &app_binary_path, + int app_cuda_version, + std::function ctx_cuobjdumpInit_func) { gpgpu_context *ctx; if (gpgpu_ctx) { ctx = gpgpu_ctx; @@ -606,11 +638,9 @@ void **cudaRegisterFatBinaryInternal(void *fatCubin, // compiled with a newer version of CUDA to run apps compiled with older // versions of CUDA. This is especially useful for PTXPLUS execution. // Skip cuda version check for pytorch application - std::string app_binary_path = get_app_binary(); int pos = app_binary_path.find("python"); if (pos == std::string::npos) { // Not pytorch app : checking cuda version - int app_cuda_version = get_app_cuda_version(); assert( app_cuda_version == CUDART_VERSION / 1000 && "The app must be compiled with same major version as the simulator."); @@ -661,7 +691,7 @@ void **cudaRegisterFatBinaryInternal(void *fatCubin, * then for next calls, only returns the appropriate number */ assert(fat_cubin_handle >= 1); - if (fat_cubin_handle == 1) ctx->api->cuobjdumpInit(); + if (fat_cubin_handle == 1) ctx_cuobjdumpInit_func(ctx); ctx->api->cuobjdumpRegisterFatBinary(fat_cubin_handle, filename, context); return (void **)fat_cubin_handle; @@ -753,6 +783,28 @@ void **cudaRegisterFatBinaryInternal(void *fatCubin, #endif } +void **cudaRegisterFatBinaryInternal(const char *fn, void *fatCubin, + gpgpu_context *gpgpu_ctx = NULL) { + std::string app_binary_path = get_app_binary(fn); + int app_cuda_version = get_app_cuda_version(fn); + auto ctx_cuobjdumpInit = [=](gpgpu_context *ctx) { + ctx->api->cuobjdumpInit(fn); + }; + return cudaRegisterFatBiaryInternal_impl(fatCubin, gpgpu_ctx, app_binary_path, + app_cuda_version, ctx_cuobjdumpInit); +} + +void **cudaRegisterFatBinaryInternal(void *fatCubin, + gpgpu_context *gpgpu_ctx = NULL) { + std::string app_binary_path = get_app_binary(); + int app_cuda_version = get_app_cuda_version(); + auto ctx_cuobjdumpInit = [](gpgpu_context *ctx) { + ctx->api->cuobjdumpInit(); + }; + return cudaRegisterFatBiaryInternal_impl(fatCubin, gpgpu_ctx, app_binary_path, + app_cuda_version, ctx_cuobjdumpInit); +} + void cudaRegisterFunctionInternal(void **fatCubinHandle, const char *hostFun, char *deviceFun, const char *deviceName, int thread_limit, uint3 *tid, uint3 *bid, @@ -1057,6 +1109,24 @@ cudaError_t cudaMallocHostInternal(void **ptr, size_t size, } } +// SST malloc done by vanadis, we just need to record the memory addr +cudaError_t CUDARTAPI cudaMallocHostSSTInternal( + void *addr, size_t size, gpgpu_context *gpgpu_ctx = NULL) { + gpgpu_context *ctx; + if (gpgpu_ctx) { + ctx = gpgpu_ctx; + } else { + ctx = GPGPU_Context(); + } + if (g_debug_execution >= 3) { + announce_call(__my_func__); + } + // track pinned memory size allocated in the host so that same amount of + // memory is also allocated in GPU. + ctx->api->pinned_memory_size[addr] = size; + return g_last_cudaError = cudaSuccess; +} + __host__ cudaError_t CUDARTAPI cudaMallocPitchInternal(void **devPtr, size_t *pitch, size_t width, size_t height, gpgpu_context *gpgpu_ctx = NULL) { @@ -2301,13 +2371,77 @@ cudaDeviceSynchronizeInternal(gpgpu_context *gpgpu_ctx = NULL) { * * *******************************************************************************/ -extern "C" { - /******************************************************************************* * * - * * + * SST Specific functions, used by Balar * * * *******************************************************************************/ + +/** + * @brief Custom function to get CUDA function parameter size and offset + * from PTX parsing result + * + * @param hostFun + * @param index + * @return std::tuple + */ +std::tuple SST_cudaGetParamConfig( + uint64_t hostFun, unsigned index) { + if (g_debug_execution >= 3) { + announce_call(__my_func__); + } + CUctx_st *context = GPGPUSim_Context(GPGPU_Context()); + function_info *entry = context->get_kernel((char *)hostFun); + cudaError_t result = cudaSuccess; + size_t size = 0; + unsigned alignment = 0; + if (index >= entry->num_args()) { + result = cudaErrorAssert; + } else { + std::pair p = entry->get_param_config(index); + size = p.first; + alignment = p.second; + } + return std::tuple(result, size, alignment); +} + +extern "C" { +void SST_receive_mem_reply(unsigned core_id, void *mem_req) { + CUctx_st *context = GPGPUSim_Context(GPGPU_Context()); + static_cast(context->get_device()->get_gpgpu()) + ->SST_receive_mem_reply(core_id, mem_req); + // printf("GPGPU-sim: Recived Request\n"); +} + +bool SST_gpu_core_cycle() { return SST_Cycle(); } + +void SST_gpgpusim_numcores_equal_check(unsigned sst_numcores) { + CUctx_st *context = GPGPUSim_Context(GPGPU_Context()); + static_cast(context->get_device()->get_gpgpu()) + ->SST_gpgpusim_numcores_equal_check(sst_numcores); +} + +uint64_t cudaMallocSST(void **devPtr, size_t size) { + if (g_debug_execution >= 3) { + announce_call(__my_func__); + } + void *test_malloc; + test_malloc = (void *)malloc(size); + void **test_malloc2 = &test_malloc; + CUctx_st *context = GPGPUSim_Context(GPGPU_Context()); + *test_malloc2 = context->get_device()->get_gpgpu()->gpu_malloc(size); + printf("GPGPU-Sim PTX: cudaMallocing %zu bytes starting at 0x%llx..\n", size, + (unsigned long long)*test_malloc2); + if (g_debug_execution >= 3) + printf("GPGPU-Sim PTX: cudaMallocing %zu bytes starting at 0x%llx..\n", + size, (unsigned long long)*test_malloc2); + return (uint64_t)*test_malloc2; +} + +__host__ cudaError_t CUDARTAPI cudaMallocHostSST(void *addr, size_t size) { + return cudaMallocHostSSTInternal(addr, size); +} + cudaError_t cudaPeekAtLastError(void) { return g_last_cudaError; } __host__ cudaError_t CUDARTAPI cudaMalloc(void **devPtr, size_t size) { @@ -2534,6 +2668,7 @@ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( * * * * *******************************************************************************/ + __host__ cudaError_t CUDARTAPI cudaMemset(void *mem, int c, size_t count) { return cudaMemsetInternal(mem, c, count); } @@ -2754,11 +2889,32 @@ __host__ const char *CUDARTAPI cudaGetErrorString(cudaError_t error) { return strdup(buf); } +// SST specific cuda apis +__host__ cudaError_t CUDARTAPI cudaSetupArgumentSST(uint64_t arg, + uint8_t value[200], + size_t size, + size_t offset) { + void *local_value; + local_value = (void *)malloc(size); + + if (arg) { + memcpy(local_value, (void *)&arg, size); + } else { + memcpy(local_value, value, size); + } + return cudaSetupArgumentInternal(local_value, size, offset); +} + __host__ cudaError_t CUDARTAPI cudaSetupArgument(const void *arg, size_t size, size_t offset) { return cudaSetupArgumentInternal(arg, size, offset); } +// SST specific cuda apis +__host__ cudaError_t CUDARTAPI cudaLaunchSST(uint64_t hostFun) { + return cudaLaunchInternal((char *)hostFun); +} + __host__ cudaError_t CUDARTAPI cudaLaunch(const char *hostFun) { return cudaLaunchInternal(hostFun); } @@ -2933,6 +3089,27 @@ __host__ cudaError_t CUDARTAPI cudaThreadSynchronize(void) { return cudaThreadSynchronizeInternal(); } +__host__ cudaError_t CUDARTAPI cudaThreadSynchronizeSST(void) { + // For SST, perform a one-time check and let SST_Cycle() + // do the polling test and invoke callback to SST + // to signal ThreadSynchonize done + gpgpu_context *ctx = GPGPU_Context(); + if (g_debug_execution >= 3) { + announce_call(__my_func__); + } + + // Called on host side + bool thread_sync_done = ctx->synchronize_check(); + g_last_cudaError = cudaSuccess; + if (thread_sync_done) { + // We are already done, so no need to poll for sync done + ctx->requested_synchronize = false; + return cudaSuccess; + } else { + return cudaErrorNotReady; + } +} + int CUDARTAPI __cudaSynchronizeThreads(void **, void *) { if (g_debug_execution >= 3) { announce_call(__my_func__); @@ -2992,10 +3169,10 @@ __host__ cudaError_t CUDARTAPI cudaGetExportTable( // extracts all ptx files from binary and dumps into // prog_name.unique_no.sm_<>.ptx files -void cuda_runtime_api::extract_ptx_files_using_cuobjdump(CUctx_st *context) { +void cuda_runtime_api::extract_ptx_files_using_cuobjdump_internal( + CUctx_st *context, std::string &app_binary) { char command[1000]; char *pytorch_bin = getenv("PYTORCH_BIN"); - std::string app_binary = get_app_binary(); char ptx_list_file_name[1024]; snprintf(ptx_list_file_name, 1024, "_cuobjdump_list_ptx_XXXXXX"); @@ -3062,6 +3239,17 @@ void cuda_runtime_api::extract_ptx_files_using_cuobjdump(CUctx_st *context) { } } +void cuda_runtime_api::extract_ptx_files_using_cuobjdump(CUctx_st *context, + const char *fn) { + std::string app_binary = get_app_binary(fn); + this->extract_ptx_files_using_cuobjdump_internal(context, app_binary); +} + +void cuda_runtime_api::extract_ptx_files_using_cuobjdump(CUctx_st *context) { + std::string app_binary = get_app_binary(); + this->extract_ptx_files_using_cuobjdump_internal(context, app_binary); +} + //! Call cuobjdump to extract everything (-elf -sass -ptx) /*! * This Function extract the whole PTX (for all the files) using cuobjdump @@ -3069,13 +3257,12 @@ void cuda_runtime_api::extract_ptx_files_using_cuobjdump(CUctx_st *context) { *with each binary in its own file It is also responsible for extracting the *libraries linked to the binary if the option is enabled * */ -void cuda_runtime_api::extract_code_using_cuobjdump() { - CUctx_st *context = GPGPUSim_Context(gpgpu_ctx); - +void cuda_runtime_api::extract_code_using_cuobjdump_internal( + CUctx_st *context, std::string &app_binary, + std::function ctx_extract_ptx_func) { // prevent the dumping by cuobjdump everytime we execute the code! const char *override_cuobjdump = getenv("CUOBJDUMP_SIM_FILE"); char command[1000]; - std::string app_binary = get_app_binary(); // Running cuobjdump using dynamic link to current process snprintf(command, 1000, "md5sum %s ", app_binary.c_str()); printf("Running md5sum using \"%s\"\n", command); @@ -3090,7 +3277,7 @@ void cuda_runtime_api::extract_code_using_cuobjdump() { // used by ptxas. int result = 0; #if (CUDART_VERSION >= 6000) - extract_ptx_files_using_cuobjdump(context); + ctx_extract_ptx_func(context); return; #endif // TODO: redundant to dump twice. how can it be prevented? @@ -3222,6 +3409,26 @@ void cuda_runtime_api::extract_code_using_cuobjdump() { } } +void cuda_runtime_api::extract_code_using_cuobjdump(const char *fn) { + CUctx_st *context = GPGPUSim_Context(gpgpu_ctx); + std::string app_binary = get_app_binary(fn); + auto ctx_extract_ptx_func = [=](CUctx_st *context) { + extract_ptx_files_using_cuobjdump(context, fn); + }; + extract_code_using_cuobjdump_internal(context, app_binary, + ctx_extract_ptx_func); +} + +void cuda_runtime_api::extract_code_using_cuobjdump() { + CUctx_st *context = GPGPUSim_Context(gpgpu_ctx); + std::string app_binary = get_app_binary(); + auto ctx_extract_ptx_func = [=](CUctx_st *context) { + extract_ptx_files_using_cuobjdump(context); + }; + extract_code_using_cuobjdump_internal(context, app_binary, + ctx_extract_ptx_func); +} + //! Read file into char* // TODO: convert this to C++ streams, will be way cleaner char *readfile(const std::string filename) { @@ -3466,10 +3673,11 @@ cuobjdumpPTXSection *cuda_runtime_api::findPTXSection( } //! Extract the code using cuobjdump and remove unnecessary sections -void cuda_runtime_api::cuobjdumpInit() { +void cuda_runtime_api::cuobjdumpInit_internal( + std::function ctx_extract_code_func) { CUctx_st *context = GPGPUSim_Context(gpgpu_ctx); - extract_code_using_cuobjdump(); // extract all the output of cuobjdump to - // _cuobjdump_*.* + ctx_extract_code_func(); // extract all the output of cuobjdump to + // _cuobjdump_*.* const char *pre_load = getenv("CUOBJDUMP_SIM_FILE"); if (pre_load == NULL || strlen(pre_load) == 0) { cuobjdumpSectionList = pruneSectionList(context); @@ -3477,6 +3685,16 @@ void cuda_runtime_api::cuobjdumpInit() { } } +void cuda_runtime_api::cuobjdumpInit(const char *fn) { + auto ctx_extract_code_func = [=]() { extract_code_using_cuobjdump(fn); }; + cuobjdumpInit_internal(ctx_extract_code_func); +} + +void cuda_runtime_api::cuobjdumpInit() { + auto ctx_extract_code_func = [=]() { extract_code_using_cuobjdump(); }; + cuobjdumpInit_internal(ctx_extract_code_func); +} + //! Either submit PTX for simulation or convert SASS to PTXPlus and submit it void gpgpu_context::cuobjdumpParseBinary(unsigned int handle) { CUctx_st *context = GPGPUSim_Context(this); @@ -3587,6 +3805,10 @@ void gpgpu_context::cuobjdumpParseBinary(unsigned int handle) { extern "C" { +void **CUDARTAPI __cudaRegisterFatBinarySST(const char *fn) { + return cudaRegisterFatBinaryInternal(fn, NULL); +} + void **CUDARTAPI __cudaRegisterFatBinary(void *fatCubin) { if (g_debug_execution >= 3) { announce_call(__my_func__); @@ -3619,6 +3841,14 @@ cudaError_t CUDARTAPI __cudaPopCallConfiguration(dim3 *gridDim, dim3 *blockDim, return g_last_cudaError = cudaSuccess; } +void CUDARTAPI __cudaRegisterFunctionSST(unsigned fatCubinHandle, + uint64_t hostFun, + char deviceFun[512]) { + cudaRegisterFunctionInternal((void **)fatCubinHandle, (const char *)hostFun, + (char *)deviceFun, NULL, NULL, NULL, NULL, NULL, + NULL); +} + void CUDARTAPI __cudaRegisterFunction(void **fatCubinHandle, const char *hostFun, char *deviceFun, const char *deviceName, int thread_limit, diff --git a/libcuda/gpgpu_context.h b/libcuda/gpgpu_context.h index d0cd7c48c..5ef21177b 100644 --- a/libcuda/gpgpu_context.h +++ b/libcuda/gpgpu_context.h @@ -44,6 +44,9 @@ class gpgpu_context { s_g_pc_to_insn; // a direct mapping from PC to instruction bool debug_tensorcore; + // SST related + bool requested_synchronize = false; + // objects pointers for each file cuda_runtime_api *api; ptxinfo_data *ptxinfo; @@ -54,6 +57,7 @@ class gpgpu_context { ptx_stats *stats; // member function list void synchronize(); + bool synchronize_check(); void exit_simulation(); void print_simulation_time(); int gpgpu_opencl_ptx_sim_main_perf(kernel_info_t *grid); diff --git a/setup_environment b/setup_environment index 342810151..2fac1b991 100644 --- a/setup_environment +++ b/setup_environment @@ -46,7 +46,6 @@ fi CC_VERSION=$(gcc --version | head -1 | awk '{for(i=1;i<=NF;i++){ if(match($i,/^[0-9]+\.[0-9]+\.[0-9]+$/)) {print $i; exit 0}}}') - CUDA_VERSION_STRING=`$CUDA_INSTALL_PATH/bin/nvcc --version | awk '/release/ {print $5;}' | sed 's/,//'`; export CUDA_VERSION_NUMBER=`echo $CUDA_VERSION_STRING | sed 's/\./ /' | awk '{printf("%02u%02u", 10*int($1), 10*$2);}'` if [ $CUDA_VERSION_NUMBER -gt 11100 -o $CUDA_VERSION_NUMBER -lt 2030 ]; then @@ -61,13 +60,18 @@ if [ $CUDA_VERSION_NUMBER -ge 6000 ]; then export CUOBJDUMP_SIM_FILE=jj fi +# Simple configure, loop through all positional arguments +# Default config +export GPGPUSIM_CONFIG=gcc-$CC_VERSION/cuda-$CUDA_VERSION_NUMBER/release -if [ $# = '1' ] ; -then - export GPGPUSIM_CONFIG=gcc-$CC_VERSION/cuda-$CUDA_VERSION_NUMBER/$1 -else - export GPGPUSIM_CONFIG=gcc-$CC_VERSION/cuda-$CUDA_VERSION_NUMBER/release -fi +for opt in $@ +do + if [[ $opt == 'debug' ]] ; then + # Debug mode + echo -n "enabled debug mode " + export GPGPUSIM_CONFIG=gcc-$CC_VERSION/cuda-$CUDA_VERSION_NUMBER/$1 + fi +done export QTINC=/usr/include diff --git a/src/cuda-sim/cuda-sim.cc b/src/cuda-sim/cuda-sim.cc index 833d33f5c..2fd90c0e5 100644 --- a/src/cuda-sim/cuda-sim.cc +++ b/src/cuda-sim/cuda-sim.cc @@ -1305,7 +1305,12 @@ void function_info::add_param_name_type_size(unsigned index, std::string name, void function_info::add_param_data(unsigned argn, struct gpgpu_ptx_sim_arg *args) { const void *data = args->m_start; - + if (g_debug_execution >= 3) { + if (args->m_nbytes == 4) + printf("ADD_PARAM_DATA %d\n", *((uint32_t *)data)); + else + printf("ADD_PARAM_DATA %p\n", *((void **)data)); + } bool scratchpad_memory_param = false; // Is this parameter in CUDA shared memory or OpenCL local memory @@ -1746,6 +1751,17 @@ static unsigned get_tex_datasize(const ptx_instruction *pI, ptx_thread_info *thread) { const operand_info &src1 = pI->src1(); // the name of the texture std::string texname = src1.name(); + // If indirect access, use register's value as address + // to find the symbol + if (src1.is_reg()) { + const operand_info &dst = pI->dst(); + ptx_reg_t src1_data = + thread->get_operand_value(src1, dst, pI->get_type(), thread, 1); + addr_t sym_addr = src1_data.u64; + symbol *texRef = thread->get_symbol_table()->lookup_by_addr(sym_addr); + assert(texRef != NULL); + texname = texRef->name(); + } /* For programs with many streams, textures can be bound and unbound @@ -2285,15 +2301,24 @@ void cuda_sim::gpgpu_ptx_sim_memcpy_symbol(const char *hostVar, const void *src, sym_name = g->second; mem_region = global_space; } - if (g_globals.find(hostVar) != g_globals.end()) { - found_sym = true; - sym_name = hostVar; - mem_region = global_space; - } - if (g_constants.find(hostVar) != g_constants.end()) { - found_sym = true; - sym_name = hostVar; - mem_region = const_space; + + // Weili: Only attempt to find symbol as it is a string + // if we could not find it in previously registered variable. + // This will avoid constructing std::string() from hostVar address + // where it is not a string as + // Use of a string naming a variable as the symbol parameter was deprecated in + // CUDA 4.1 and removed in CUDA 5.0. + if (!found_sym) { + if (g_globals.find(hostVar) != g_globals.end()) { + found_sym = true; + sym_name = hostVar; + mem_region = global_space; + } + if (g_constants.find(hostVar) != g_constants.end()) { + found_sym = true; + sym_name = hostVar; + mem_region = const_space; + } } if (!found_sym) { diff --git a/src/cuda-sim/instructions.cc b/src/cuda-sim/instructions.cc index 108de9759..843bf0ba7 100644 --- a/src/cuda-sim/instructions.cc +++ b/src/cuda-sim/instructions.cc @@ -6055,6 +6055,17 @@ void tex_impl(const ptx_instruction *pI, ptx_thread_info *thread) { // to be fetched std::string texname = src1.name(); + // If indirect access, use register's value as address + // to find the symbol + if (src1.is_reg()) { + ptx_reg_t src1_data = + thread->get_operand_value(src1, dst, pI->get_type(), thread, 1); + addr_t sym_addr = src1_data.u64; + symbol *texRef = thread->get_symbol_table()->lookup_by_addr(sym_addr); + assert(texRef != NULL); + texname = texRef->name(); + } + unsigned to_type = pI->get_type(); unsigned c_type = pI->get_type2(); fflush(stdout); diff --git a/src/cuda-sim/ptx_ir.cc b/src/cuda-sim/ptx_ir.cc index 139920930..4e500ccb4 100644 --- a/src/cuda-sim/ptx_ir.cc +++ b/src/cuda-sim/ptx_ir.cc @@ -139,6 +139,22 @@ symbol *symbol_table::lookup(const char *identifier) { return NULL; } +symbol *symbol_table::lookup_by_addr(addr_t addr) { + for (auto it = m_symbols.begin(); it != m_symbols.end(); ++it) { + symbol *sym = it->second; + + // check if symbol has the addr to be found + if ((!sym->is_reg()) && (sym->has_valid_address()) && + (sym->get_address() == addr)) { + return sym; + } + } + if (m_parent) { + return m_parent->lookup_by_addr(addr); + } + return NULL; +} + symbol *symbol_table::add_variable(const char *identifier, const type_info *type, unsigned size, const char *filename, unsigned line) { diff --git a/src/cuda-sim/ptx_ir.h b/src/cuda-sim/ptx_ir.h index d253866db..b08a692d8 100644 --- a/src/cuda-sim/ptx_ir.h +++ b/src/cuda-sim/ptx_ir.h @@ -205,6 +205,7 @@ class symbol { const std::string &name() const { return m_name; } const std::string &decl_location() const { return m_decl_location; } const type_info *type() const { return m_type; } + bool has_valid_address() const { return m_address_valid; } addr_t get_address() const { assert(m_is_label || !m_type->get_key().is_reg()); // todo : other assertions @@ -310,6 +311,7 @@ class symbol_table { void set_ptx_version(float ver, unsigned ext); void set_sm_target(const char *target, const char *ext, const char *ext2); symbol *lookup(const char *identifier); + symbol *lookup_by_addr(addr_t addr); std::string get_scope_name() const { return m_scope_name; } symbol *add_variable(const char *identifier, const type_info *type, unsigned size, const char *filename, unsigned line); diff --git a/src/cuda-sim/ptx_sim.h b/src/cuda-sim/ptx_sim.h index f0c26efc8..8eec922e4 100644 --- a/src/cuda-sim/ptx_sim.h +++ b/src/cuda-sim/ptx_sim.h @@ -459,6 +459,9 @@ class ptx_thread_info { // Jin: get corresponding kernel grid for CDP purpose kernel_info_t &get_kernel() { return m_kernel; } + // Weili: access symbol_table + symbol_table *get_symbol_table() { return m_symbol_table; } + public: addr_t m_last_effective_address; bool m_branch_taken; diff --git a/src/gpgpu-sim/gpu-cache.cc b/src/gpgpu-sim/gpu-cache.cc index cd3c88033..0ea9ff63d 100644 --- a/src/gpgpu-sim/gpu-cache.cc +++ b/src/gpgpu-sim/gpu-cache.cc @@ -2062,6 +2062,7 @@ enum cache_request_status tex_cache::access(new_addr_type addr, mem_fetch *mf, void tex_cache::cycle() { // send next request to lower level of memory + // TODO: Use different full() for sst_mem_interface? if (!m_request_fifo.empty()) { mem_fetch *mf = m_request_fifo.peek(); if (!m_memport->full(mf->get_ctrl_size(), false)) { diff --git a/src/gpgpu-sim/gpu-sim.cc b/src/gpgpu-sim/gpu-sim.cc index 5bd41805d..b92494b43 100644 --- a/src/gpgpu-sim/gpu-sim.cc +++ b/src/gpgpu-sim/gpu-sim.cc @@ -319,6 +319,9 @@ void memory_config::reg_options(class OptionParser *opp) { "elimnate_rw_turnaround i.e set tWTR and tRTW = 0", "0"); option_parser_register(opp, "-icnt_flit_size", OPT_UINT32, &icnt_flit_size, "icnt_flit_size", "32"); + // SST mode activate + option_parser_register(opp, "-SST_mode", OPT_BOOL, &SST_mode, "SST mode", + "0"); m_address_mapping.addrdec_setoption(opp); } @@ -955,6 +958,16 @@ void exec_gpgpu_sim::createSIMTCluster() { m_shader_stats, m_memory_stats); } +// SST get its own simt_cluster +void sst_gpgpu_sim::createSIMTCluster() { + m_cluster = new simt_core_cluster *[m_shader_config->n_simt_clusters]; + for (unsigned i = 0; i < m_shader_config->n_simt_clusters; i++) + m_cluster[i] = + new sst_simt_core_cluster(this, i, m_shader_config, m_memory_config, + m_shader_stats, m_memory_stats); + SST_gpgpu_reply_buffer.resize(m_shader_config->n_simt_clusters); +} + gpgpu_sim::gpgpu_sim(const gpgpu_sim_config &config, gpgpu_context *ctx) : gpgpu_t(config, ctx), m_config(config) { gpgpu_ctx = ctx; @@ -999,26 +1012,29 @@ gpgpu_sim::gpgpu_sim(const gpgpu_sim_config &config, gpgpu_context *ctx) gpu_kernel_time.clear(); - m_memory_partition_unit = - new memory_partition_unit *[m_memory_config->m_n_mem]; - m_memory_sub_partition = - new memory_sub_partition *[m_memory_config->m_n_mem_sub_partition]; - for (unsigned i = 0; i < m_memory_config->m_n_mem; i++) { - m_memory_partition_unit[i] = - new memory_partition_unit(i, m_memory_config, m_memory_stats, this); - for (unsigned p = 0; - p < m_memory_config->m_n_sub_partition_per_memory_channel; p++) { - unsigned submpid = - i * m_memory_config->m_n_sub_partition_per_memory_channel + p; - m_memory_sub_partition[submpid] = - m_memory_partition_unit[i]->get_sub_partition(p); + // TODO: somehow move this logic to the sst_gpgpu_sim constructor? + if (!m_config.is_SST_mode()) { + // Init memory if not in SST mode + m_memory_partition_unit = + new memory_partition_unit *[m_memory_config->m_n_mem]; + m_memory_sub_partition = + new memory_sub_partition *[m_memory_config->m_n_mem_sub_partition]; + for (unsigned i = 0; i < m_memory_config->m_n_mem; i++) { + m_memory_partition_unit[i] = + new memory_partition_unit(i, m_memory_config, m_memory_stats, this); + for (unsigned p = 0; + p < m_memory_config->m_n_sub_partition_per_memory_channel; p++) { + unsigned submpid = + i * m_memory_config->m_n_sub_partition_per_memory_channel + p; + m_memory_sub_partition[submpid] = + m_memory_partition_unit[i]->get_sub_partition(p); + } } - } - - icnt_wrapper_init(); - icnt_create(m_shader_config->n_simt_clusters, - m_memory_config->m_n_mem_sub_partition); + icnt_wrapper_init(); + icnt_create(m_shader_config->n_simt_clusters, + m_memory_config->m_n_mem_sub_partition); + } time_vector_create(NUM_MEM_REQ_STAT); fprintf(stdout, "GPGPU-Sim uArch: performance model initialization complete.\n"); @@ -1037,6 +1053,22 @@ gpgpu_sim::gpgpu_sim(const gpgpu_sim_config &config, gpgpu_context *ctx) m_functional_sim_kernel = NULL; } +void sst_gpgpu_sim::SST_receive_mem_reply(unsigned core_id, void *mem_req) { + assert(core_id < m_shader_config->n_simt_clusters); + mem_fetch *mf = (mem_fetch *)mem_req; + + (SST_gpgpu_reply_buffer[core_id]).push_back(mf); +} + +mem_fetch *sst_gpgpu_sim::SST_pop_mem_reply(unsigned core_id) { + if (SST_gpgpu_reply_buffer[core_id].size() > 0) { + mem_fetch *temp = SST_gpgpu_reply_buffer[core_id].front(); + SST_gpgpu_reply_buffer[core_id].pop_front(); + return temp; + } else + return NULL; +} + int gpgpu_sim::shared_mem_size() const { return m_shader_config->gpgpu_shmem_size; } @@ -1132,6 +1164,26 @@ bool gpgpu_sim::active() { return false; } +bool sst_gpgpu_sim::active() { + if (m_config.gpu_max_cycle_opt && + (gpu_tot_sim_cycle + gpu_sim_cycle) >= m_config.gpu_max_cycle_opt) + return false; + if (m_config.gpu_max_insn_opt && + (gpu_tot_sim_insn + gpu_sim_insn) >= m_config.gpu_max_insn_opt) + return false; + if (m_config.gpu_max_cta_opt && + (gpu_tot_issued_cta >= m_config.gpu_max_cta_opt)) + return false; + if (m_config.gpu_max_completed_cta_opt && + (gpu_completed_cta >= m_config.gpu_max_completed_cta_opt)) + return false; + if (m_config.gpu_deadlock_detect && gpu_deadlock) return false; + for (unsigned i = 0; i < m_shader_config->n_simt_clusters; i++) + if (m_cluster[i]->get_not_completed() > 0) return true; + if (get_more_cta_left()) return true; + return false; +} + void gpgpu_sim::init() { // run a CUDA grid on the GPU microarchitecture simulator gpu_sim_cycle = 0; @@ -2157,6 +2209,11 @@ void gpgpu_sim::cycle() { } } +void sst_gpgpu_sim::cycle() { + SST_cycle(); + return; +} + void shader_core_ctx::dump_warp_state(FILE *fout) const { fprintf(fout, "\n"); fprintf(fout, "per warp functional simulation status:\n"); @@ -2236,3 +2293,110 @@ const shader_core_config *gpgpu_sim::getShaderCoreConfig() { const memory_config *gpgpu_sim::getMemoryConfig() { return m_memory_config; } simt_core_cluster *gpgpu_sim::getSIMTCluster() { return *m_cluster; } + +void sst_gpgpu_sim::SST_gpgpusim_numcores_equal_check(unsigned sst_numcores) { + if (m_shader_config->n_simt_clusters != sst_numcores) { + assert( + "\nSST core is not equal the GPGPU-sim cores. Open gpgpu-sim.config " + "file and ensure n_simt_clusters" + "is the same as SST gpu cores.\n" && + 0); + } else { + printf("\nSST GPU core is equal the GPGPU-sim cores = %d\n", sst_numcores); + } +} + +void sst_gpgpu_sim::SST_cycle() { + // shader core loading (pop from ICNT into core) follows CORE clock + for (unsigned i = 0; i < m_shader_config->n_simt_clusters; i++) + static_cast(m_cluster[i])->icnt_cycle_SST(); + + // L1 cache + shader core pipeline stages + m_power_stats->pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].clear(); + for (unsigned i = 0; i < m_shader_config->n_simt_clusters; i++) { + if (m_cluster[i]->get_not_completed() || get_more_cta_left()) { + m_cluster[i]->core_cycle(); + *active_sms += m_cluster[i]->get_n_active_sms(); + } + // Update core icnt/cache stats for GPUWattch + m_cluster[i]->get_icnt_stats( + m_power_stats->pwr_mem_stat->n_simt_to_mem[CURRENT_STAT_IDX][i], + m_power_stats->pwr_mem_stat->n_mem_to_simt[CURRENT_STAT_IDX][i]); + m_cluster[i]->get_cache_stats( + m_power_stats->pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX]); + } + float temp = 0; + for (unsigned i = 0; i < m_shader_config->num_shader(); i++) { + temp += m_shader_stats->m_pipeline_duty_cycle[i]; + } + temp = temp / m_shader_config->num_shader(); + *average_pipeline_duty_cycle = ((*average_pipeline_duty_cycle) + temp); + // cout<<"Average pipeline duty cycle: "<<*average_pipeline_duty_cycle<= g_single_step)) { + asm("int $03"); + } + gpu_sim_cycle++; + if (g_interactive_debugger_enabled) gpgpu_debug(); + + // McPAT main cycle (interface with McPAT) +#ifdef GPGPUSIM_POWER_MODEL + if (m_config.g_power_simulation_enabled) { + mcpat_cycle(m_config, getShaderCoreConfig(), m_gpgpusim_wrapper, + m_power_stats, m_config.gpu_stat_sample_freq, gpu_tot_sim_cycle, + gpu_sim_cycle, gpu_tot_sim_insn, gpu_sim_insn, + m_config.g_dvfs_enabled); + } +#endif + + issue_block2core(); + + if (!(gpu_sim_cycle % m_config.gpu_stat_sample_freq)) { + time_t days, hrs, minutes, sec; + time_t curr_time; + time(&curr_time); + unsigned long long elapsed_time = + MAX(curr_time - gpgpu_ctx->the_gpgpusim->g_simulation_starttime, 1); + if ((elapsed_time - last_liveness_message_time) >= + m_config.liveness_message_freq) { + days = elapsed_time / (3600 * 24); + hrs = elapsed_time / 3600 - 24 * days; + minutes = elapsed_time / 60 - 60 * (hrs + 24 * days); + sec = elapsed_time - 60 * (minutes + 60 * (hrs + 24 * days)); + + last_liveness_message_time = elapsed_time; + } + visualizer_printstat(); + m_memory_stats->memlatstat_lat_pw(); + if (m_config.gpgpu_runtime_stat && (m_config.gpu_runtime_stat_flag != 0)) { + if (m_config.gpu_runtime_stat_flag & GPU_RSTAT_BW_STAT) { + for (unsigned i = 0; i < m_memory_config->m_n_mem; i++) + m_memory_partition_unit[i]->print_stat(stdout); + printf("maxmrqlatency = %d \n", m_memory_stats->max_mrq_latency); + printf("maxmflatency = %d \n", m_memory_stats->max_mf_latency); + } + if (m_config.gpu_runtime_stat_flag & GPU_RSTAT_SHD_INFO) + shader_print_runtime_stat(stdout); + if (m_config.gpu_runtime_stat_flag & GPU_RSTAT_L1MISS) + shader_print_l1_miss_stat(stdout); + if (m_config.gpu_runtime_stat_flag & GPU_RSTAT_SCHED) + shader_print_scheduler_stat(stdout, false); + } + } + + if (!(gpu_sim_cycle % 20000)) { + // deadlock detection + if (m_config.gpu_deadlock_detect && gpu_sim_insn == last_gpu_sim_insn) { + gpu_deadlock = true; + } else { + last_gpu_sim_insn = gpu_sim_insn; + } + } + try_snap_shot(gpu_sim_cycle); + spill_log_to_file(stdout, 0, gpu_sim_cycle); + +#if (CUDART_VERSION >= 5000) + // launch device kernel + gpgpu_ctx->device_runtime->launch_one_device_kernel(); +#endif +} diff --git a/src/gpgpu-sim/gpu-sim.h b/src/gpgpu-sim/gpu-sim.h index 8e81451b6..d0c2a1763 100644 --- a/src/gpgpu-sim/gpu-sim.h +++ b/src/gpgpu-sim/gpu-sim.h @@ -69,6 +69,38 @@ class gpgpu_context; extern tr1_hash_map address_random_interleaving; +// SST communication functions +/** + * @brief Check if SST requests buffer is full + * + * @param core_id + * @return true + * @return false + */ +extern bool is_SST_buffer_full(unsigned core_id); + +/** + * @brief Send loads to SST memory backend + * + * @param core_id + * @param address + * @param size + * @param mem_req + */ +extern void send_read_request_SST(unsigned core_id, uint64_t address, + size_t size, void *mem_req); + +/** + * @brief Send stores to SST memory backend + * + * @param core_id + * @param address + * @param size + * @param mem_req + */ +extern void send_write_request_SST(unsigned core_id, uint64_t address, + size_t size, void *mem_req); + enum dram_ctrl_t { DRAM_FIFO = 0, DRAM_FRFCFS = 1 }; enum hw_perf_t { @@ -274,6 +306,14 @@ class memory_config { } void reg_options(class OptionParser *opp); + /** + * @brief Check if the config script is in SST mode + * + * @return true + * @return false + */ + bool is_SST_mode() const { return SST_mode; } + bool m_valid; mutable l2_cache_config m_L2_config; bool m_L2_texure_only; @@ -351,7 +391,7 @@ class memory_config { unsigned write_low_watermark; bool m_perf_sim_memcpy; bool simple_dram_model; - + bool SST_mode; gpgpu_context *gpgpu_ctx; }; @@ -398,6 +438,15 @@ class gpgpu_sim_config : public power_config, unsigned num_shader() const { return m_shader_config.num_shader(); } unsigned num_cluster() const { return m_shader_config.n_simt_clusters; } unsigned get_max_concurrent_kernel() const { return max_concurrent_kernel; } + + /** + * @brief Check if we are in SST mode + * + * @return true + * @return false + */ + bool is_SST_mode() const { return m_memory_config.SST_mode; } + unsigned checkpoint_option; size_t stack_limit() const { return stack_size_limit; } @@ -462,6 +511,7 @@ class gpgpu_sim_config : public power_config, unsigned long long liveness_message_freq; friend class gpgpu_sim; + friend class sst_gpgpu_sim; }; struct occupancy_stats { @@ -600,10 +650,18 @@ class gpgpu_sim : public gpgpu_t { void hit_watchpoint(unsigned watchpoint_num, ptx_thread_info *thd, const ptx_instruction *pI); + /** + * @brief Check if we are in SST mode + * + * @return true + * @return false + */ + bool is_SST_mode() { return m_config.is_SST_mode(); } + // backward pointer class gpgpu_context *gpgpu_ctx; - private: + protected: // clocks void reinit_clock_domains(void); int next_clock_domain(void); @@ -715,7 +773,7 @@ class gpgpu_sim : public gpgpu_t { void set_cache_config(std::string kernel_name); // Jin: functional simulation for CDP - private: + protected: // set by stream operation every time a functoinal simulation is done bool m_functional_sim; kernel_info_t *m_functional_sim_kernel; @@ -748,4 +806,79 @@ class exec_gpgpu_sim : public gpgpu_sim { virtual void createSIMTCluster(); }; +/** + * @brief A GPGPUSim class customized to SST Balar interfacing + * + */ +class sst_gpgpu_sim : public gpgpu_sim { + public: + sst_gpgpu_sim(const gpgpu_sim_config &config, gpgpu_context *ctx) + : gpgpu_sim(config, ctx) { + createSIMTCluster(); + } + + // SST memory handling + std::vector> + SST_gpgpu_reply_buffer; /** SST mem response queue */ + + /** + * @brief Receive mem request's response from SST and put + * it in a buffer (SST_gpgpu_reply_buffer) + * + * @param core_id + * @param mem_req + */ + void SST_receive_mem_reply(unsigned core_id, void *mem_req); + + /** + * @brief Pop the head of the buffer queue to get the + * memory response + * + * @param core_id + * @return mem_fetch* + */ + mem_fetch *SST_pop_mem_reply(unsigned core_id); + + virtual void createSIMTCluster(); + + // SST Balar interfacing + /** + * @brief Advance core and collect stats + * + */ + void SST_cycle(); + + /** + * @brief Wrapper of SST_cycle() + * + */ + void cycle(); + + /** + * @brief Whether the GPU is active, removed test for + * memory system since that is handled in SST + * + * @return true + * @return false + */ + bool active(); + + /** + * @brief SST mode use SST memory system instead, so the memcpy + * is empty here + * + * @param dst_start_addr + * @param count + */ + void perf_memcpy_to_gpu(size_t dst_start_addr, size_t count){}; + + /** + * @brief Check if the SST config matches up with the + * gpgpusim.config in core number + * + * @param sst_numcores SST core count + */ + void SST_gpgpusim_numcores_equal_check(unsigned sst_numcores); +}; + #endif diff --git a/src/gpgpu-sim/mem_fetch.cc b/src/gpgpu-sim/mem_fetch.cc index 7211a7dd3..809c92081 100644 --- a/src/gpgpu-sim/mem_fetch.cc +++ b/src/gpgpu-sim/mem_fetch.cc @@ -54,9 +54,15 @@ mem_fetch::mem_fetch(const mem_access_t &access, const warp_inst_t *inst, m_sid = sid; m_tpc = tpc; m_wid = wid; - config->m_address_mapping.addrdec_tlx(access.get_addr(), &m_raw_addr); - m_partition_addr = - config->m_address_mapping.partition_address(access.get_addr()); + + if (!config->is_SST_mode()) { + // In SST memory model, the SST memory hierarchy is + // responsible to generate the correct address mapping + config->m_address_mapping.addrdec_tlx(access.get_addr(), &m_raw_addr); + m_partition_addr = + config->m_address_mapping.partition_address(access.get_addr()); + } + m_type = m_access.is_write() ? WRITE_REQUEST : READ_REQUEST; m_timestamp = cycle; m_timestamp2 = 0; diff --git a/src/gpgpu-sim/mem_latency_stat.cc b/src/gpgpu-sim/mem_latency_stat.cc index 63d7ee80c..c77a68648 100644 --- a/src/gpgpu-sim/mem_latency_stat.cc +++ b/src/gpgpu-sim/mem_latency_stat.cc @@ -203,7 +203,15 @@ unsigned memory_stats_t::memlatstat_done(mem_fetch *mf) { } void memory_stats_t::memlatstat_read_done(mem_fetch *mf) { - if (m_memory_config->gpgpu_memlatency_stat) { + if (m_memory_config->SST_mode) { + // in SST mode, we just calculate mem latency + unsigned mf_latency; + mf_latency = + (m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle) - mf->get_timestamp(); + num_mfs++; + mf_total_lat += mf_latency; + if (mf_latency > max_mf_latency) max_mf_latency = mf_latency; + } else if (m_memory_config->gpgpu_memlatency_stat) { unsigned mf_latency = memlatstat_done(mf); if (mf_latency > mf_max_lat_table[mf->get_tlx_addr().chip][mf->get_tlx_addr().bk]) @@ -273,7 +281,12 @@ void memory_stats_t::memlatstat_print(unsigned n_mem, unsigned gpu_mem_n_bk) { unsigned max_bank_accesses, min_bank_accesses, max_chip_accesses, min_chip_accesses; - if (m_memory_config->gpgpu_memlatency_stat) { + if (m_memory_config->SST_mode) { + // in SST mode, we just calculate mem latency + printf("max_mem_SST_latency = %d \n", max_mf_latency); + if (num_mfs) + printf("average_mf_SST_latency = %lld \n", mf_total_lat / num_mfs); + } else if (m_memory_config->gpgpu_memlatency_stat) { printf("maxmflatency = %d \n", max_mf_latency); printf("max_icnt2mem_latency = %d \n", max_icnt2mem_latency); printf("maxmrqlatency = %d \n", max_mrq_latency); diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 4d4f11277..7482e0ef9 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -162,7 +162,10 @@ void shader_core_ctx::create_front_pipeline() { } // m_icnt = new shader_memory_interface(this,cluster); - if (m_config->gpgpu_perfect_mem) { + if (m_memory_config->SST_mode) { + m_icnt = new sst_memory_interface( + this, static_cast(m_cluster)); + } else if (m_config->gpgpu_perfect_mem) { m_icnt = new perfect_memory_interface(this, m_cluster); } else { m_icnt = new shader_memory_interface(this, m_cluster); @@ -2281,7 +2284,15 @@ bool ldst_unit::memory_cycle(warp_inst_t &inst, inst.is_store() ? WRITE_PACKET_SIZE : READ_PACKET_SIZE; unsigned size = access.get_size() + control_size; // printf("Interconnect:Addr: %x, size=%d\n",access.get_addr(),size); - if (m_icnt->full(size, inst.is_store() || inst.isatomic())) { + if (m_memory_config->SST_mode && + (static_cast(m_icnt)->full( + size, inst.is_store() || inst.isatomic(), access.get_type()))) { + // SST need mf type here + // Cast it to sst_memory_interface pointer first as this full() method + // is not a virtual method in parent class + stall_cond = ICNT_RC_FAIL; + } else if (!m_memory_config->SST_mode && + (m_icnt->full(size, inst.is_store() || inst.isatomic()))) { stall_cond = ICNT_RC_FAIL; } else { mem_fetch *mf = @@ -2846,7 +2857,10 @@ void ldst_unit::cycle() { } } else { if (mf->get_type() == WRITE_ACK || - (m_config->gpgpu_perfect_mem && mf->get_is_write())) { + ((m_config->gpgpu_perfect_mem || m_memory_config->SST_mode) && + mf->get_is_write())) { + // SST memory is handled by SST mem hierarchy + // Perfect mem m_core->store_ack(mf); m_response_fifo.pop_front(); delete mf; @@ -4020,7 +4034,8 @@ void shader_core_ctx::accept_ldst_unit_response(mem_fetch *mf) { void shader_core_ctx::store_ack(class mem_fetch *mf) { assert(mf->get_type() == WRITE_ACK || - (m_config->gpgpu_perfect_mem && mf->get_is_write())); + ((m_config->gpgpu_perfect_mem || m_memory_config->SST_mode) && + mf->get_is_write())); unsigned warp_id = mf->get_wid(); m_warp[warp_id]->dec_store_req(); } @@ -4573,7 +4588,46 @@ bool simt_core_cluster::icnt_injection_buffer_full(unsigned size, bool write) { return !::icnt_has_buffer(m_cluster_id, request_size); } +bool sst_simt_core_cluster::SST_injection_buffer_full(unsigned size, bool write, + mem_access_type type) { + switch (type) { + case CONST_ACC_R: + case INST_ACC_R: { + return response_queue_full(); + break; + } + default: { + return ::is_SST_buffer_full(m_cluster_id); + break; + } + } +} + void simt_core_cluster::icnt_inject_request_packet(class mem_fetch *mf) { + // Update stats based on mf type + update_icnt_stats(mf); + + // The packet size varies depending on the type of request: + // - For write request and atomic request, the packet contains the data + // - For read request (i.e. not write nor atomic), the packet only has control + // metadata + unsigned int packet_size = mf->size(); + if (!mf->get_is_write() && !mf->isatomic()) { + packet_size = mf->get_ctrl_size(); + } + m_stats->m_outgoing_traffic_stats->record_traffic(mf, packet_size); + unsigned destination = mf->get_sub_partition_id(); + mf->set_status(IN_ICNT_TO_MEM, + m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle); + if (!mf->get_is_write() && !mf->isatomic()) + ::icnt_push(m_cluster_id, m_config->mem2device(destination), (void *)mf, + mf->get_ctrl_size()); + else + ::icnt_push(m_cluster_id, m_config->mem2device(destination), (void *)mf, + mf->size()); +} + +void simt_core_cluster::update_icnt_stats(class mem_fetch *mf) { // stats if (mf->get_is_write()) m_stats->made_write_mfs++; @@ -4618,6 +4672,12 @@ void simt_core_cluster::icnt_inject_request_packet(class mem_fetch *mf) { default: assert(0); } +} + +void sst_simt_core_cluster::icnt_inject_request_packet_to_SST( + class mem_fetch *mf) { + // Update stats + update_icnt_stats(mf); // The packet size varies depending on the type of request: // - For write request and atomic request, the packet contains the data @@ -4628,15 +4688,25 @@ void simt_core_cluster::icnt_inject_request_packet(class mem_fetch *mf) { packet_size = mf->get_ctrl_size(); } m_stats->m_outgoing_traffic_stats->record_traffic(mf, packet_size); - unsigned destination = mf->get_sub_partition_id(); mf->set_status(IN_ICNT_TO_MEM, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle); - if (!mf->get_is_write() && !mf->isatomic()) - ::icnt_push(m_cluster_id, m_config->mem2device(destination), (void *)mf, - mf->get_ctrl_size()); - else - ::icnt_push(m_cluster_id, m_config->mem2device(destination), (void *)mf, - mf->size()); + switch (mf->get_access_type()) { + case CONST_ACC_R: + case INST_ACC_R: { + push_response_fifo(mf); + break; + } + default: { + if (!mf->get_is_write() && !mf->isatomic()) + ::send_read_request_SST(m_cluster_id, mf->get_addr(), + mf->get_data_size(), (void *)mf); + else + ::send_write_request_SST(m_cluster_id, mf->get_addr(), + mf->get_data_size(), (void *)mf); + + break; + } + } } void simt_core_cluster::icnt_cycle() { @@ -4678,6 +4748,49 @@ void simt_core_cluster::icnt_cycle() { } } +void sst_simt_core_cluster::icnt_cycle_SST() { + if (!m_response_fifo.empty()) { + mem_fetch *mf = m_response_fifo.front(); + unsigned cid = m_config->sid_to_cid(mf->get_sid()); + if (mf->get_access_type() == INST_ACC_R) { + // instruction fetch response + if (!m_core[cid]->fetch_unit_response_buffer_full()) { + m_response_fifo.pop_front(); + m_core[cid]->accept_fetch_response(mf); + } + } else { + // data response + if (!m_core[cid]->ldst_unit_response_buffer_full()) { + m_response_fifo.pop_front(); + m_memory_stats->memlatstat_read_done(mf); + m_core[cid]->accept_ldst_unit_response(mf); + } + } + } + + // pop from SST buffers + if (m_response_fifo.size() < m_config->n_simt_ejection_buffer_size) { + mem_fetch *mf = (mem_fetch *)(static_cast(get_gpu()) + ->SST_pop_mem_reply(m_cluster_id)); + if (!mf) return; + assert(mf->get_tpc() == m_cluster_id); + + // do atomic here + // For now, we execute atomic when the mem reply comes back + // This needs to be validated + if (mf && mf->isatomic()) mf->do_atomic(); + + unsigned int packet_size = + (mf->get_is_write()) ? mf->get_ctrl_size() : mf->size(); + m_stats->m_incoming_traffic_stats->record_traffic(mf, packet_size); + mf->set_status(IN_CLUSTER_TO_SHADER_QUEUE, + m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle); + // m_memory_stats->memlatstat_read_done(mf,m_shader_config->max_warps_per_shader); + m_response_fifo.push_back(mf); + m_stats->n_mem_to_simt[m_cluster_id] += mf->get_num_flits(false); + } +} + void simt_core_cluster::get_pdom_stack_top_info(unsigned sid, unsigned tid, unsigned *pc, unsigned *rpc) const { diff --git a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h index e658a14c9..ee10af664 100644 --- a/src/gpgpu-sim/shader.h +++ b/src/gpgpu-sim/shader.h @@ -2015,6 +2015,7 @@ class shader_core_stats : public shader_core_stats_pod { friend class shader_core_ctx; friend class ldst_unit; friend class simt_core_cluster; + friend class sst_simt_core_cluster; friend class scheduler_unit; friend class TwoLevelScheduler; friend class LooseRoundRobbinScheduler; @@ -2624,6 +2625,7 @@ class simt_core_cluster { void cache_invalidate(); bool icnt_injection_buffer_full(unsigned size, bool write); void icnt_inject_request_packet(class mem_fetch *mf); + void update_icnt_stats(class mem_fetch *mf); // for perfect memory interface bool response_queue_full() { @@ -2685,6 +2687,50 @@ class exec_simt_core_cluster : public simt_core_cluster { virtual void create_shader_core_ctx(); }; +/** + * @brief SST cluster class + * + */ +class sst_simt_core_cluster : public exec_simt_core_cluster { + public: + sst_simt_core_cluster(class gpgpu_sim *gpu, unsigned cluster_id, + const shader_core_config *config, + const memory_config *mem_config, + class shader_core_stats *stats, + class memory_stats_t *mstats) + : exec_simt_core_cluster(gpu, cluster_id, config, mem_config, stats, + mstats) {} + + /** + * @brief Check if SST memory request injection + * buffer is full by using extern + * function is_SST_buffer_full() + * defined in Balar + * + * @param size + * @param write + * @param type + * @return true + * @return false + */ + bool SST_injection_buffer_full(unsigned size, bool write, + mem_access_type type); + + /** + * @brief Send memory request packets to SST + * memory + * + * @param mf + */ + void icnt_inject_request_packet_to_SST(class mem_fetch *mf); + + /** + * @brief Advance ICNT between core and SST + * + */ + void icnt_cycle_SST(); +}; + class shader_memory_interface : public mem_fetch_interface { public: shader_memory_interface(shader_core_ctx *core, simt_core_cluster *cluster) { @@ -2725,6 +2771,61 @@ class perfect_memory_interface : public mem_fetch_interface { simt_core_cluster *m_cluster; }; +/** + * @brief SST memory interface + * + */ +class sst_memory_interface : public mem_fetch_interface { + public: + sst_memory_interface(shader_core_ctx *core, sst_simt_core_cluster *cluster) { + m_core = core; + m_cluster = cluster; + } + /** + * @brief For constant, inst, tex cache access + * + * @param size + * @param write + * @return true + * @return false + */ + virtual bool full(unsigned size, bool write) const { + assert(false && "Use the full() method with access type instead!"); + return true; + } + + /** + * @brief With SST, the core will direct all mem access except for + * constant, tex, and inst reads to SST mem system + * (i.e. not modeling constant mem right now), thus + * requiring the mem_access_type information to be passed in + * + * @param size + * @param write + * @param type + * @return true + * @return false + */ + bool full(unsigned size, bool write, mem_access_type type) const { + return m_cluster->SST_injection_buffer_full(size, write, type); + } + + /** + * @brief Push memory request to SST memory system and + * update stats + * + * @param mf + */ + virtual void push(mem_fetch *mf) { + m_core->inc_simt_to_mem(mf->get_num_flits(true)); + m_cluster->icnt_inject_request_packet_to_SST(mf); + } + + private: + shader_core_ctx *m_core; + sst_simt_core_cluster *m_cluster; +}; + inline int scheduler_unit::get_sid() const { return m_shader->get_sid(); } #endif /* SHADER_H */ diff --git a/src/gpgpusim_entrypoint.cc b/src/gpgpusim_entrypoint.cc index 42c6981b0..839fef619 100644 --- a/src/gpgpusim_entrypoint.cc +++ b/src/gpgpusim_entrypoint.cc @@ -43,6 +43,20 @@ static int sg_argc = 3; static const char *sg_argv[] = {"", "-config", "gpgpusim.config"}; +// Help funcs to avoid multiple '->' for SST +GPGPUsim_ctx *GPGPUsim_ctx_ptr() { return GPGPU_Context()->the_gpgpusim; } + +class sst_gpgpu_sim *g_the_gpu() { + return static_cast(GPGPUsim_ctx_ptr()->g_the_gpu); +} + +class stream_manager *g_stream_manager() { + return GPGPUsim_ctx_ptr()->g_stream_manager; +} + +// SST callback +extern void SST_callback_cudaThreadSynchronize_done(); + void *gpgpu_sim_thread_sequential(void *ctx_ptr) { gpgpu_context *ctx = (gpgpu_context *)ctx_ptr; // at most one kernel running at a time @@ -169,6 +183,75 @@ void *gpgpu_sim_thread_concurrent(void *ctx_ptr) { return NULL; } +bool sst_sim_cycles = false; + +bool SST_Cycle() { + // Check if Synchronize is done when SST previously requested + // cudaThreadSynchronize + if (GPGPU_Context()->requested_synchronize && + ((g_stream_manager()->empty() && !GPGPUsim_ctx_ptr()->g_sim_active) || + GPGPUsim_ctx_ptr()->g_sim_done)) { + SST_callback_cudaThreadSynchronize_done(); + GPGPU_Context()->requested_synchronize = false; + } + + if (g_stream_manager()->empty_protected() && + !GPGPUsim_ctx_ptr()->g_sim_done && !g_the_gpu()->active()) { + GPGPUsim_ctx_ptr()->g_sim_active = false; + // printf("stream is empty %d \n", g_stream_manager->empty()); + return false; + } + + if (g_stream_manager()->operation(&sst_sim_cycles) && + !g_the_gpu()->active()) { + if (sst_sim_cycles) { + sst_sim_cycles = false; + } + return false; + } + + // printf("GPGPU-Sim: Give GPU Cycle\n"); + GPGPUsim_ctx_ptr()->g_sim_active = true; + + // functional simulation + if (g_the_gpu()->is_functional_sim()) { + kernel_info_t *kernel = g_the_gpu()->get_functional_kernel(); + assert(kernel); + GPGPUsim_ctx_ptr()->gpgpu_ctx->func_sim->gpgpu_cuda_ptx_sim_main_func( + *kernel); + g_the_gpu()->finish_functional_sim(kernel); + } + + // performance simulation + if (g_the_gpu()->active()) { + g_the_gpu()->SST_cycle(); + sst_sim_cycles = true; + g_the_gpu()->deadlock_check(); + } else { + if (g_the_gpu()->cycle_insn_cta_max_hit()) { + g_stream_manager()->stop_all_running_kernels(); + GPGPUsim_ctx_ptr()->g_sim_done = true; + GPGPUsim_ctx_ptr()->g_sim_active = false; + GPGPUsim_ctx_ptr()->break_limit = true; + } + } + + if (!g_the_gpu()->active()) { + g_the_gpu()->print_stats(GPGPUsim_ctx_ptr()->g_the_gpu->last_streamID); + g_the_gpu()->update_stats(); + GPGPU_Context()->print_simulation_time(); + } + + if (GPGPUsim_ctx_ptr()->break_limit) { + printf( + "GPGPU-Sim: ** break due to reaching the maximum cycles (or " + "instructions) **\n"); + return true; + } + + return false; +} + void gpgpu_context::synchronize() { printf("GPGPU-Sim: synchronize waiting for inactive GPU simulation\n"); the_gpgpusim->g_stream_manager->print(stdout); @@ -187,6 +270,27 @@ void gpgpu_context::synchronize() { // sem_post(&g_sim_signal_start); } +bool gpgpu_context::synchronize_check() { + // printf("GPGPU-Sim: synchronize checking for inactive GPU simulation\n"); + requested_synchronize = true; + the_gpgpusim->g_stream_manager->print(stdout); + fflush(stdout); + // sem_wait(&g_sim_signal_finish); + bool done = false; + pthread_mutex_lock(&(the_gpgpusim->g_sim_lock)); + done = (the_gpgpusim->g_stream_manager->empty() && + !the_gpgpusim->g_sim_active) || + the_gpgpusim->g_sim_done; + pthread_mutex_unlock(&(the_gpgpusim->g_sim_lock)); + if (done) { + printf( + "GPGPU-Sim: synchronize checking: detected inactive GPU simulation " + "thread\n"); + } + fflush(stdout); + return done; +} + void gpgpu_context::exit_simulation() { the_gpgpusim->g_sim_done = true; printf("GPGPU-Sim: exit_simulation called\n"); @@ -220,8 +324,14 @@ gpgpu_sim *gpgpu_context::gpgpu_ptx_sim_init_perf() { assert(setlocale(LC_NUMERIC, "C")); the_gpgpusim->g_the_gpu_config->init(); - the_gpgpusim->g_the_gpu = - new exec_gpgpu_sim(*(the_gpgpusim->g_the_gpu_config), this); + if (the_gpgpusim->g_the_gpu_config->is_SST_mode()) { + // Create SST specific GPGPUSim + the_gpgpusim->g_the_gpu = + new sst_gpgpu_sim(*(the_gpgpusim->g_the_gpu_config), this); + } else { + the_gpgpusim->g_the_gpu = + new exec_gpgpu_sim(*(the_gpgpusim->g_the_gpu_config), this); + } the_gpgpusim->g_stream_manager = new stream_manager( (the_gpgpusim->g_the_gpu), func_sim->g_cuda_launch_blocking); @@ -237,12 +347,17 @@ gpgpu_sim *gpgpu_context::gpgpu_ptx_sim_init_perf() { void gpgpu_context::start_sim_thread(int api) { if (the_gpgpusim->g_sim_done) { the_gpgpusim->g_sim_done = false; - if (api == 1) { - pthread_create(&(the_gpgpusim->g_simulation_thread), NULL, - gpgpu_sim_thread_concurrent, (void *)this); + if (the_gpgpusim->g_the_gpu_config->is_SST_mode()) { + // Do not create concurrent thread in SST mode + g_the_gpu()->init(); } else { - pthread_create(&(the_gpgpusim->g_simulation_thread), NULL, - gpgpu_sim_thread_sequential, (void *)this); + if (api == 1) { + pthread_create(&(the_gpgpusim->g_simulation_thread), NULL, + gpgpu_sim_thread_concurrent, (void *)this); + } else { + pthread_create(&(the_gpgpusim->g_simulation_thread), NULL, + gpgpu_sim_thread_sequential, (void *)this); + } } } } @@ -266,8 +381,13 @@ void gpgpu_context::print_simulation_time() { const unsigned cycles_per_sec = (unsigned)(the_gpgpusim->g_the_gpu->gpu_tot_sim_cycle / difference); printf("gpgpu_simulation_rate = %u (cycle/sec)\n", cycles_per_sec); - printf("gpgpu_silicon_slowdown = %ux\n", - the_gpgpusim->g_the_gpu->shader_clock() * 1000 / cycles_per_sec); + + if (cycles_per_sec == 0) { + printf("gpgpu_silicon_slowdown = Nan\n"); + } else { + printf("gpgpu_silicon_slowdown = %ux\n", + the_gpgpusim->g_the_gpu->shader_clock() * 1000 / cycles_per_sec); + } fflush(stdout); } diff --git a/src/stream_manager.cc b/src/stream_manager.cc index 72f8bb0b2..b974791d0 100644 --- a/src/stream_manager.cc +++ b/src/stream_manager.cc @@ -34,6 +34,12 @@ unsigned CUstream_st::sm_next_stream_uid = 0; +// SST memcpy callbacks +extern void SST_callback_memcpy_H2D_done(); +extern void SST_callback_memcpy_D2H_done(); +extern void SST_callback_memcpy_to_symbol_done(); +extern void SST_callback_memcpy_from_symbol_done(); + CUstream_st::CUstream_st() { m_pending = false; m_uid = sm_next_stream_uid++; @@ -122,11 +128,13 @@ bool stream_operation::do_operation(gpgpu_sim *gpu) { if (g_debug_execution >= 3) printf("memcpy host-to-device\n"); gpu->memcpy_to_gpu(m_device_address_dst, m_host_address_src, m_cnt); m_stream->record_next_done(); + if (gpu->is_SST_mode()) SST_callback_memcpy_H2D_done(); break; case stream_memcpy_device_to_host: if (g_debug_execution >= 3) printf("memcpy device-to-host\n"); gpu->memcpy_from_gpu(m_host_address_dst, m_device_address_src, m_cnt); m_stream->record_next_done(); + if (gpu->is_SST_mode()) SST_callback_memcpy_D2H_done(); break; case stream_memcpy_device_to_device: if (g_debug_execution >= 3) printf("memcpy device-to-device\n"); @@ -138,12 +146,14 @@ bool stream_operation::do_operation(gpgpu_sim *gpu) { gpu->gpgpu_ctx->func_sim->gpgpu_ptx_sim_memcpy_symbol( m_symbol, m_host_address_src, m_cnt, m_offset, 1, gpu); m_stream->record_next_done(); + if (gpu->is_SST_mode()) SST_callback_memcpy_to_symbol_done(); break; case stream_memcpy_from_symbol: if (g_debug_execution >= 3) printf("memcpy from symbol\n"); gpu->gpgpu_ctx->func_sim->gpgpu_ptx_sim_memcpy_symbol( m_symbol, m_host_address_dst, m_cnt, m_offset, 0, gpu); m_stream->record_next_done(); + if (gpu->is_SST_mode()) SST_callback_memcpy_from_symbol_done(); break; case stream_kernel_launch: if (m_sim_mode) { // Functional Sim @@ -472,7 +482,7 @@ void stream_manager::push(stream_operation op) { } if (g_debug_execution >= 3) print_impl(stdout); pthread_mutex_unlock(&m_lock); - if (m_cuda_launch_blocking || stream == NULL) { + if (!m_gpu->is_SST_mode() && (m_cuda_launch_blocking || stream == NULL)) { unsigned int wait_amount = 100; unsigned int wait_cap = 100000; // 100ms while (!empty()) {