diff --git a/cpu/Makefile b/cpu/Makefile index 13217a7f..aeb698ac 100644 --- a/cpu/Makefile +++ b/cpu/Makefile @@ -42,7 +42,8 @@ SRC_SERVER = $(RPC_XDR) \ mt-memcpy.c \ cpu-elf2.c \ cpu-server-nvml.c \ - cpu-server-cudnn.c + cpu-server-cudnn.c \ + cpu-server-cublaslt.c SRC_SERVER_LIB = server-library.c SRC_SERVER_EXE = server-exe.c @@ -62,7 +63,8 @@ SRC_CLIENT = $(RPC_XDR) \ cpu-elf2.c \ cpu-client-nvml.c \ cpu-client-cudnn.c \ - cpu-client-cublas.c + cpu-client-cublas.c \ + cpu-client-cublaslt.c # cpu-client-driver-hidden.c \ @@ -110,7 +112,7 @@ ifdef WITH_IB CC_FLAGS += -DWITH_IB=$(WITH_IB) endif -SERVER_LD_FLAGS = $(LD_FLAGS) -lcudart -lcusolver -lcuda -lcublas -lrt -lpthread -lnvidia-ml -lcudnn +SERVER_LD_FLAGS = $(LD_FLAGS) -lcudart -lcusolver -lcuda -lcublas -lrt -lpthread -lnvidia-ml -lcudnn -lcublasLt SERVER_BIN_LD_FLAGS = $(SERVER_LD_FLAGS) -Wl,--unresolved-symbols=ignore-in-object-files CLIENT_LD_FLAGS = $(LD_FLAGS) diff --git a/cpu/cpu-client-cublas.c b/cpu/cpu-client-cublas.c index f9fbc159..f385c1c6 100644 --- a/cpu/cpu-client-cublas.c +++ b/cpu/cpu-client-cublas.c @@ -31,7 +31,7 @@ cublasStatus_t cublasCreate_v2(cublasHandle_t* handle) clnt_perror (clnt, "call failed"); } if (result.err == 0) { - *handle = (void*)result.ptr_result_u.ptr; + *handle = (cublasHandle_t)result.ptr_result_u.ptr; } return result.err; } @@ -93,7 +93,28 @@ DEF_FN(cublasStatus_t, cublasGetPointerMode_v2, cublasHandle_t, handle, cublasPo DEF_FN(cublasStatus_t, cublasSetPointerMode_v2, cublasHandle_t, handle, cublasPointerMode_t, mode); DEF_FN(cublasStatus_t, cublasGetAtomicsMode, cublasHandle_t, handle, cublasAtomicsMode_t*, mode); DEF_FN(cublasStatus_t, cublasSetAtomicsMode, cublasHandle_t, handle, cublasAtomicsMode_t, mode); -DEF_FN(cublasStatus_t, cublasGetMathMode, cublasHandle_t, handle, cublasMath_t*, mode); + +cublasStatus_t cublasGetMathMode(cublasHandle_t handle, cublasMath_t *mode) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + + int_result result; 
+ enum clnt_stat retval_1; + retval_1 = rpc_cublasgetmathmode_1( + (ptr)handle, + &result, clnt + ); + if (retval_1 != RPC_SUCCESS) { + clnt_perror (clnt, "call failed"); + } + if (result.err == 0) { + *mode = result.int_result_u.data; + } + return result.err; +} + cublasStatus_t cublasSetMathMode(cublasHandle_t handle, cublasMath_t mode) { #ifdef WITH_API_CNT @@ -605,7 +626,6 @@ cublasStatus_t cublasSgemmEx(cublasHandle_t handle, DEF_FN(cublasStatus_t, cublasSgemmEx_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const float*, alpha, const void*, A, cudaDataType, Atype, int64_t, lda, const void*, B, cudaDataType, Btype, int64_t, ldb, const float*, beta, void*, C, cudaDataType, Ctype, int64_t, ldc); -DEF_FN(cublasStatus_t, cublasGemmEx, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int, m, int, n, int, k, const void*, alpha, const void*, A, cudaDataType, Atype, int, lda, const void*, B, cudaDataType, Btype, int, ldb, const void*, beta, void*, C, cudaDataType, Ctype, int, ldc, cublasComputeType_t, computeType, cublasGemmAlgo_t, algo); DEF_FN(cublasStatus_t, cublasGemmEx_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const void*, alpha, const void*, A, cudaDataType, Atype, int64_t, lda, const void*, B, cudaDataType, Btype, int64_t, ldb, const void*, beta, void*, C, cudaDataType, Ctype, int64_t, ldc, cublasComputeType_t, computeType, cublasGemmAlgo_t, algo); DEF_FN(cublasStatus_t, cublasCgemmEx, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int, m, int, n, int, k, const cuComplex*, alpha, const void*, A, cudaDataType, Atype, int, lda, const void*, B, cudaDataType, Btype, int, ldb, const cuComplex*, beta, void*, C, cudaDataType, Ctype, int, ldc); DEF_FN(cublasStatus_t, cublasCgemmEx_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, 
int64_t, m, int64_t, n, int64_t, k, const cuComplex*, alpha, const void*, A, cudaDataType, Atype, int64_t, lda, const void*, B, cudaDataType, Btype, int64_t, ldb, const cuComplex*, beta, void*, C, cudaDataType, Ctype, int64_t, ldc); @@ -691,7 +711,6 @@ DEF_FN(cublasStatus_t, cublasCgemm3mBatched, cublasHandle_t, handle, cublasOpera DEF_FN(cublasStatus_t, cublasCgemm3mBatched_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const cuComplex*, alpha, const cuComplex* const*, Aarray, int64_t, lda, const cuComplex* const*, Barray, int64_t, ldb, const cuComplex*, beta, cuComplex* const*, Carray, int64_t, ldc, int64_t, batchCount); DEF_FN(cublasStatus_t, cublasZgemmBatched, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int, m, int, n, int, k, const cuDoubleComplex*, alpha, const cuDoubleComplex* const*, Aarray, int, lda, const cuDoubleComplex* const*, Barray, int, ldb, const cuDoubleComplex*, beta, cuDoubleComplex* const*, Carray, int, ldc, int, batchCount); DEF_FN(cublasStatus_t, cublasZgemmBatched_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const cuDoubleComplex*, alpha, const cuDoubleComplex* const*, Aarray, int64_t, lda, const cuDoubleComplex* const*, Barray, int64_t, ldb, const cuDoubleComplex*, beta, cuDoubleComplex* const*, Carray, int64_t, ldc, int64_t, batchCount); -DEF_FN(cublasStatus_t, cublasSgemmStridedBatched, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int, m, int, n, int, k, const float*, alpha, const float*, A, int, lda, long long int, strideA, const float*, B, int, ldb, long long int, strideB, const float*, beta, float*, C, int, ldc, long long int, strideC, int, batchCount); DEF_FN(cublasStatus_t, cublasSgemmStridedBatched_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const 
float*, alpha, const float*, A, int64_t, lda, long long int, strideA, const float*, B, int64_t, ldb, long long int, strideB, const float*, beta, float*, C, int64_t, ldc, long long int, strideC, int64_t, batchCount); DEF_FN(cublasStatus_t, cublasDgemmStridedBatched, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int, m, int, n, int, k, const double*, alpha, const double*, A, int, lda, long long int, strideA, const double*, B, int, ldb, long long int, strideB, const double*, beta, double*, C, int, ldc, long long int, strideC, int, batchCount); DEF_FN(cublasStatus_t, cublasDgemmStridedBatched_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const double*, alpha, const double*, A, int64_t, lda, long long int, strideA, const double*, B, int64_t, ldb, long long int, strideB, const double*, beta, double*, C, int64_t, ldc, long long int, strideC, int64_t, batchCount); @@ -703,7 +722,6 @@ DEF_FN(cublasStatus_t, cublasZgemmStridedBatched, cublasHandle_t, handle, cublas DEF_FN(cublasStatus_t, cublasZgemmStridedBatched_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int64_t, lda, long long int, strideA, const cuDoubleComplex*, B, int64_t, ldb, long long int, strideB, const cuDoubleComplex*, beta, cuDoubleComplex*, C, int64_t, ldc, long long int, strideC, int64_t, batchCount); DEF_FN(cublasStatus_t, cublasGemmBatchedEx, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int, m, int, n, int, k, const void*, alpha, const void* const*, Aarray, cudaDataType, Atype, int, lda, const void* const*, Barray, cudaDataType, Btype, int, ldb, const void*, beta, void* const*, Carray, cudaDataType, Ctype, int, ldc, int, batchCount, cublasComputeType_t, computeType, cublasGemmAlgo_t, algo); DEF_FN(cublasStatus_t, cublasGemmBatchedEx_64, 
cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const void*, alpha, const void* const*, Aarray, cudaDataType, Atype, int64_t, lda, const void* const*, Barray, cudaDataType, Btype, int64_t, ldb, const void*, beta, void* const*, Carray, cudaDataType, Ctype, int64_t, ldc, int64_t, batchCount, cublasComputeType_t, computeType, cublasGemmAlgo_t, algo); -DEF_FN(cublasStatus_t, cublasGemmStridedBatchedEx, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int, m, int, n, int, k, const void*, alpha, const void*, A, cudaDataType, Atype, int, lda, long long int, strideA, const void*, B, cudaDataType, Btype, int, ldb, long long int, strideB, const void*, beta, void*, C, cudaDataType, Ctype, int, ldc, long long int, strideC, int, batchCount, cublasComputeType_t, computeType, cublasGemmAlgo_t, algo); DEF_FN(cublasStatus_t, cublasGemmStridedBatchedEx_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const void*, alpha, const void*, A, cudaDataType, Atype, int64_t, lda, long long int, strideA, const void*, B, cudaDataType, Btype, int64_t, ldb, long long int, strideB, const void*, beta, void*, C, cudaDataType, Ctype, int64_t, ldc, long long int, strideC, int64_t, batchCount, cublasComputeType_t, computeType, cublasGemmAlgo_t, algo); DEF_FN(cublasStatus_t, cublasSgeam, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int, m, int, n, const float*, alpha, const float*, A, int, lda, const float*, beta, const float*, B, int, ldb, float*, C, int, ldc); DEF_FN(cublasStatus_t, cublasSgeam_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, const float*, alpha, const float*, A, int64_t, lda, const float*, beta, const float*, B, int64_t, ldb, float*, C, int64_t, ldc); @@ -761,3 +779,152 @@ DEF_FN(cublasStatus_t, cublasSgetrsBatched, cublasHandle_t, 
handle, cublasOperat DEF_FN(cublasStatus_t, cublasDgetrsBatched, cublasHandle_t, handle, cublasOperation_t, trans, int, n, int, nrhs, const double* const*, Aarray, int, lda, const int*, devIpiv, double* const*, Barray, int, ldb, int*, info, int, batchSize); DEF_FN(cublasStatus_t, cublasCgetrsBatched, cublasHandle_t, handle, cublasOperation_t, trans, int, n, int, nrhs, const cuComplex* const*, Aarray, int, lda, const int*, devIpiv, cuComplex* const*, Barray, int, ldb, int*, info, int, batchSize); DEF_FN(cublasStatus_t, cublasZgetrsBatched, cublasHandle_t, handle, cublasOperation_t, trans, int, n, int, nrhs, const cuDoubleComplex* const*, Aarray, int, lda, const int*, devIpiv, cuDoubleComplex* const*, Barray, int, ldb, int*, info, int, batchSize); + +cublasStatus_t cublasGemmStridedBatchedEx(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const void *alpha, + const void *A, + cudaDataType_t Atype, + int lda, + long long int strideA, + const void *B, + cudaDataType_t Btype, + int ldb, + long long int strideB, + const void *beta, + void *C, + cudaDataType_t Ctype, + int ldc, + long long int strideC, + int batchCount, + cublasComputeType_t computeType, + cublasGemmAlgo_t algo) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + retval_1 = rpc_cublasgemmstridedbatchedex_1( + (ptr)handle, + (int)transa, + (int)transb, + m, n, k, + *((float*)alpha), + (ptr)A, + (int)Atype, + lda, + strideA, + (ptr)B, + (int)Btype, + ldb, + strideB, + *((float*)beta), + (ptr)C, + (int)Ctype, + ldc, + strideC, + batchCount, + (int)computeType, + (int)algo, + &result, clnt + ); + if (retval_1 != RPC_SUCCESS) { + clnt_perror (clnt, "call failed"); + } + return result; +} + + +cublasStatus_t cublasGemmEx(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const void *alpha, + const void *A, + cudaDataType_t Atype, + int lda, 
+ const void *B, + cudaDataType_t Btype, + int ldb, + const void *beta, + void *C, + cudaDataType_t Ctype, + int ldc, + cublasComputeType_t computeType, + cublasGemmAlgo_t algo) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + retval_1 = rpc_cublasgemmex_1( + (ptr)handle, + (int)transa, + (int)transb, + m, n, k, + *((float*)alpha), + (ptr)A, (int)Atype, lda, + (ptr)B, (int)Btype, ldb, + *((float*)beta), + (ptr)C, (int)Ctype, ldc, + computeType, algo, + &result, clnt); + if (retval_1 != RPC_SUCCESS) { + clnt_perror (clnt, "call failed"); + } + return result; +} + + +cublasStatus_t cublasSgemmStridedBatched(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, int n, int k, + const float *alpha, + const float *A, int lda, + long long int strideA, + const float *B, int ldb, + long long int strideB, + const float *beta, + float *C, int ldc, + long long int strideC, + int batchCount) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + + int result; + enum clnt_stat retval_1; + retval_1 = rpc_cublasgemmstridedbatched_1( + (ptr)handle, + (int)transa, + (int)transb, + m, n, k, + *alpha, + (ptr)A, + lda, + strideA, + (ptr)B, + ldb, + strideB, + *beta, + (ptr)C, + ldc, + strideC, + batchCount, + &result, clnt + ); + if (retval_1 != RPC_SUCCESS) { + clnt_perror (clnt, "call failed"); + } + return result; +} diff --git a/cpu/cpu-client-cublaslt.c b/cpu/cpu-client-cublaslt.c new file mode 100644 index 00000000..820b63dd --- /dev/null +++ b/cpu/cpu-client-cublaslt.c @@ -0,0 +1,295 @@ +#define _GNU_SOURCE +#include +#include +#include + +//for strerror +#include +#include + +#include "cpu-libwrap.h" +#include "cpu_rpc_prot.h" +#include "cpu-common.h" +#include "cpu-utils.h" +#include "log.h" + +#ifdef WITH_API_CNT +extern int api_call_cnt; +#endif //WITH_API_CNT + +cublasStatus_t cublasLtCreate(cublasLtHandle_t *lighthandle) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif 
//WITH_API_CNT + ptr_result result; + enum clnt_stat retval_1; + retval_1 = rpc_cublasltcreate_1(&result, clnt); + if (retval_1 != RPC_SUCCESS) { + clnt_perror (clnt, "call failed"); + } + if (result.err == 0) { + *lighthandle = (cublasLtHandle_t)result.ptr_result_u.ptr; + } + return result.err; +} + +cublasStatus_t cublasLtMatrixLayoutCreate( cublasLtMatrixLayout_t *matLayout, + cudaDataType type, + uint64_t rows, + uint64_t cols, + int64_t ld) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + + ptr_result result; + enum clnt_stat retval_1; + retval_1 = rpc_cublasltmatrixlayoutcreate_1( + type, + rows, + cols, + ld, + &result, + clnt + ); + if (retval_1 != RPC_SUCCESS) { + clnt_perror (clnt, "call failed"); + } + if (result.err == 0) { + *matLayout = (void*)result.ptr_result_u.ptr; + } + return result.err; +} + +cublasStatus_t cublasLtMatmulPreferenceCreate(cublasLtMatmulPreference_t *pref) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + + ptr_result result; + enum clnt_stat retval_1; + retval_1 = rpc_cublasltmatmulpreferencecreate_1(&result, clnt); + if (retval_1 != RPC_SUCCESS) { + clnt_perror (clnt, "call failed"); + } + if (result.err == 0) { + *pref = (void*)result.ptr_result_u.ptr; + } + return result.err; +} + + +cublasStatus_t cublasLtMatmulDescCreate( cublasLtMatmulDesc_t *matmulDesc, + cublasComputeType_t computeType, + cudaDataType_t scaleType) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + + ptr_result result; + enum clnt_stat retval_1; + retval_1 = rpc_cublasltmatmuldesccreate_1( + computeType, + scaleType, + &result, + clnt + ); + + if (retval_1 != RPC_SUCCESS) { + clnt_perror (clnt, "call failed"); + } + if (result.err == 0) { + *matmulDesc = (cublasLtMatmulDesc_t)result.ptr_result_u.ptr; + } + return result.err; +} + +cublasStatus_t cublasLtMatmulDescDestroy(cublasLtMatmulDesc_t matmulDesc) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + + int result; + enum clnt_stat 
retval_1; + retval_1 = rpc_cublasltmatmuldescdestroy_1((ptr)matmulDesc, &result, clnt); + if (retval_1 != RPC_SUCCESS) { + clnt_perror (clnt, "call failed"); + } + return result; +} + + +cublasStatus_t cublasLtMatmulAlgoGetHeuristic( + cublasLtHandle_t lightHandle, + cublasLtMatmulDesc_t operationDesc, + cublasLtMatrixLayout_t Adesc, + cublasLtMatrixLayout_t Bdesc, + cublasLtMatrixLayout_t Cdesc, + cublasLtMatrixLayout_t Ddesc, + cublasLtMatmulPreference_t preference, + int requestedAlgoCount, + cublasLtMatmulHeuristicResult_t heuristicResultsArray[], + int* returnAlgoCount) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + + matmul_hr_result result; + enum clnt_stat retval_1; + retval_1 = rpc_cublasltmatmulalgogetheuristic_1( + (ptr)lightHandle, + (ptr)operationDesc, + (ptr)Adesc, + (ptr)Bdesc, + (ptr)Cdesc, + (ptr)Ddesc, + (ptr)preference, + requestedAlgoCount, + &result, + clnt + ); + + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result.err != 0) { + return result.err; + } + + *returnAlgoCount = result.matmul_hr_result_u.data.s; + if (memcpy(heuristicResultsArray, result.matmul_hr_result_u.data.p, sizeof(cublasLtMatmulHeuristicResult_t)) == NULL) { + LOGE(LOG_ERROR, "error: matmul hr alloc"); + return result.err; + } + + return result.err; +} + +cublasStatus_t cublasLtMatmulPreferenceSetAttribute( + cublasLtMatmulPreference_t pref, + cublasLtMatmulPreferenceAttributes_t attr, + const void *buf, + size_t sizeInBytes) +{ + return 0; +} + +cublasStatus_t cublasLtMatmulPreferenceDestroy(cublasLtMatmulPreference_t pref) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + + int result; + enum clnt_stat retval_1; + retval_1 = rpc_cublasltmatmulpreferencedestroy_1((ptr)pref, &result, clnt); + if (retval_1 != RPC_SUCCESS) { + clnt_perror (clnt, "call failed"); + } + return result; +} + +cublasStatus_t cublasLtMatrixLayoutDestroy(cublasLtMatrixLayout_t matLayout) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif
//WITH_API_CNT + + int result; + enum clnt_stat retval_1; + retval_1 = rpc_cublasltmatrixlayoutdestroy_1((ptr)matLayout, &result, clnt); + if (retval_1 != RPC_SUCCESS) { + clnt_perror (clnt, "call failed"); + } + return result; +} + +cublasStatus_t cublasLtMatmulDescSetAttribute( + cublasLtMatmulDesc_t matmulDesc, + cublasLtMatmulDescAttributes_t attr, + const void *buf, + size_t sizeInBytes) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + + mem_data data = { + .mem_data_len = sizeInBytes, + .mem_data_val = (char *)buf + }; + + int result; + enum clnt_stat retval_1; + retval_1 = rpc_cublasltmatmuldescsetattribute_1( + (ptr)matmulDesc, + attr, + data, + &result, + clnt + ); + + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + + return result; +} + + +cublasStatus_t cublasLtMatmul( + cublasLtHandle_t lightHandle, + cublasLtMatmulDesc_t computeDesc, + const void *alpha, + const void *A, + cublasLtMatrixLayout_t Adesc, + const void *B, + cublasLtMatrixLayout_t Bdesc, + const void *beta, + const void *C, + cublasLtMatrixLayout_t Cdesc, + void *D, + cublasLtMatrixLayout_t Ddesc, + const cublasLtMatmulAlgo_t *algo, + void *workspace, + size_t workspaceSizeInBytes, + cudaStream_t stream) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + + int result; + enum clnt_stat retval_1; + retval_1 = rpc_cublasltmatmul_1( + (ptr)lightHandle, + (ptr)computeDesc, + *((float*)alpha), + (ptr)A, + (ptr)Adesc, + (ptr)B, + (ptr)Bdesc, + *((float*)beta), + (ptr)C, + (ptr)Cdesc, + (ptr)D, + (ptr)Ddesc, + (ptr)algo, + (ptr)workspace, + workspaceSizeInBytes, + (ptr)stream, + &result, + clnt + ); + if (retval_1 != RPC_SUCCESS) { + clnt_perror (clnt, "call failed"); + } + return result; +} + diff --git a/cpu/cpu-client-cudnn.c b/cpu/cpu-client-cudnn.c index 05136fe2..02d583c2 100644 --- a/cpu/cpu-client-cudnn.c +++ b/cpu/cpu-client-cudnn.c @@ -39,6 +39,7 @@ size_t cudnnGetMaxDeviceVersion(void) } return result; } 
+ size_t cudnnGetCudartVersion(void) { #ifdef WITH_API_CNT @@ -1402,15 +1403,58 @@ cudnnStatus_t cudnnDestroyConvolutionDescriptor(cudnnConvolutionDescriptor_t con } return result; } -DEF_FN(cudnnStatus_t, cudnnSetConvolutionMathType, cudnnConvolutionDescriptor_t, convDesc, cudnnMathType_t, mathType) DEF_FN(cudnnStatus_t, cudnnGetConvolutionMathType, cudnnConvolutionDescriptor_t, convDesc, cudnnMathType_t*, mathType) -DEF_FN(cudnnStatus_t, cudnnSetConvolutionGroupCount, cudnnConvolutionDescriptor_t, convDesc, int, groupCount) DEF_FN(cudnnStatus_t, cudnnGetConvolutionGroupCount, cudnnConvolutionDescriptor_t, convDesc, int*, groupCount) DEF_FN(cudnnStatus_t, cudnnSetConvolutionReorderType, cudnnConvolutionDescriptor_t, convDesc, cudnnReorderType_t, reorderType) DEF_FN(cudnnStatus_t, cudnnGetConvolutionReorderType, cudnnConvolutionDescriptor_t, convDesc, cudnnReorderType_t*, reorderType) DEF_FN(cudnnStatus_t, cudnnSetConvolution2dDescriptor, cudnnConvolutionDescriptor_t, convDesc, int, pad_h, int, pad_w, int, u, int, v, int, dilation_h, int, dilation_w, cudnnConvolutionMode_t, mode, cudnnDataType_t, computeType) DEF_FN(cudnnStatus_t, cudnnGetConvolution2dDescriptor, const cudnnConvolutionDescriptor_t, convDesc, int*, pad_h, int*, pad_w, int*, u, int*, v, int*, dilation_h, int*, dilation_w, cudnnConvolutionMode_t*, mode, cudnnDataType_t*, computeType) - + +cudnnStatus_t cudnnSetConvolutionMathType(cudnnConvolutionDescriptor_t convDesc, cudnnMathType_t mathType) +{ + #ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + + int result; + enum clnt_stat retval_1; + retval_1 = rpc_cudnnsetconvolutionmathtype_1( + (ptr)convDesc, + mathType, + &result, clnt); + + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } + return result; +} + +cudnnStatus_t cudnnSetConvolutionGroupCount(cudnnConvolutionDescriptor_t 
convDesc, int groupCount) +{ + #ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + + int result; + enum clnt_stat retval_1; + retval_1 = rpc_cudnnsetconvolutiongroupcount_1( + (ptr)convDesc, + groupCount, + &result, clnt); + + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } + return result; +} + + cudnnStatus_t cudnnSetConvolutionNdDescriptor(cudnnConvolutionDescriptor_t convDesc, int arrayLength, const int* padA, const int* filterStrideA, const int* dilationA, cudnnConvolutionMode_t mode, cudnnDataType_t computeType) { #ifdef WITH_API_CNT @@ -1851,4 +1895,4 @@ cudnnStatus_t cudnnBackendExecute(cudnnHandle_t handle, cudnnBackendDescriptor_t LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); } return result; -} \ No newline at end of file +} diff --git a/cpu/cpu-client.c b/cpu/cpu-client.c index c054965c..15068909 100644 --- a/cpu/cpu-client.c +++ b/cpu/cpu-client.c @@ -220,7 +220,7 @@ void __attribute__((destructor)) deinit_rpc(void) } if (clnt != NULL) { - clnt_destroy(clnt); + clnt_destroy(clnt); } } @@ -267,7 +267,7 @@ void *dlopen(const char *filename, int flag) } /* filename is NULL or not in replace_libs list */ if ((ret = dlopen_orig(filename, flag)) == NULL) { - LOGE(LOG_ERROR, "dlopen failed: ", dlerror()); + LOGE(LOG_ERROR, "dlopen %s failed: %s", filename, dlerror()); } else if (has_kernel) { dlinfo(ret, RTLD_DI_LINKMAP, &map); LOGE(LOG_DEBUG, "dlopen to %p", map->l_addr); @@ -418,4 +418,4 @@ void __cudaUnregisterFatBinary(void **fatCubinHandle) // &result, clnt); // if (retval_1 != RPC_SUCCESS) { // clnt_perror (clnt, "call failed"); // } -// } \ No newline at end of file +// } diff --git a/cpu/cpu-libwrap.c b/cpu/cpu-libwrap.c index 28ae0217..96c61c1e 100644 --- a/cpu/cpu-libwrap.c +++ b/cpu/cpu-libwrap.c @@ -3,7 +3,7 @@ #include "cpu-libwrap.h" #include "log.h" -static
const char* LIBCUDA_PATH; +static const char* LIBCUDA_PATH = "/usr/local/cuda/lib64/libcudart.so"; static void *so_handle; inline void* libwrap_get_sohandle() diff --git a/cpu/cpu-libwrap.h b/cpu/cpu-libwrap.h index 5b3a8ba7..50a5ae87 100644 --- a/cpu/cpu-libwrap.h +++ b/cpu/cpu-libwrap.h @@ -21,8 +21,8 @@ void libwrap_post_call(char *ret, char *name, char *parameters); } \ #define DEF_FN_BODY(RET, NAME, P_NAMES...) \ + LOG(LOG_DEBUG, "%s call", #NAME); \ DEF_DLSYM(RET, NAME) \ - LOG(LOG_DEBUG, "%s called", #NAME); \ CAL_FN_PTR(P_NAMES); \ LOG(LOG_DEBUG, "%s finished", #NAME); \ return ret; diff --git a/cpu/cpu-server-cublas.c b/cpu/cpu-server-cublas.c index 2fe5cdfa..a6ec1260 100644 --- a/cpu/cpu-server-cublas.c +++ b/cpu/cpu-server-cublas.c @@ -46,7 +46,9 @@ bool_t rpc_cublascreate_1_svc(ptr_result *result, struct svc_req *rqstp) GSCHED_RETAIN; result->err = cublasCreate_v2((cublasHandle_t*)&result->ptr_result_u.ptr); - resource_mg_create(&rm_cublas, (void*)result->ptr_result_u.ptr); + if (resource_mg_create(&rm_cublas, (void*)result->ptr_result_u.ptr) != 0) { + LOGE(LOG_ERROR, "error in resource manager"); + } GSCHED_RELEASE; RECORD_RESULT(ptr_result_u, *result); @@ -300,4 +302,122 @@ bool_t rpc_cublassgemmex_1_svc(ptr handle, int transa, int transb, int m, int n, GSCHED_RELEASE; RECORD_RESULT(integer, *result); return 1; -} \ No newline at end of file +} + +bool_t rpc_cublasgetmathmode_1_svc(ptr handle, int_result *result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + GSCHED_RETAIN; + result-> err = cublasGetMathMode( + (cublasHandle_t)resource_mg_get(&rm_cublas, (void*)handle), + (cublasMath_t*)&result->int_result_u.data + ); + GSCHED_RELEASE; + return 1; +} + + +bool_t rpc_cublasgemmstridedbatchedex_1_svc( + ptr handle, + int transa, + int transb, + int m, int n, int k, + float alpha, + ptr A, + int Atype, + int lda, + ll strideA, + ptr B, + int Btype, + int ldb, + ll strideB, + float beta, + ptr C, + int Ctype, + int ldc, + ll strideC, + 
int batchCount, + int computeType, + int algo, + int *result, struct svc_req *rqstp +) +{ + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + GSCHED_RETAIN; + *result = cublasGemmStridedBatchedEx( + (cublasHandle_t)resource_mg_get(&rm_cublas, (void*)handle), + (cublasOperation_t) transa, + (cublasOperation_t) transb, + m, n, k, &alpha, + resource_mg_get(&rm_memory, (void*)A), (cudaDataType_t)Atype, lda, (long long int)strideA, + resource_mg_get(&rm_memory, (void*)B), (cudaDataType_t)Btype, ldb, (long long int)strideB, + &beta, + resource_mg_get(&rm_memory, (void*)C), (cudaDataType_t)Ctype, ldc, (long long int)strideC, + batchCount, + (cublasComputeType_t)computeType, + (cublasGemmAlgo_t)algo + ); + GSCHED_RELEASE; + return 1; +} + +bool_t rpc_cublasgemmex_1_svc(ptr handle, int transa, int transb, int m, int n, int k, float alpha, + ptr A, int Atype, int lda, + ptr B, int Btype, int ldb, float beta, + ptr C, int Ctype, int ldc, + int computeType, int algo, + int *result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "cublasGemmEx"); + GSCHED_RETAIN; + *result = cublasGemmEx(resource_mg_get(&rm_cublas, (void*)handle), + (cublasOperation_t) transa, + (cublasOperation_t) transb, + m, n, k, &alpha, + resource_mg_get(&rm_memory, (void*)A), (cudaDataType_t)Atype, lda, + resource_mg_get(&rm_memory, (void*)B), (cudaDataType_t)Btype, ldb, &beta, + resource_mg_get(&rm_memory, (void*)C), (cudaDataType_t)Ctype, ldc, + (cublasComputeType_t)computeType, + (cublasGemmAlgo_t)algo + ); + GSCHED_RELEASE; + return 1; +} + + +bool_t rpc_cublasgemmstridedbatched_1_svc( + ptr handle, + int transa, + int transb, + int m, int n, int k, + float alpha, + ptr A, + int lda, + ll strideA, + ptr B, + int ldb, + ll strideB, + float beta, + ptr C, + int ldc, + ll strideC, + int batchCount, + int *result, struct svc_req *rqstp +) +{ + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + GSCHED_RETAIN; + *result = cublasSgemmStridedBatched( + (cublasHandle_t)resource_mg_get(&rm_cublas, (void*)handle), + (cublasOperation_t) 
transa, + (cublasOperation_t) transb, + m, n, k, &alpha, + resource_mg_get(&rm_memory, (void*)A), lda, (long long int)strideA, + resource_mg_get(&rm_memory, (void*)B), ldb, (long long int)strideB, + &beta, + resource_mg_get(&rm_memory, (void*)C), ldc, (long long int)strideC, + batchCount + ); + GSCHED_RELEASE; + return 1; +} diff --git a/cpu/cpu-server-cublaslt.c b/cpu/cpu-server-cublaslt.c new file mode 100644 index 00000000..d52e998c --- /dev/null +++ b/cpu/cpu-server-cublaslt.c @@ -0,0 +1,211 @@ +#include +#include +#include +#include +#include + +//for strerror +#include +#include + +#include "cpu_rpc_prot.h" +#include "cpu-common.h" +#include "cpu-utils.h" +#include "log.h" +#include "resource-mg.h" +#define WITH_RECORDER +#include "api-recorder.h" +#include "cpu-server-cublas.h" +#include "gsched.h" + + +int cublaslt_init(int bypass, resource_mg *memory) +{ + int ret = 0; + ret = resource_mg_init(&rm_cublaslt, bypass); + return ret; +} + +int cublaslt_deinit(void) +{ + resource_mg_free(&rm_cublaslt); + return 0; +} + +bool_t rpc_cublasltcreate_1_svc(ptr_result *result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "cublasLtCreate"); + + GSCHED_RETAIN; + result->err = cublasLtCreate((cublasLtHandle_t*)&result->ptr_result_u.ptr); + resource_mg_create(&rm_cublaslt, (void*)result->ptr_result_u.ptr); + GSCHED_RELEASE; + return 1; +} + +bool_t rpc_cublasltmatrixlayoutcreate_1_svc(int type, uint64_t row, uint64_t cols, int64_t ld, ptr_result *result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "cublasLtMatmulLayoutCreate"); + + GSCHED_RETAIN; + result-> err = cublasLtMatrixLayoutCreate( + (cublasLtMatrixLayout_t*)&result->ptr_result_u.ptr, + (cudaDataType)type, + row, + cols, + ld + ); + if (resource_mg_create(&rm_cublaslt, (void*)result->ptr_result_u.ptr) != 0) { + LOGE(LOG_ERROR, "error in resource manager"); + } + GSCHED_RELEASE; + return 1; +} + +bool_t rpc_cublasltmatmulpreferencecreate_1_svc(ptr_result *result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG,
"cublasLtMatmulPreferenceCreate"); + + GSCHED_RETAIN; + result-> err = cublasLtMatmulPreferenceCreate( + (cublasLtMatmulPreference_t*)&result->ptr_result_u.ptr + ); + if (resource_mg_create(&rm_cublaslt, (void*)result->ptr_result_u.ptr) != 0) { + LOGE(LOG_ERROR, "error in resource manager"); + } + GSCHED_RELEASE; + return 1; +} + +bool_t rpc_cublasltmatmuldesccreate_1_svc(int computeType, int scaleType, ptr_result *result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "cublasLtMatmulDescCreate"); + + GSCHED_RETAIN; + result->err = cublasLtMatmulDescCreate((cublasLtMatmulDesc_t*)&result->ptr_result_u.ptr, + (cublasComputeType_t)computeType, + (cudaDataType_t)scaleType); + if (resource_mg_create(&rm_cublaslt, (void*)result->ptr_result_u.ptr) != 0) { + LOGE(LOG_ERROR, "error in resource manager"); + } + GSCHED_RELEASE; + return 1; +} + +bool_t rpc_cublasltmatmuldescdestroy_1_svc(ptr matmulDesc, int *result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "cublasLtMatmulDescDestroy"); + + GSCHED_RETAIN; + *result = cublasLtMatmulDescDestroy(resource_mg_get(&rm_cublaslt, (void*)matmulDesc)); + GSCHED_RELEASE; + return 1; +} + +bool_t rpc_cublasltmatmulpreferencedestroy_1_svc(ptr pref, int *result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "cublasLtMatmulPreferenceDestroy"); + + GSCHED_RETAIN; + *result = cublasLtMatmulPreferenceDestroy(resource_mg_get(&rm_cublaslt, (void*)pref)); + GSCHED_RELEASE; + return 1; +} + +bool_t rpc_cublasltmatrixlayoutdestroy_1_svc(ptr matLayout, int *result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "cublasLtMatrixLayoutDestroy"); + + GSCHED_RETAIN; + *result = cublasLtMatrixLayoutDestroy(resource_mg_get(&rm_cublaslt, (void*)matLayout)); + GSCHED_RELEASE; + return 1; +} + +bool_t rpc_cublasltmatmulalgogetheuristic_1_svc(ptr lightHandle, ptr operationDesc, ptr Adesc, ptr Bdesc, ptr Cdesc, ptr Ddesc, ptr preference, int requestedAlgoCount, matmul_hr_result *result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "cublasLtMatmulAlgoGetHeuristic"); + 
GSCHED_RETAIN; + + if (sizeof(result->matmul_hr_result_u.data.p) != sizeof(cublasLtMatmulHeuristicResult_t)) { + LOGE(LOG_ERROR, "cublasLtMatmulHeuristicResult_t size mismatch"); + return 0; + } + + result->err = cublasLtMatmulAlgoGetHeuristic( + (cublasLtHandle_t)resource_mg_get(&rm_cublaslt, (void*)lightHandle), + (cublasLtMatmulDesc_t)resource_mg_get(&rm_cublaslt, (void*)operationDesc), + (cublasLtMatrixLayout_t)resource_mg_get(&rm_cublaslt, (void*)Adesc), + (cublasLtMatrixLayout_t)resource_mg_get(&rm_cublaslt, (void*)Bdesc), + (cublasLtMatrixLayout_t)resource_mg_get(&rm_cublaslt, (void*)Cdesc), + (cublasLtMatrixLayout_t)resource_mg_get(&rm_cublaslt, (void*)Ddesc), + (cublasLtMatmulPreference_t)resource_mg_get(&rm_cublaslt, (void*)preference), + requestedAlgoCount, + (void*)&result->matmul_hr_result_u.data.p, + &result->matmul_hr_result_u.data.s); + + GSCHED_RELEASE; + return 1; +} + +bool_t rpc_cublasltmatmuldescsetattribute_1_svc(ptr matmulDesc, int attr, mem_data data, int *result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "cublasLtMatmulDescSetAttribute"); + GSCHED_RETAIN; + + *result = cublasLtMatmulDescSetAttribute( + (cublasLtMatmulDesc_t)resource_mg_get(&rm_cublaslt, (void*)matmulDesc), + (cublasLtMatmulDescAttributes_t)attr, + data.mem_data_val, + data.mem_data_len + ); + + GSCHED_RELEASE; + return 1; +} + + +bool_t rpc_cublasltmatmul_1_svc(ptr lightHandle, + ptr computeDesc, + float alpha, + ptr A, + ptr Adesc, + ptr B, + ptr Bdesc, + float beta, + ptr C, + ptr Cdesc, + ptr D, + ptr Ddesc, + ptr algo, + ptr workspace, + size_t workspaceSizeInBytes, + ptr stream, + int *result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "cublasLtMatmul"); + GSCHED_RETAIN; + + *result = cublasLtMatmul( + (cublasLtHandle_t)resource_mg_get(&rm_cublaslt, (void*)lightHandle), + (cublasLtMatmulDesc_t)resource_mg_get(&rm_cublaslt, (void*)computeDesc), + &alpha, + resource_mg_get(&rm_memory, (void*)A), + (cublasLtMatrixLayout_t)resource_mg_get(&rm_cublaslt, (void*)Adesc), + 
resource_mg_get(&rm_memory, (void*)B), + (cublasLtMatrixLayout_t)resource_mg_get(&rm_cublaslt, (void*)Bdesc), + &beta, + resource_mg_get(&rm_memory, (void*)C), + (cublasLtMatrixLayout_t)resource_mg_get(&rm_cublaslt, (void*)Cdesc), + resource_mg_get(&rm_memory, (void*)D), + (cublasLtMatrixLayout_t)resource_mg_get(&rm_cublaslt, (void*)Ddesc), + // (const cublasLtMatmulAlgo_t *)algo, + NULL, + resource_mg_get(&rm_memory, (void*)workspace), + workspaceSizeInBytes, + (cudaStream_t)resource_mg_get(&rm_streams, (void*)stream) + ); + + GSCHED_RELEASE; + return 1; +} diff --git a/cpu/cpu-server-cublaslt.h b/cpu/cpu-server-cublaslt.h new file mode 100644 index 00000000..8afa1c27 --- /dev/null +++ b/cpu/cpu-server-cublaslt.h @@ -0,0 +1,10 @@ +#ifndef _CPU_SERVER_CUBLASLT_H_ +#define _CPU_SERVER_CUBLASLT_H_ + +#include "resource-mg.h" + +int cublaslt_init(int restore, resource_mg *memory); +int cublaslt_deinit(void); +resource_mg *cublaslt_get_rm(void); + +#endif // _CPU_SERVER_CUBLASLT_H_ diff --git a/cpu/cpu-server-cudnn.c b/cpu/cpu-server-cudnn.c index 70e4abce..d51adb0f 100644 --- a/cpu/cpu-server-cudnn.c +++ b/cpu/cpu-server-cudnn.c @@ -1031,6 +1031,41 @@ bool_t rpc_cudnndestroyconvolutiondescriptor_1_svc(ptr convDesc, int *result, st return 1; } +bool_t rpc_cudnnsetconvolutionmathtype_1_svc(ptr convDesc, int mathType, int *result, struct svc_req *rqstp) +{ + RECORD_API(rpc_cudnnsetconvolutionmathtype_1_argument); + RECORD_NARG(convDesc); + RECORD_NARG(mathType); + + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + *result = cudnnSetConvolutionMathType( + (cudnnConvolutionDescriptor_t)resource_mg_get(&rm_cudnn_convs, (void*)convDesc), + (cudnnMathType_t)mathType); + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + + +bool_t rpc_cudnnsetconvolutiongroupcount_1_svc(ptr convDesc, int groupCount, int *result, struct svc_req *rqstp) +{ + RECORD_API(rpc_cudnnsetconvolutiongroupcount_1_argument); + RECORD_NARG(convDesc); + RECORD_NARG(groupCount); + + 
LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + *result = cudnnSetConvolutionGroupCount( + (cudnnConvolutionDescriptor_t)resource_mg_get(&rm_cudnn_convs, (void*)convDesc), + groupCount); + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + bool_t rpc_cudnnsetconvolutionnddescriptor_1_svc(ptr convDesc, int arrayLength, mem_data padA, mem_data filterStrideA, mem_data dilationA, int mode, int computeType, int *result, struct svc_req *rqstp) { RECORD_API(rpc_cudnnsetconvolutionnddescriptor_1_argument); @@ -1393,4 +1428,4 @@ bool_t rpc_cudnnbackendexecute_1_svc(ptr handle, ptr executionPlan, ptr variantP GSCHED_RELEASE; RECORD_RESULT(integer, *result); return 1; -} \ No newline at end of file +} diff --git a/cpu/cpu-server-runtime.c b/cpu/cpu-server-runtime.c index 92916d2b..6791e839 100644 --- a/cpu/cpu-server-runtime.c +++ b/cpu/cpu-server-runtime.c @@ -33,6 +33,7 @@ #include "cr.h" #include "cpu-server-cusolver.h" #include "cpu-server-cublas.h" +#include "cpu-server-cublaslt.h" #include "mt-memcpy.h" typedef struct host_alloc_info { @@ -74,6 +75,7 @@ int server_runtime_init(int restore) ret &= resource_mg_init(&rm_memory, 1); ret &= cusolver_init(1, &rm_streams, &rm_memory); ret &= cublas_init(1, &rm_memory); + ret &= cublaslt_init(1, &rm_memory); } else { ret &= resource_mg_init(&rm_streams, 0); ret &= resource_mg_init(&rm_events, 0); @@ -83,6 +85,7 @@ int server_runtime_init(int restore) ret &= cusolver_init(0, &rm_streams, &rm_memory); ret &= cublas_init(0, &rm_memory); ret &= server_runtime_restore("ckp"); + ret &= cublaslt_init(0, &rm_memory); } // Make sure runtime API is initialized @@ -364,7 +367,7 @@ bool_t cuda_get_device_properties_1_svc(int device, cuda_device_prop_result *res { LOGE(LOG_DEBUG, "cudaGetDeviceProperties"); if (sizeof(result->cuda_device_prop_result_u.data) != sizeof(struct cudaDeviceProp)) { - LOGE(LOG_ERROR, "cuda_device_prop_result size mismatch"); + LOGE(LOG_ERROR, "cuda_device_prop_result size mismatch, 
result %zu prop %zu", sizeof(result->cuda_device_prop_result_u.data), sizeof(struct cudaDeviceProp)); return 0; } result->err = cudaGetDeviceProperties((void*)result->cuda_device_prop_result_u.data, device); diff --git a/cpu/cpu_rpc_prot.x b/cpu/cpu_rpc_prot.x index 6d505842..df6fcda5 100644 --- a/cpu/cpu_rpc_prot.x +++ b/cpu/cpu_rpc_prot.x @@ -2,7 +2,9 @@ typedef opaque mem_data<>; typedef unsigned hyper size_t; typedef unsigned hyper ptr; +typedef hyper ll; typedef opaque rpc_cuda_device_prop[1032]; +typedef opaque rpc_matmul_heuristic_result[96]; struct dint { int i1; @@ -19,6 +21,11 @@ struct ptrsz { size_t s; }; +struct matmul_hr { + rpc_matmul_heuristic_result p; + int s; +}; + struct cuda_channel_format_desc { int f; int w; @@ -135,6 +142,13 @@ default: void; }; +union matmul_hr_result switch (int err) { +case 0: + matmul_hr data; +default: + void; +}; + /* memory allocated for RPC. */ /* Freed rpc_cd_prog_1_freeresult by after RPC. */ union mem_result switch (int err) { @@ -570,5 +584,21 @@ program RPC_CD_PROG { int attributeType, hyper requestedElementCount) = 5314; int rpc_cudnnBackendExecute(ptr handle, ptr executionPlan, ptr variantPack) = 5315; + int rpc_cudnnSetConvolutionGroupCount(ptr convDesc, int groupCount) = 5316; + int rpc_cudnnsetconvolutionmathtype(ptr convDesc, int mathType) = 5317; + ptr_result rpc_cublasltcreate(void) = 5318; + ptr_result rpc_cublasltmatmuldesccreate(int computeType, int scaleType) = 5319; + matmul_hr_result rpc_cublasltmatmulalgogetheuristic(ptr handle, ptr operationDesc, ptr aDesc, ptr bDesc, ptr cDesc, ptr dDesc, ptr preference, int requestedAlgoCount) = 5320; + int rpc_cublasltmatmuldescsetattribute(ptr matmulDesc, int attr, mem_data data) = 5321; + int rpc_cublasltmatmuldescdestroy(ptr matmulDesc) = 5322; + ptr_result rpc_cublasltmatrixlayoutcreate(int type, uint64_t row, uint64_t cols, int64_t ld) = 5323; + ptr_result rpc_cublasltmatmulpreferencecreate(void) = 5324; + int rpc_cublasltmatmulpreferencedestroy(ptr pref) = 
5325; + int rpc_cublasltmatrixlayoutdestroy(ptr matLayout) = 5326; + int rpc_cublasltmatmul(ptr lightHandle,ptr computeDesc,float alpha,ptr A,ptr Adesc,ptr B,ptr Bdesc,float beta,ptr C,ptr Cdesc,ptr D,ptr Ddesc,ptr algo,ptr workspace,size_t workspaceSizeInBytes,ptr stream) = 5327; + int_result rpc_cublasgetmathmode(ptr handle) = 5328; + int rpc_cublasgemmstridedbatchedex(ptr handle, int transa, int transb, int m,int n,int k,float alpha,ptr A, int Atype, int lda, ll strideA,ptr B,int Btype,int ldb, ll strideB,float beta, ptr C,int Ctype,int ldc, ll strideC,int batchCount,int computeType,int algo) = 5329; + int rpc_cublasgemmex(ptr, int, int, int, int, int, float,ptr, int, int, ptr, int, int, float, ptr, int, int, int, int) = 5330; + int rpc_cublasgemmstridedbatched(ptr handle, int transa, int transb, int m,int n,int k,float alpha,ptr A, int lda, ll strideA,ptr B,int ldb, ll strideB,float beta, ptr C,int ldc, ll strideC,int batchCount) = 5331; } = 1; } = 99; diff --git a/cpu/resource-mg.h b/cpu/resource-mg.h index ee8c44fa..53d24705 100644 --- a/cpu/resource-mg.h +++ b/cpu/resource-mg.h @@ -38,6 +38,8 @@ resource_mg rm_globals; //Other RMs resource_mg rm_cusolver; resource_mg rm_cublas; +resource_mg rm_cublaslt; + //CUDNN RMs resource_mg rm_cudnn; diff --git a/gpu/src/main.c b/gpu/src/main.c index 7facb092..1f4ec394 100644 --- a/gpu/src/main.c +++ b/gpu/src/main.c @@ -212,7 +212,7 @@ int cricket_restore(int argc, char *argv[]) gdb_init(argc, argv, (char*)patched_binary, NULL); LOGE(LOG_DEBUG, "GDB init"); - execute_command("set exec-wrapper env 'LD_PRELOAD=/home/eiling/projects/cricket/cpu/cricket-server.so' 'CRICKET_RESTORE=1'", !batch_flag); + execute_command("set exec-wrapper env 'LOG=DEBUG' 'LD_PRELOAD=/opt/cricket/bin/cricket-server.so' 'CRICKET_RESTORE=1'", !batch_flag); // load the patched binary //exec_file_attach(patched_binary, !batch_flag); @@ -1106,7 +1106,7 @@ int cricket_start(int argc, char *argv[]) /* load files */ //exec_file_attach(argv[2], 
!batch_flag); // - execute_command("set exec-wrapper env 'LD_PRELOAD=/home/eiling/projects/cricket/bin/libtirpc.so.3:/home/eiling/projects/cricket/cpu/cricket-server.so'", !batch_flag); + execute_command("set exec-wrapper env 'LD_PRELOAD=/opt/cricket/bin/libtirpc.so.3:/opt/cricket/bin/cricket-server.so' 'LOG=DEBUG'", !batch_flag); //execute_command("break main", !batch_flag); execute_command("starti", !batch_flag); //execute_command("unset exec-wrapper", !batch_flag); diff --git a/tests/gpu/checkpoint.sh b/tests/gpu/checkpoint.sh index 2624b66b..e233a88d 100755 --- a/tests/gpu/checkpoint.sh +++ b/tests/gpu/checkpoint.sh @@ -52,4 +52,3 @@ ${CRICKET_BIN} checkpoint $server_pid sleep 2 kill $server_pid -