From 9267d33fc397d162ffa41e334b1dcc8eb6e993bd Mon Sep 17 00:00:00 2001 From: vandoor <41376266+vandoor@users.noreply.github.com> Date: Wed, 2 Aug 2023 15:07:08 +0800 Subject: [PATCH] 1 1 --- DMM.h | 223 ++++++++++++++++++++++++++++++++++++++++ SpMM.h | 153 +++++++++++++++++++++++++++ cusparse_matmul.h | 256 ++++++++++++++++++++++++++++++++++++++++++++++ gen.cpp | 59 +++++++++++ test.cpp | 164 +++++++++++++++++++++++++++++ timedef.h | 5 + 6 files changed, 860 insertions(+) create mode 100644 DMM.h create mode 100644 SpMM.h create mode 100644 cusparse_matmul.h create mode 100644 gen.cpp create mode 100644 test.cpp create mode 100644 timedef.h diff --git a/DMM.h b/DMM.h new file mode 100644 index 0000000..8a5ef86 --- /dev/null +++ b/DMM.h @@ -0,0 +1,223 @@ +#pragma once +#include "timedef.h" +#include + +typedef unsigned int uint; +template +struct DenseMatrix { + int R, C; + FLOAT_T* v; + DenseMatrix(int R, int C) :R(R), C(C) { + v = (FLOAT_T*)malloc(R * C * sizeof(FLOAT_T)); + for (int i = 0; i < R * C; i++) { + v[i] = randreal(); + // v[i]=(FLOAT_T)1; + // v[i]=(i/C); + } + } + void del() { + free(v); + } + void clear() { + for (int i = 0; i < R * C; i++) v[i] = 0; + } + FLOAT_T* operator[](int i) { + return v + i * C; + } + void println() { + cout << "["; + for (int i = 0; i < R; i++) { + cout << '['; + for (int j = 0; j < C; j++) { + cout << v[i * C + j]; + if (j == C - 1) { + if (i == R - 1) cout << "]]\n"; + else cout << "],\n"; + } + else cout << ","; + } + } + cout << "--------------------------------------\n"; + } + void print() { + cout << "["; + for (int i = 0; i < R; i++) { + cout << '['; + for (int j = 0; j < C; j++) { + cout << v[i * C + j]; + if (j == C - 1) { + if (i == R - 1) cout << "]]"; + else cout << "],"; + } + else cout << ","; + } + } + cout << "--------------------------------------\n"; + } + void printErr() { + FLOAT_T s = 0; + for (int i = 0; i < R; i++)for (int j = 0; j < C; j++) { + s += fabs(v[i * C + j]); + } + cout << std::fixed << std::setprecision(5) << "sum=" << s << '\n'; + } + void printErr(const DenseMatrix& d) { + FLOAT_T s = 0; + for (int i = 0; i < R; i++)for (int j = 0; j < C; j++) { + s += fabs(v[i * C + j] - d.v[i * C + j]); + } + cout << std::fixed << std::setprecision(5) << "sum=" << s << '\n'; + } +}; + +__global__ void DenseMatMul(double* a, double* b, double* c, int n, int k, int m) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + if (i < n && j < m) { + double* val = c + i * m + j; + *val = 0; + for (int l = 0; l < k; l++) { + *val += a[i * k + l] * b[l * m + j]; + } + } +} + + +template +__global__ void DenseMatMulBlock(double* a, double* b, double* c, int n, int k, int m) { + int blockX = blockIdx.x; + int blockY = blockIdx.y; + int lux = blockX * block_size; + int luy = blockY * block_size; + + int totala = n * k; + int totalb = k * m; + int ti = threadIdx.x, tj = threadIdx.y; + double val = 0; + + + for (int d = 0; d < k; d += block_size) { + __shared__ double As[block_size][block_size]; + __shared__ double Bs[block_size][block_size]; + + int offa = lux * k + d; + int offb = d * m + luy; + + double* astart = a + offa; + double* bstart = b + offb; + + if (offa + ti * k + tj < totala) As[ti][tj] = astart[ti * k + tj]; + if (offb + ti * m + tj < totalb) Bs[ti][tj] = bstart[ti * m + tj]; + + __syncthreads(); + int eu = block_size < k - d ? block_size : k - d; + for (int e = 0; e < eu; e++) { + val += As[ti][e] * Bs[e][tj]; + } + __syncthreads(); + } + double* cstart = c + lux * m + luy; + if (lux + ti < n && luy + tj < m) { + cstart[ti * m + tj] = val; + } +} + +/* + +warmUpTime:0 +# +805128192 805128192 805128192 +start multiplication +multiplication:0 +copy back time:96735 +yes +*/ + + + +void DenseMatMul(DenseMatrix A, DenseMatrix B, DenseMatrix C) { + double t0, t1, t2; + int N = A.R, K = A.C, M = B.C; + int size_a = N * K * sizeof(double); + int size_b = K * M * sizeof(double); + int size_c = N * M * sizeof(double); + double* a; + double* b; + double* c; + + cout << "#\n"; + cout << size_a << ' ' << size_b << ' ' << size_c << endl; + cudaMalloc(&a, size_a); + cudaMalloc(&b, size_b); + cudaMalloc(&c, size_c); + + + if (!a) { + cout << "!a\n"; + return; + } + if (!b) { + cout << "!b\n"; + return; + } + if (!c) { + cout << "!c\n"; + return; + } + cudaMemcpy(a, A.v, size_a, cudaMemcpyHostToDevice); + cudaMemcpy(b, B.v, size_b, cudaMemcpyHostToDevice); + + const int block_size = 32; + int BX = (N + block_size - 1) / block_size, BY = (M + block_size - 1) / block_size; + dim3 thread_nums(block_size, block_size); + dim3 block_nums(BX, BY); + + cout << "start multiplication\n"; + t0 = time(); + DenseMatMulBlock << > > (a, b, c, N, K, M); + t1 = time(); + cout << "multiplication:" << t1 - t0 << endl; + cudaMemcpy(C.v, c, size_c, cudaMemcpyDeviceToHost); + + t2 = time(); + cout << "copy back time:" << t2 - t1 << endl; + cudaFree(a); + cudaFree(b); + cudaFree(c); + +} + + +void check(DenseMatrix A, DenseMatrix B, DenseMatrix C) { + cublasHandle_t handle; + cublasCreate(&handle); + double alpha = 1.0, beta = -1.0; + double* a; + double* b; + double* c; + int N = A.R; + int K = A.C; + int M = B.C; + cout << M << " " << K << ' ' << N << endl; + int size_a = N * K * sizeof(double); + int size_b = K * M * sizeof(double); + int size_c = N * M * sizeof(double); + cudaMalloc(&a, size_a); + cudaMalloc(&b, size_b); + cudaMalloc(&c, size_c); + + cudaMemcpy(a, A.v, size_a, cudaMemcpyHostToDevice); + cudaMemcpy(b, B.v, size_b, cudaMemcpyHostToDevice); + cudaMemcpy(c, C.v, size_c, cudaMemcpyHostToDevice); + + cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, M, N, K, &alpha, b, M, a, K, &beta, c, M); + cublasDestroy(handle); + + cudaMemcpy(C.v, c, size_c, cudaMemcpyDeviceToHost); + + + cudaFree(a); + cudaFree(b); + cudaFree(c); + +} \ No newline at end of file diff --git a/SpMM.h b/SpMM.h new file mode 100644 index 0000000..48d988a --- /dev/null +++ b/SpMM.h @@ -0,0 +1,153 @@ +#pragma once +#include +using std::cout; +using std::endl; + +struct tpl { + int r, c; + double val; +}; +struct SparseMatrix { + int n, m, nnz; + tpl* coo; + SparseMatrix(int n, int m, int nnz) :n(n), m(m), nnz(nnz) { + coo = (tpl*)malloc(sizeof(tpl) * nnz); + for (int i = 0; i < nnz; i++) coo[i] = { i / m,i % m,1 }; + } + SparseMatrix(char* s) { + int r, c; + double va; + std::ifstream fin; + fin.open(s, std::ios::in); + fin >> n >> m >> nnz; + coo = (tpl*)malloc(sizeof(tpl) * nnz); + for (int i = 0; i < nnz; i++) { + fin >> r >> c >> va; + coo[i] = { r,c,va }; + } + fin.close(); + } + void print() { + std::vector>a(n, std::vector(m)); + for (int i = 0; i < nnz; i++) { + a[coo[i].r][coo[i].c] = coo[i].val; + } + for (int i = 0; i < n; i++) { + for (int j = 0; j < m; j++) { + cout << std::fixed << std::setprecision(3) << a[i][j] << ' '; + }cout << endl; + } + } + void del() { + cout << "!"; + free(coo); + cout << "ok\n"; + } +}; + +struct DCSR { + int* nzId; + int* offset; + int* e; + double* w; + int nnz; + int r, c; + DCSR() { + nzId=offset=e=nullptr; + w = nullptr; + nnz=r=c=0; + } + DCSR(SparseMatrix m, bool COL = 0, bool compress = 0) { + int R = COL ? m.m : m.n; + int C = COL ? m.n : m.m; + r = R, c = C; + int U = m.nnz; + int r, c; + double v; + std::vector > >rec(R); + if (compress) { + std::vectorids; + for (int i = 0; i < U; i++) { + tpl t = m.coo[i]; + r = t.r; + c = t.c; + if (COL) std::swap(r, c); + v = t.val; + rec[r].push_back({ c,v }); + if ((int)rec[r].size() == 1) { + ids.push_back(r); + } + } + int K = ids.size(); + nzId = (int*)malloc(K * sizeof(int)); + offset = (int*)malloc((K + 1) * sizeof(int)); + e = (int*)malloc(U * sizeof(int)); + w = (double*)malloc(U * sizeof(double)); + for (int i = 0; i < K; i++) nzId[i] = ids[i]; + int cur = 0; + for (int i = 0; i < K; i++) { + offset[i] = cur; + sort(rec[nzId[i]].begin(), rec[nzId[i]].end()); + for (auto& [c, v] : rec[nzId[i]]) { + e[cur] = c; + w[cur] = v; + cur++; + } + } + offset[K] = cur; + assert(cur == U); + } + else { + std::vectorids; + for (int i = 0; i < U; i++) { + tpl t = m.coo[i]; + r = t.r; + c = t.c; + if (COL) std::swap(r, c); + v = t.val; + rec[r].push_back({ c,v }); + } + offset = (int*)malloc((R + 1) * sizeof(int)); + e = (int*)malloc(U * sizeof(int)); + w = (double*)malloc(U * sizeof(double)); + int cur = 0; + for (int i = 0; i < R; i++) { + offset[i] = cur; + sort(rec[i].begin(), rec[i].end()); + for (auto& [c, v] : rec[i]) { + e[cur] = c; + w[cur] = v; + cur++; + } + } + offset[R] = cur; + assert(cur == U); + } + nnz = U; + } + void print() { + cout << r << ' ' << c << ":" << nnz << endl; + for (int i = 0; i <= r; i++) { + cout << offset[i] << ' '; + }cout << endl; + for (int i = 0; i < nnz; i++) { + cout << e[i] << ' '; + }cout << endl; + for (int i = 0; i < nnz; i++) { + cout << w[i] << ' '; + }cout << endl; + } + void del() { + if (nzId != nullptr) free(nzId); + free(offset); + free(e); + free(w); + } + void setspace(int R,int nnz){ + if(offset!=nullptr) this->del(); + offset=new int [R]; + e=new int[nnz]; + w=new double[nnz]; + } +}; + diff --git a/cusparse_matmul.h b/cusparse_matmul.h new file mode 100644 index 0000000..ddba8ab --- /dev/null +++ b/cusparse_matmul.h @@ -0,0 +1,256 @@ +#pragma once + +#include +#define CHECK_ERROR(V) {\ + status=V;\ + if(status!=CUSPARSE_STATUS_SUCCESS){cout<<"error at:"<<__LINE__;}\ +} + +void check(SparseMatrix A, SparseMatrix B, DCSR* ret) { + cusparseHandle_t handle = 0; + cusparseStatus_t status; + csrgemm2Info_t info = NULL; + CHECK_ERROR(cusparseCreate(&handle)); + + cusparseMatDescr_t descrA, descrB, descrC, descrD; + cusparseCreateMatDescr(&descrA); + cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL); + cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO); + + cusparseCreateMatDescr(&descrB); + cusparseSetMatType(descrB, CUSPARSE_MATRIX_TYPE_GENERAL); + cusparseSetMatIndexBase(descrB, CUSPARSE_INDEX_BASE_ZERO); + + cusparseCreateMatDescr(&descrC); + cusparseSetMatType(descrC, CUSPARSE_MATRIX_TYPE_GENERAL); + cusparseSetMatIndexBase(descrC, CUSPARSE_INDEX_BASE_ZERO); + + cusparseCreateMatDescr(&descrD); + cusparseSetMatType(descrD, CUSPARSE_MATRIX_TYPE_GENERAL); + cusparseSetMatIndexBase(descrD, CUSPARSE_INDEX_BASE_ZERO); + + + //////////////////the following described some usages: + //////////////////note that cublas and cusparse are column prioritized + + /* + + DenseMatrix MAT(8,4); + MAT[0][1]=MAT[0][2]=0; + MAT[1][3]=0; + MAT[4][3]=0; + MAT[5][2]=0; + MAT[6][0]=MAT[6][1]=MAT[6][2]=0; + + int*nnz_RowA; + int nnzA; + cudaMalloc(&nnz_RowA, sizeof(int)*MAT.R); + + int size_a = sizeof(double)*MAT.R*MAT.C; + double*a; + cudaMalloc(&a, size_a); + cudaMemcpy(a, MAT.v, size_a, cudaMemcpyHostToDevice); + + + MAT.println(); + + CHECK_ERROR(cusparseDnnz(handle, CUSPARSE_DIRECTION_COLUMN, MAT.C, MAT.R, descrA, a, MAT.C, nnz_RowA, &nnzA)); + cout<<"nnzA:"<r = n; + ret->c = m; + ret->nnz = nnzC; + ret->offset = (int*)malloc(sizeof(int) * (n + 1)); + ret->e = (int*)malloc(sizeof(int) * (nnzC)); + ret->w = (double*)malloc(sizeof(double) * (nnzC)); + + cudaMemcpy(ret->offset, csrRowPtrC, sizeof(int) * (n + 1), cudaMemcpyDeviceToHost); + cudaMemcpy(ret->e, csrColIdxC, sizeof(int) * nnzC, cudaMemcpyDeviceToHost); + cudaMemcpy(ret->w, csrValC, sizeof(double) * nnzC, cudaMemcpyDeviceToHost); + + cout << nnzC << endl; + for (int i = 0; i <= n; i++) { + cout << ret->offset[i] << ' '; + }cout << endl; + for (int i = 0; i < nnzC; i++) { + cout << ret->e[i] << ' '; + }cout << endl; + for (int i = 0; i < nnzC; i++) { + cout << std::fixed << std::setprecision(3) << ret->w[i] << ' '; + } + cout << endl; + + + cusparseDestroyCsrgemm2Info(info); + cudaFree(csrRowPtrA); + cudaFree(csrRowPtrB); + cudaFree(csrRowPtrC); + cudaFree(csrRowPtrD); + cudaFree(csrColIdxA); + cudaFree(csrColIdxB); + cudaFree(csrColIdxC); + cudaFree(csrColIdxD); + cudaFree(csrValA); + cudaFree(csrValB); + cudaFree(csrValC); + cudaFree(csrValD); +} + +#undef CHECK_ERROR diff --git a/gen.cpp b/gen.cpp new file mode 100644 index 0000000..37eabc5 --- /dev/null +++ b/gen.cpp @@ -0,0 +1,59 @@ +#include +#define rep(i,a,b) for(int i=(a),i##ss=(b);i<=i##ss;i++) +#define dwn(i,a,b) for(int i=(a),i##ss=(b);i>=i##ss;i--) +#define rng(i,a,b) for(int i=(a);i<(b);i++) +#define deb(x) cerr<<(#x)<<":"<<(x)<<'\n' +#define pb push_back +#define mkp make_pair +#define fi first +#define se second +#define hvie '\n' +using namespace std; +typedef pair pii; +typedef long long ll; +typedef unsigned long long ull; +typedef double db; +int yh(){ + int ret=0;bool f=0;char c=getchar(); + while(!isdigit(c)){if(c==EOF)return -1;if(c=='-')f=1;c=getchar();} + while(isdigit(c))ret=(ret<<3)+(ret<<1)+(c^48),c=getchar(); + return f?-ret:ret; +} +const int maxn=3e5+5; +ull randll(){ + static ull seed=1141514; + seed ^= seed << 13; + seed ^= seed >> 7; + seed ^= seed << 17; + return seed; +} +double randreal(){ + return (long long)randll()*1.0/LLONG_MAX; +} + + +signed main(){ + int n=yh(),m=yh(),nnz=yh(); + assert(0.9*n*m>=nnz); + + string name; + cin>>name; + + freopen(name.c_str(),"w", stdout); + cout<>pos; + + for(int i=1;i<=nnz;i++){ + int x,y; + do{ + x=randll()%n; + y=randll()%m; + }while(pos.count({x,y})); + pos.insert({x,y}); + cout< +#include +#include +#include +#include +#include +#include +#include + +double randreal(){return 0;} + +using namespace std; + +struct tpl{ + int r,c; + double val; +}; +struct SparseMatrix{ + int n,m,nnz; + tpl *coo; + SparseMatrix(int n,int m,int nnz):n(n),m(m),nnz(nnz){ + coo=(tpl*)malloc(sizeof(tpl)*nnz); + for(int i=0;i>n>>m>>nnz; + coo=(tpl*)malloc(sizeof(tpl)*nnz); + for(int i=0;i>r>>c>>va; + coo[i]={r,c,va}; + } + fin.close(); + } + void print(){ + std::vector>a(n,std::vector(m)); + for(int i=0;i > >rec(R); + if(compress){ + std::vectorids; + for(int i=0;iids; + for(int i=0;i +struct KV{ + int k; + T v; +}; + + +using ull=unsigned long long; +int a[1000]; +int find(int x){ + int l=0,r=999,mid; + while(l<=r){ + mid=(l+r)>>1; + if(a[mid]==x) { + return mid; + } + else if(a[mid]>x){ + r=mid-1; + } + else{ + l=mid+1; + } + } + return -1; +} +int main(){ + for(int i=0;i<1000;i++){ + a[i]=2*i; + } + int x;cin>>x; + cout< +#define time GetTickCount +#endif \ No newline at end of file