Skip to content

Commit

Permalink
1
Browse files Browse the repository at this point in the history
1
  • Loading branch information
vandoor authored Aug 2, 2023
1 parent 5f56cc8 commit 9267d33
Show file tree
Hide file tree
Showing 6 changed files with 860 additions and 0 deletions.
223 changes: 223 additions & 0 deletions DMM.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,223 @@
#pragma once
#include "timedef.h"
#include <cublas_v2.h>

typedef unsigned int uint;
template<typename FLOAT_T>
struct DenseMatrix {
int R, C;
FLOAT_T* v;
DenseMatrix(int R, int C) :R(R), C(C) {
v = (FLOAT_T*)malloc(R * C * sizeof(FLOAT_T));
for (int i = 0; i < R * C; i++) {
v[i] = randreal();
// v[i]=(FLOAT_T)1;
// v[i]=(i/C);
}
}
void del() {
free(v);
}
void clear() {
for (int i = 0; i < R * C; i++) v[i] = 0;
}
FLOAT_T* operator[](int i) {
return v + i * C;
}
void println() {
cout << "[";
for (int i = 0; i < R; i++) {
cout << '[';
for (int j = 0; j < C; j++) {
cout << v[i * C + j];
if (j == C - 1) {
if (i == R - 1) cout << "]]\n";
else cout << "],\n";
}
else cout << ",";
}
}
cout << "--------------------------------------\n";
}
void print() {
cout << "[";
for (int i = 0; i < R; i++) {
cout << '[';
for (int j = 0; j < C; j++) {
cout << v[i * C + j];
if (j == C - 1) {
if (i == R - 1) cout << "]]";
else cout << "],";
}
else cout << ",";
}
}
cout << "--------------------------------------\n";
}
void printErr() {
FLOAT_T s = 0;
for (int i = 0; i < R; i++)for (int j = 0; j < C; j++) {
s += fabs(v[i * C + j]);
}
cout << std::fixed << std::setprecision(5) << "sum=" << s << '\n';
}
void printErr(const DenseMatrix& d) {
FLOAT_T s = 0;
for (int i = 0; i < R; i++)for (int j = 0; j < C; j++) {
s += fabs(v[i * C + j] - d.v[i * C + j]);
}
cout << std::fixed << std::setprecision(5) << "sum=" << s << '\n';
}
};

__global__ void DenseMatMul(double* a, double* b, double* c, int n, int k, int m) {
int i = blockDim.x * blockIdx.x + threadIdx.x;
int j = blockDim.y * blockIdx.y + threadIdx.y;
if (i < n && j < m) {
double* val = c + i * m + j;
*val = 0;
for (int l = 0; l < k; l++) {
*val += a[i * k + l] * b[l * m + j];
}
}
}


template<uint block_size>
__global__ void DenseMatMulBlock(double* a, double* b, double* c, int n, int k, int m) {
int blockX = blockIdx.x;
int blockY = blockIdx.y;
int lux = blockX * block_size;
int luy = blockY * block_size;

int totala = n * k;
int totalb = k * m;
int ti = threadIdx.x, tj = threadIdx.y;
double val = 0;


for (int d = 0; d < k; d += block_size) {
__shared__ double As[block_size][block_size];
__shared__ double Bs[block_size][block_size];

int offa = lux * k + d;
int offb = d * m + luy;

double* astart = a + offa;
double* bstart = b + offb;

if (offa + ti * k + tj < totala) As[ti][tj] = astart[ti * k + tj];
if (offb + ti * m + tj < totalb) Bs[ti][tj] = bstart[ti * m + tj];

__syncthreads();
int eu = block_size < k - d ? block_size : k - d;
for (int e = 0; e < eu; e++) {
val += As[ti][e] * Bs[e][tj];
}
__syncthreads();
}
double* cstart = c + lux * m + luy;
if (lux + ti < n && luy + tj < m) {
cstart[ti * m + tj] = val;
}
}

/*
warmUpTime:0
#
805128192 805128192 805128192
start multiplication
multiplication:0
copy back time:96735
yes
*/



void DenseMatMul(DenseMatrix<double> A, DenseMatrix<double> B, DenseMatrix<double> C) {
double t0, t1, t2;
int N = A.R, K = A.C, M = B.C;
int size_a = N * K * sizeof(double);
int size_b = K * M * sizeof(double);
int size_c = N * M * sizeof(double);
double* a;
double* b;
double* c;

cout << "#\n";
cout << size_a << ' ' << size_b << ' ' << size_c << endl;
cudaMalloc(&a, size_a);
cudaMalloc(&b, size_b);
cudaMalloc(&c, size_c);


if (!a) {
cout << "!a\n";
return;
}
if (!b) {
cout << "!b\n";
return;
}
if (!c) {
cout << "!c\n";
return;
}
cudaMemcpy(a, A.v, size_a, cudaMemcpyHostToDevice);
cudaMemcpy(b, B.v, size_b, cudaMemcpyHostToDevice);

const int block_size = 32;
int BX = (N + block_size - 1) / block_size, BY = (M + block_size - 1) / block_size;
dim3 thread_nums(block_size, block_size);
dim3 block_nums(BX, BY);

cout << "start multiplication\n";
t0 = time();
DenseMatMulBlock<block_size> << <block_nums, thread_nums >> > (a, b, c, N, K, M);
t1 = time();
cout << "multiplication:" << t1 - t0 << endl;
cudaMemcpy(C.v, c, size_c, cudaMemcpyDeviceToHost);

t2 = time();
cout << "copy back time:" << t2 - t1 << endl;
cudaFree(a);
cudaFree(b);
cudaFree(c);

}


void check(DenseMatrix<double> A, DenseMatrix<double> B, DenseMatrix<double> C) {
cublasHandle_t handle;
cublasCreate(&handle);
double alpha = 1.0, beta = -1.0;
double* a;
double* b;
double* c;
int N = A.R;
int K = A.C;
int M = B.C;
cout << M << " " << K << ' ' << N << endl;
int size_a = N * K * sizeof(double);
int size_b = K * M * sizeof(double);
int size_c = N * M * sizeof(double);
cudaMalloc(&a, size_a);
cudaMalloc(&b, size_b);
cudaMalloc(&c, size_c);

cudaMemcpy(a, A.v, size_a, cudaMemcpyHostToDevice);
cudaMemcpy(b, B.v, size_b, cudaMemcpyHostToDevice);
cudaMemcpy(c, C.v, size_c, cudaMemcpyHostToDevice);

cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, M, N, K, &alpha, b, M, a, K, &beta, c, M);
cublasDestroy(handle);

cudaMemcpy(C.v, c, size_c, cudaMemcpyDeviceToHost);


cudaFree(a);
cudaFree(b);
cudaFree(c);

}
153 changes: 153 additions & 0 deletions SpMM.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
#pragma once
#include <iostream>
using std::cout;
using std::endl;

struct tpl {
int r, c;
double val;
};
struct SparseMatrix {
int n, m, nnz;
tpl* coo;
SparseMatrix(int n, int m, int nnz) :n(n), m(m), nnz(nnz) {
coo = (tpl*)malloc(sizeof(tpl) * nnz);
for (int i = 0; i < nnz; i++) coo[i] = { i / m,i % m,1 };
}
SparseMatrix(char* s) {
int r, c;
double va;
std::ifstream fin;
fin.open(s, std::ios::in);
fin >> n >> m >> nnz;
coo = (tpl*)malloc(sizeof(tpl) * nnz);
for (int i = 0; i < nnz; i++) {
fin >> r >> c >> va;
coo[i] = { r,c,va };
}
fin.close();
}
void print() {
std::vector<std::vector<double>>a(n, std::vector<double>(m));
for (int i = 0; i < nnz; i++) {
a[coo[i].r][coo[i].c] = coo[i].val;
}
for (int i = 0; i < n; i++) {
for (int j = 0; j < m; j++) {
cout << std::fixed << std::setprecision(3) << a[i][j] << ' ';
}cout << endl;
}
}
void del() {
cout << "!";
free(coo);
cout << "ok\n";
}
};

struct DCSR {
int* nzId;
int* offset;
int* e;
double* w;
int nnz;
int r, c;
DCSR() {
nzId=offset=e=nullptr;
w = nullptr;
nnz=r=c=0;
}
DCSR(SparseMatrix m, bool COL = 0, bool compress = 0) {
int R = COL ? m.m : m.n;
int C = COL ? m.n : m.m;
r = R, c = C;
int U = m.nnz;
int r, c;
double v;
std::vector<std::vector<std::pair<int, double> > >rec(R);
if (compress) {
std::vector<int>ids;
for (int i = 0; i < U; i++) {
tpl t = m.coo[i];
r = t.r;
c = t.c;
if (COL) std::swap(r, c);
v = t.val;
rec[r].push_back({ c,v });
if ((int)rec[r].size() == 1) {
ids.push_back(r);
}
}
int K = ids.size();
nzId = (int*)malloc(K * sizeof(int));
offset = (int*)malloc((K + 1) * sizeof(int));
e = (int*)malloc(U * sizeof(int));
w = (double*)malloc(U * sizeof(double));
for (int i = 0; i < K; i++) nzId[i] = ids[i];
int cur = 0;
for (int i = 0; i < K; i++) {
offset[i] = cur;
sort(rec[nzId[i]].begin(), rec[nzId[i]].end());
for (auto& [c, v] : rec[nzId[i]]) {
e[cur] = c;
w[cur] = v;
cur++;
}
}
offset[K] = cur;
assert(cur == U);
}
else {
std::vector<int>ids;
for (int i = 0; i < U; i++) {
tpl t = m.coo[i];
r = t.r;
c = t.c;
if (COL) std::swap(r, c);
v = t.val;
rec[r].push_back({ c,v });
}
offset = (int*)malloc((R + 1) * sizeof(int));
e = (int*)malloc(U * sizeof(int));
w = (double*)malloc(U * sizeof(double));
int cur = 0;
for (int i = 0; i < R; i++) {
offset[i] = cur;
sort(rec[i].begin(), rec[i].end());
for (auto& [c, v] : rec[i]) {
e[cur] = c;
w[cur] = v;
cur++;
}
}
offset[R] = cur;
assert(cur == U);
}
nnz = U;
}
void print() {
cout << r << ' ' << c << ":" << nnz << endl;
for (int i = 0; i <= r; i++) {
cout << offset[i] << ' ';
}cout << endl;
for (int i = 0; i < nnz; i++) {
cout << e[i] << ' ';
}cout << endl;
for (int i = 0; i < nnz; i++) {
cout << w[i] << ' ';
}cout << endl;
}
void del() {
if (nzId != nullptr) free(nzId);
free(offset);
free(e);
free(w);
}
void setspace(int R,int nnz){
if(offset!=nullptr) this->del();
offset=new int [R];
e=new int[nnz];
w=new double[nnz];
}
};

Loading

0 comments on commit 9267d33

Please sign in to comment.