From 9267d33fc397d162ffa41e334b1dcc8eb6e993bd Mon Sep 17 00:00:00 2001
From: vandoor <41376266+vandoor@users.noreply.github.com>
Date: Wed, 2 Aug 2023 15:07:08 +0800
Subject: [PATCH] 1

1
---
 DMM.h             | 223 ++++++++++++++++++++++++++++++++++++++++
 SpMM.h            | 153 +++++++++++++++++++++++++++
 cusparse_matmul.h | 256 ++++++++++++++++++++++++++++++++++++++++++++++
 gen.cpp           |  59 +++++++++++
 test.cpp          | 164 +++++++++++++++++++++++++++++
 timedef.h         |   5 +
 6 files changed, 860 insertions(+)
 create mode 100644 DMM.h
 create mode 100644 SpMM.h
 create mode 100644 cusparse_matmul.h
 create mode 100644 gen.cpp
 create mode 100644 test.cpp
 create mode 100644 timedef.h
diff --git a/DMM.h b/DMM.h
new file mode 100644
index 0000000..8a5ef86
--- /dev/null
+++ b/DMM.h
@@ -0,0 +1,223 @@
+#pragma once
+#include "timedef.h"
+#include <cublas_v2.h>
+
+typedef unsigned int uint;
+template<typename FLOAT_T>
+struct DenseMatrix {
+	int R, C;
+	FLOAT_T* v;
+	DenseMatrix(int R, int C) :R(R), C(C) {
+		v = (FLOAT_T*)malloc(R * C * sizeof(FLOAT_T));
+		for (int i = 0; i < R * C; i++) {
+			v[i] = randreal();
+			// v[i]=(FLOAT_T)1;
+			// v[i]=(i/C);
+		}
+	}
+	void del() {
+		free(v);
+	}
+	void clear() {
+		for (int i = 0; i < R * C; i++) v[i] = 0;
+	}
+	FLOAT_T* operator[](int i) {
+		return v + i * C;
+	}
+	void println() {
+		cout << "[";
+		for (int i = 0; i < R; i++) {
+			cout << '[';
+			for (int j = 0; j < C; j++) {
+				cout << v[i * C + j];
+				if (j == C - 1) {
+					if (i == R - 1) cout << "]]\n";
+					else cout << "],\n";
+				}
+				else cout << ",";
+			}
+		}
+		cout << "--------------------------------------\n";
+	}
+	void print() {
+		cout << "[";
+		for (int i = 0; i < R; i++) {
+			cout << '[';
+			for (int j = 0; j < C; j++) {
+				cout << v[i * C + j];
+				if (j == C - 1) {
+					if (i == R - 1) cout << "]]";
+					else cout << "],";
+				}
+				else cout << ",";
+			}
+		}
+		cout << "--------------------------------------\n";
+	}
+	void printErr() {
+		FLOAT_T s = 0;
+		for (int i = 0; i < R; i++)for (int j = 0; j < C; j++) {
+			s += fabs(v[i * C + j]);
+		}
+		cout << std::fixed << std::setprecision(5) << "sum=" << s << '\n';
+	}
+	void printErr(const DenseMatrix& d) {
+		FLOAT_T s = 0;
+		for (int i = 0; i < R; i++)for (int j = 0; j < C; j++) {
+			s += fabs(v[i * C + j] - d.v[i * C + j]);
+		}
+		cout << std::fixed << std::setprecision(5) << "sum=" << s << '\n';
+	}
+};
+
+__global__ void DenseMatMul(double* a, double* b, double* c, int n, int k, int m) {
+	int i = blockDim.x * blockIdx.x + threadIdx.x;
+	int j = blockDim.y * blockIdx.y + threadIdx.y;
+	if (i < n && j < m) {
+		double* val = c + i * m + j;
+		*val = 0;
+		for (int l = 0; l < k; l++) {
+			*val += a[i * k + l] * b[l * m + j];
+		}
+	}
+}
+
+
+template<uint block_size>
+__global__ void DenseMatMulBlock(double* a, double* b, double* c, int n, int k, int m) {
+	int blockX = blockIdx.x;
+	int blockY = blockIdx.y;
+	int lux = blockX * block_size;
+	int luy = blockY * block_size;
+
+	int totala = n * k;
+	int totalb = k * m;
+	int ti = threadIdx.x, tj = threadIdx.y;
+	double val = 0;
+
+
+	for (int d = 0; d < k; d += block_size) {
+		__shared__ double As[block_size][block_size];
+		__shared__ double Bs[block_size][block_size];
+
+		int offa = lux * k + d;
+		int offb = d * m + luy;
+
+		double* astart = a + offa;
+		double* bstart = b + offb;
+
+		if (offa + ti * k + tj < totala) As[ti][tj] = astart[ti * k + tj];
+		if (offb + ti * m + tj < totalb) Bs[ti][tj] = bstart[ti * m + tj];
+
+		__syncthreads();
+		int eu = block_size < k - d ? block_size : k - d;
+		for (int e = 0; e < eu; e++) {
+			val += As[ti][e] * Bs[e][tj];
+		}
+		__syncthreads();
+	}
+	double* cstart = c + lux * m + luy;
+	if (lux + ti < n && luy + tj < m) {
+		cstart[ti * m + tj] = val;
+	}
+}
+
+/*
+
+warmUpTime:0
+#
+805128192 805128192 805128192
+start multiplication
+multiplication:0
+copy back time:96735
+yes
+*/
+
+
+
+void DenseMatMul(DenseMatrix<double> A, DenseMatrix<double> B, DenseMatrix<double> C) {
+	double t0, t1, t2;
+	int N = A.R, K = A.C, M = B.C;
+	int size_a = N * K * sizeof(double);
+	int size_b = K * M * sizeof(double);
+	int size_c = N * M * sizeof(double);
+	double* a;
+	double* b;
+	double* c;
+
+	cout << "#\n";
+	cout << size_a << ' ' << size_b << ' ' << size_c << endl;
+	cudaMalloc(&a, size_a);
+	cudaMalloc(&b, size_b);
+	cudaMalloc(&c, size_c);
+
+
+	if (!a) {
+		cout << "!a\n";
+		return;
+	}
+	if (!b) {
+		cout << "!b\n";
+		return;
+	}
+	if (!c) {
+		cout << "!c\n";
+		return;
+	}
+	cudaMemcpy(a, A.v, size_a, cudaMemcpyHostToDevice);
+	cudaMemcpy(b, B.v, size_b, cudaMemcpyHostToDevice);
+
+	const int block_size = 32;
+	int BX = (N + block_size - 1) / block_size, BY = (M + block_size - 1) / block_size;
+	dim3 thread_nums(block_size, block_size);
+	dim3 block_nums(BX, BY);
+
+	cout << "start multiplication\n";
+	t0 = time();
+	DenseMatMulBlock<block_size> << <block_nums, thread_nums >> > (a, b, c, N, K, M);
+	t1 = time();
+	cout << "multiplication:" << t1 - t0 << endl;
+	cudaMemcpy(C.v, c, size_c, cudaMemcpyDeviceToHost);
+
+	t2 = time();
+	cout << "copy back time:" << t2 - t1 << endl;
+	cudaFree(a);
+	cudaFree(b);
+	cudaFree(c);
+
+}
+
+
+void check(DenseMatrix<double> A, DenseMatrix<double> B, DenseMatrix<double> C) {
+	cublasHandle_t handle;
+	cublasCreate(&handle);
+	double alpha = 1.0, beta = -1.0;
+	double* a;
+	double* b;
+	double* c;
+	int N = A.R;
+	int K = A.C;
+	int M = B.C;
+	cout << M << " " << K << ' ' << N << endl;
+	int size_a = N * K * sizeof(double);
+	int size_b = K * M * sizeof(double);
+	int size_c = N * M * sizeof(double);
+	cudaMalloc(&a, size_a);
+	cudaMalloc(&b, size_b);
+	cudaMalloc(&c, size_c);
+
+	cudaMemcpy(a, A.v, size_a, cudaMemcpyHostToDevice);
+	cudaMemcpy(b, B.v, size_b, cudaMemcpyHostToDevice);
+	cudaMemcpy(c, C.v, size_c, cudaMemcpyHostToDevice);
+
+	cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, M, N, K, &alpha, b, M, a, K, &beta, c, M);
+	cublasDestroy(handle);
+
+	cudaMemcpy(C.v, c, size_c, cudaMemcpyDeviceToHost);
+
+
+	cudaFree(a);
+	cudaFree(b);
+	cudaFree(c);
+
+}
\ No newline at end of file
diff --git a/SpMM.h b/SpMM.h
new file mode 100644
index 0000000..48d988a
--- /dev/null
+++ b/SpMM.h
@@ -0,0 +1,153 @@
+#pragma once
+#include <iostream>
+using std::cout;
+using std::endl;
+
+struct tpl {
+	int r, c;
+	double val;
+};
+struct SparseMatrix {
+	int n, m, nnz;
+	tpl* coo;
+	SparseMatrix(int n, int m, int nnz) :n(n), m(m), nnz(nnz) {
+		coo = (tpl*)malloc(sizeof(tpl) * nnz);
+		for (int i = 0; i < nnz; i++) coo[i] = { i / m,i % m,1 };
+	}
+	SparseMatrix(char* s) {
+		int r, c;
+		double va;
+		std::ifstream fin;
+		fin.open(s, std::ios::in);
+		fin >> n >> m >> nnz;
+		coo = (tpl*)malloc(sizeof(tpl) * nnz);
+		for (int i = 0; i < nnz; i++) {
+			fin >> r >> c >> va;
+			coo[i] = { r,c,va };
+		}
+		fin.close();
+	}
+	void print() {
+		std::vector<std::vector<double>>a(n, std::vector<double>(m));
+		for (int i = 0; i < nnz; i++) {
+			a[coo[i].r][coo[i].c] = coo[i].val;
+		}
+		for (int i = 0; i < n; i++) {
+			for (int j = 0; j < m; j++) {
+				cout << std::fixed << std::setprecision(3) << a[i][j] << ' ';
+			}cout << endl;
+		}
+	}
+	void del() {
+		cout << "!";
+		free(coo);
+		cout << "ok\n";
+	}
+};
+
+struct DCSR {
+	int* nzId;
+	int* offset;
+	int* e;
+	double* w;
+	int nnz;
+	int r, c;
+	DCSR() {
+		nzId=offset=e=nullptr;
+		w = nullptr;
+		nnz=r=c=0;
+	}
+	DCSR(SparseMatrix m, bool COL = 0, bool compress = 0) {
+		int R = COL ? m.m : m.n;
+		int C = COL ? m.n : m.m;
+		r = R, c = C;
+		int U = m.nnz;
+		int r, c;
+		double v;
+		std::vector<std::vector<std::pair<int, double> > >rec(R);
+		if (compress) {
+			std::vector<int>ids;
+			for (int i = 0; i < U; i++) {
+				tpl t = m.coo[i];
+				r = t.r;
+				c = t.c;
+				if (COL) std::swap(r, c);
+				v = t.val;
+				rec[r].push_back({ c,v });
+				if ((int)rec[r].size() == 1) {
+					ids.push_back(r);
+				}
+			}
+			int K = ids.size();
+			nzId = (int*)malloc(K * sizeof(int));
+			offset = (int*)malloc((K + 1) * sizeof(int));
+			e = (int*)malloc(U * sizeof(int));
+			w = (double*)malloc(U * sizeof(double));
+			for (int i = 0; i < K; i++) nzId[i] = ids[i];
+			int cur = 0;
+			for (int i = 0; i < K; i++) {
+				offset[i] = cur;
+				sort(rec[nzId[i]].begin(), rec[nzId[i]].end());
+				for (auto& [c, v] : rec[nzId[i]]) {
+					e[cur] = c;
+					w[cur] = v;
+					cur++;
+				}
+			}
+			offset[K] = cur;
+			assert(cur == U);
+		}
+		else {
+			std::vector<int>ids;
+			for (int i = 0; i < U; i++) {
+				tpl t = m.coo[i];
+				r = t.r;
+				c = t.c;
+				if (COL) std::swap(r, c);
+				v = t.val;
+				rec[r].push_back({ c,v });
+			}
+			offset = (int*)malloc((R + 1) * sizeof(int));
+			e = (int*)malloc(U * sizeof(int));
+			w = (double*)malloc(U * sizeof(double));
+			int cur = 0;
+			for (int i = 0; i < R; i++) {
+				offset[i] = cur;
+				sort(rec[i].begin(), rec[i].end());
+				for (auto& [c, v] : rec[i]) {
+					e[cur] = c;
+					w[cur] = v;
+					cur++;
+				}
+			}
+			offset[R] = cur;
+			assert(cur == U);
+		}
+		nnz = U;
+	}
+	void print() {
+		cout << r << ' ' << c << ":" << nnz << endl;
+		for (int i = 0; i <= r; i++) {
+			cout << offset[i] << ' ';
+		}cout << endl;
+		for (int i = 0; i < nnz; i++) {
+			cout << e[i] << ' ';
+		}cout << endl;
+		for (int i = 0; i < nnz; i++) {
+			cout << w[i] << ' ';
+		}cout << endl;
+	}
+	void del() {
+		if (nzId != nullptr) free(nzId);
+		free(offset);
+		free(e);
+		free(w);
+	}
+	void setspace(int R,int nnz){
+		if(offset!=nullptr) this->del();
+		offset=new int [R];
+		e=new int[nnz];
+		w=new double[nnz];
+	}
+};
+
diff --git a/cusparse_matmul.h b/cusparse_matmul.h
new file mode 100644
index 0000000..ddba8ab
--- /dev/null
+++ b/cusparse_matmul.h
@@ -0,0 +1,256 @@
+#pragma once
+
+#include <cusparse.h>
+#define CHECK_ERROR(V) {\
+	status=V;\
+	if(status!=CUSPARSE_STATUS_SUCCESS){cout<<"error at:"<<__LINE__;}\
+}
+
+void check(SparseMatrix A, SparseMatrix B, DCSR* ret) {
+	cusparseHandle_t handle = 0;
+	cusparseStatus_t status;
+	csrgemm2Info_t info = NULL;
+	CHECK_ERROR(cusparseCreate(&handle));
+
+	cusparseMatDescr_t descrA, descrB, descrC, descrD;
+	cusparseCreateMatDescr(&descrA);
+	cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL);
+	cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO);
+
+	cusparseCreateMatDescr(&descrB);
+	cusparseSetMatType(descrB, CUSPARSE_MATRIX_TYPE_GENERAL);
+	cusparseSetMatIndexBase(descrB, CUSPARSE_INDEX_BASE_ZERO);
+
+	cusparseCreateMatDescr(&descrC);
+	cusparseSetMatType(descrC, CUSPARSE_MATRIX_TYPE_GENERAL);
+	cusparseSetMatIndexBase(descrC, CUSPARSE_INDEX_BASE_ZERO);
+
+	cusparseCreateMatDescr(&descrD);
+	cusparseSetMatType(descrD, CUSPARSE_MATRIX_TYPE_GENERAL);
+	cusparseSetMatIndexBase(descrD, CUSPARSE_INDEX_BASE_ZERO);
+
+
+	//////////////////the following described some usages:
+	//////////////////note that cublas and cusparse are column prioritized
+
+	/*
+
+		DenseMatrix<double> MAT(8,4);
+		MAT[0][1]=MAT[0][2]=0;
+		MAT[1][3]=0;
+		MAT[4][3]=0;
+		MAT[5][2]=0;
+		MAT[6][0]=MAT[6][1]=MAT[6][2]=0;
+
+		int*nnz_RowA;
+		int nnzA;
+		cudaMalloc(&nnz_RowA, sizeof(int)*MAT.R);
+
+		int size_a = sizeof(double)*MAT.R*MAT.C;
+		double*a;
+		cudaMalloc(&a, size_a);
+		cudaMemcpy(a, MAT.v, size_a, cudaMemcpyHostToDevice);
+
+
+		MAT.println();
+
+		CHECK_ERROR(cusparseDnnz(handle, CUSPARSE_DIRECTION_COLUMN, MAT.C, MAT.R, descrA, a, MAT.C, nnz_RowA, &nnzA));
+		cout<<"nnzA:"<<nnzA<<endl;
+		int *nnz_row = new int [MAT.R];
+		cudaMemcpy(nnz_row, nnz_RowA, sizeof(int)*MAT.R, cudaMemcpyDeviceToHost);
+		cout<<"nnz_RowA:"<<endl;
+		for(int i=0;i<MAT.R;i++){
+			cout<<nnz_row[i]<<" ";
+		}cout<<"~"<<endl;
+
+		delete[](nnz_row);
+
+		double*csrValA;
+		int*csrIdxA;
+		int*csrColPtrA;
+
+		cudaMalloc(&csrValA, sizeof(double)*nnzA);
+		cudaMalloc(&csrIdxA, sizeof(int)*MAT.R);
+		cudaMalloc(&csrColPtrA, sizeof(int)*nnzA);
+
+		CHECK_ERROR(cusparseDdense2csc(handle, MAT.C, MAT.R, descrA, a, MAT.C, nnz_RowA, csrValA, csrColPtrA, csrIdxA));
+
+		int*csrIdx=new int[MAT.R];
+		int*csrColPtr=new int[nnzA];
+		double*csrVal=new double[nnzA];
+		cudaMemcpy(csrIdx, csrIdxA, sizeof(int)*MAT.R, cudaMemcpyDeviceToHost);
+		cudaMemcpy(csrColPtr, csrColPtrA, sizeof(int)*nnzA, cudaMemcpyDeviceToHost);
+		cudaMemcpy(csrVal, csrValA, sizeof(double)*nnzA, cudaMemcpyDeviceToHost);
+
+		for(int i=0;i<MAT.R;i++){
+			cout<<csrIdx[i]<<' ';
+		}
+		cout<<endl;
+		for(int i=0;i<nnzA;i++){
+			cout<<csrColPtr[i]<<' ';
+		}
+		cout<<endl;
+		for(int i=0;i<nnzA;i++){
+			cout<<csrVal[i]<<' ';
+		}
+		cout<<endl;
+
+
+
+		delete[]csrIdx;
+		delete[]csrColPtr;
+		delete[]csrVal;
+
+	*/
+
+	A.print();
+	B.print();
+
+	DCSR CSRA(A, 0, 0);
+	DCSR CSRB(B, 0, 0);
+
+	// CSRB.print();
+
+	int n, m, k;
+	int nnzA = A.nnz;
+	int nnzB = B.nnz;
+	int nnzC;
+	int* nnzTotalDevHostPtr = &nnzC;
+	int baseC;
+
+	n = A.n;
+	k = A.m;
+	m = B.m;
+
+	cusparseSetPointerMode(handle, CUSPARSE_POINTER_MODE_HOST);
+	cusparseCreateCsrgemm2Info(&info);
+
+	int* csrRowPtrC;
+	int* csrRowPtrA;
+	int* csrRowPtrB;
+	int* csrRowPtrD;
+	int* csrColIdxA;
+	int* csrColIdxB;
+	int* csrColIdxC;
+	int* csrColIdxD;
+
+	cudaMalloc(&csrRowPtrA, sizeof(int) * (n + 1));
+	cudaMalloc(&csrRowPtrC, sizeof(int) * (n + 1));
+	cudaMalloc(&csrRowPtrB, sizeof(int) * (k + 1));
+	cudaMalloc(&csrRowPtrD, sizeof(int) * (n + 1));
+	cudaMalloc(&csrColIdxA, sizeof(int) * (nnzA));
+	cudaMalloc(&csrColIdxB, sizeof(int) * (nnzB));
+	cudaMalloc(&csrColIdxD, sizeof(int));
+
+	cudaMemcpy(csrRowPtrA, CSRA.offset, sizeof(int) * (n + 1), cudaMemcpyHostToDevice);
+	cudaMemcpy(csrRowPtrB, CSRB.offset, sizeof(int) * (k + 1), cudaMemcpyHostToDevice);
+
+	cudaMemcpy(csrColIdxA, CSRA.e, sizeof(int) * (nnzA), cudaMemcpyHostToDevice);
+	cudaMemcpy(csrColIdxB, CSRB.e, sizeof(int) * (nnzB), cudaMemcpyHostToDevice);
+
+	double alpha = 1.0;
+	double beta = 0.0;
+
+	size_t bufferSize;
+	void* buffer = NULL;
+
+	cusparseDcsrgemm2_bufferSizeExt(handle, n, m, k, &alpha,
+		descrA, nnzA, csrRowPtrA, csrColIdxA,
+		descrB, nnzB, csrRowPtrB, csrColIdxB,
+		NULL,
+		descrD, 0, csrRowPtrD, csrColIdxD,
+		info,
+		&bufferSize
+	);
+	cudaMalloc(&buffer, bufferSize);
+	cout << "bufferSize=" << bufferSize << "!\n";
+
+	cusparseXcsrgemm2Nnz(handle, n, m, k,
+		descrA, nnzA, csrRowPtrA, csrColIdxA,
+		descrB, nnzB, csrRowPtrB, csrColIdxB,
+		descrD, 0, csrRowPtrD, csrColIdxD,
+		descrC, csrRowPtrC, nnzTotalDevHostPtr,
+		info, buffer
+	);
+	if (nnzTotalDevHostPtr != nullptr) {
+		nnzC = *nnzTotalDevHostPtr;
+	}
+	else {
+		cudaMemcpy(&nnzC, csrRowPtrC + n, sizeof(int), cudaMemcpyDeviceToHost);
+		cudaMemcpy(&baseC, csrRowPtrC, sizeof(int), cudaMemcpyDeviceToHost);
+		nnzC -= baseC;
+	}
+
+
+	double* csrValA;
+	double* csrValB;
+	double* csrValC;
+	double* csrValD;
+	cudaMalloc(&csrColIdxC, sizeof(double) * nnzC);
+
+	cudaMalloc(&csrValA, sizeof(double) * nnzA);
+	cudaMalloc(&csrValB, sizeof(double) * nnzB);
+	cudaMalloc(&csrValC, sizeof(double) * nnzC);
+	cudaMalloc(&csrValD, sizeof(double) * 1);
+
+	cudaMemcpy(csrValA, CSRA.w, sizeof(double) * nnzA, cudaMemcpyHostToDevice);
+	cudaMemcpy(csrValB, CSRB.w, sizeof(double) * nnzB, cudaMemcpyHostToDevice);
+
+
+	cusparseDcsrgemm2(handle, n, m, k, &alpha,
+		descrA, nnzA, csrValA, csrRowPtrA, csrColIdxA,
+		descrB, nnzB, csrValB, csrRowPtrB, csrColIdxB,
+		NULL,
+		descrD, 0, csrValD, csrRowPtrD, csrColIdxD,
+		descrC, csrValC, csrRowPtrC, csrColIdxC,
+		info, buffer
+	);
+	/*
+	int *nzId;
+	int *offset;
+	int *e;
+	double *w;
+	int nnz;
+	int r,c;
+
+	*/
+	ret->r = n;
+	ret->c = m;
+	ret->nnz = nnzC;
+	ret->offset = (int*)malloc(sizeof(int) * (n + 1));
+	ret->e = (int*)malloc(sizeof(int) * (nnzC));
+	ret->w = (double*)malloc(sizeof(double) * (nnzC));
+
+	cudaMemcpy(ret->offset, csrRowPtrC, sizeof(int) * (n + 1), cudaMemcpyDeviceToHost);
+	cudaMemcpy(ret->e, csrColIdxC, sizeof(int) * nnzC, cudaMemcpyDeviceToHost);
+	cudaMemcpy(ret->w, csrValC, sizeof(double) * nnzC, cudaMemcpyDeviceToHost);
+
+	cout << nnzC << endl;
+	for (int i = 0; i <= n; i++) {
+		cout << ret->offset[i] << ' ';
+	}cout << endl;
+	for (int i = 0; i < nnzC; i++) {
+		cout << ret->e[i] << ' ';
+	}cout << endl;
+	for (int i = 0; i < nnzC; i++) {
+		cout << std::fixed << std::setprecision(3) << ret->w[i] << ' ';
+	}
+	cout << endl;
+
+
+	cusparseDestroyCsrgemm2Info(info);
+	cudaFree(csrRowPtrA);
+	cudaFree(csrRowPtrB);
+	cudaFree(csrRowPtrC);
+	cudaFree(csrRowPtrD);
+	cudaFree(csrColIdxA);
+	cudaFree(csrColIdxB);
+	cudaFree(csrColIdxC);
+	cudaFree(csrColIdxD);
+	cudaFree(csrValA);
+	cudaFree(csrValB);
+	cudaFree(csrValC);
+	cudaFree(csrValD);
+}
+
+#undef CHECK_ERROR
diff --git a/gen.cpp b/gen.cpp
new file mode 100644
index 0000000..37eabc5
--- /dev/null
+++ b/gen.cpp
@@ -0,0 +1,59 @@
+#include<bits/stdc++.h>
+#define rep(i,a,b) for(int i=(a),i##ss=(b);i<=i##ss;i++)
+#define dwn(i,a,b) for(int i=(a),i##ss=(b);i>=i##ss;i--)
+#define rng(i,a,b) for(int i=(a);i<(b);i++)
+#define deb(x) cerr<<(#x)<<":"<<(x)<<'\n'
+#define pb push_back
+#define mkp make_pair
+#define fi first
+#define se second
+#define hvie '\n'
+using namespace std;
+typedef pair<int,int> pii;
+typedef long long ll;
+typedef unsigned long long ull;
+typedef double db;
+int yh(){
+	int ret=0;bool f=0;char c=getchar();
+	while(!isdigit(c)){if(c==EOF)return -1;if(c=='-')f=1;c=getchar();}
+	while(isdigit(c))ret=(ret<<3)+(ret<<1)+(c^48),c=getchar();
+	return f?-ret:ret;
+}
+const int maxn=3e5+5;
+ull randll(){
+	static ull seed=1141514;
+	seed ^= seed << 13;
+	seed ^= seed >> 7;
+	seed ^= seed << 17;
+	return seed;
+}
+double randreal(){
+	return (long long)randll()*1.0/LLONG_MAX;
+}
+
+
+signed main(){
+	int n=yh(),m=yh(),nnz=yh();
+	assert(0.9*n*m>=nnz);
+
+	string name;
+	cin>>name;
+
+	freopen(name.c_str(),"w", stdout);
+	cout<<n<<' '<<m<<' '<<nnz<<'\n';
+
+	set<pair<int,int>>pos;
+
+	for(int i=1;i<=nnz;i++){
+		int x,y;
+		do{
+			x=randll()%n;
+			y=randll()%m;
+		}while(pos.count({x,y}));
+		pos.insert({x,y});
+		cout<<x<<' '<<y<<' '<<(x+y)%7+1<<"\n";
+		// cout<<x<<' '<<y<<' '<<randreal()<<'\n';
+	}
+
+	return 0;
+}
diff --git a/test.cpp b/test.cpp
new file mode 100644
index 0000000..3c3f938
--- /dev/null
+++ b/test.cpp
@@ -0,0 +1,164 @@
+#include<iostream>
+#include<cstdio>
+#include<cstdlib>
+#include<array>
+#include<vector>
+#include<assert.h>
+#include<fstream>
+#include<iomanip>
+
+double randreal(){return 0;}
+
+using namespace std;
+
+struct tpl{
+	int r,c;
+	double val;
+};
+struct SparseMatrix{
+	int n,m,nnz;
+	tpl *coo;
+	SparseMatrix(int n,int m,int nnz):n(n),m(m),nnz(nnz){
+		coo=(tpl*)malloc(sizeof(tpl)*nnz);
+		for(int i=0;i<nnz;i++) coo[i]={i/m,i%m,randreal()};
+	}
+	SparseMatrix(char*s){
+		int r,c; 
+		double va;
+		std::ifstream fin;
+		fin.open(s, std::ios::in);
+		fin>>n>>m>>nnz;
+		coo=(tpl*)malloc(sizeof(tpl)*nnz);
+		for(int i=0;i<nnz;i++){
+			fin>>r>>c>>va;
+			coo[i]={r,c,va};
+		}
+		fin.close();
+	}
+	void print(){
+		std::vector<std::vector<double>>a(n,std::vector<double>(m));
+		for(int i=0;i<nnz;i++){
+			a[coo[i].r][coo[i].c]=coo[i].val;
+		}
+		for(int i=0;i<n;i++){
+			for(int j=0;j<m;j++){
+				cout<<std::fixed<<std::setprecision(3)<<a[i][j]<<' ';
+			}cout<<endl;
+		}
+	}
+	void del(){
+		cout<<"!";
+		free(coo);
+		cout<<"ok\n";
+	}
+};
+
+struct DCSR{
+	int *nzId;
+	int *offset;
+	int *e;
+	double *w;
+	int nnz;
+	int r,c;
+	DCSR(){}
+	DCSR(SparseMatrix m, bool COL=0, bool compress=0){
+		int R=COL?m.m:m.n;
+		int C=COL?m.n:m.m;
+		r=R, c=C;
+		int U=m.nnz;
+		int r,c;
+		double v;
+		std::vector<std::vector<std::pair<int,double> > >rec(R);
+		if(compress){
+			std::vector<int>ids;
+			for(int i=0;i<U;i++){
+				tpl t=m.coo[i];
+				r=t.r;
+				c=t.c;
+				if(COL) std::swap(r,c);
+				v=t.val;
+				rec[r].push_back({c,v});
+				if((int)rec[r].size()==1){
+					ids.push_back(r);
+				}
+			}
+			int K=ids.size();
+			nzId=(int*)malloc(K*sizeof(int));
+			offset=(int*)malloc((K+1)*sizeof(int));
+			e=(int*)malloc(U*sizeof(int));
+			w=(double*)malloc(U*sizeof(double));
+			for(int i=0;i<K;i++) nzId[i]=ids[i];
+			int cur=0;
+			for(int i=0;i<K;i++){
+				offset[i]=cur;
+				for(auto&[c,v]:rec[nzId[i]]){
+					e[cur]=c;
+					w[cur]=v;
+					cur++;
+				}
+			}
+			offset[K]=cur;
+			assert(cur==U);
+		}
+		else{
+			std::vector<int>ids;
+			for(int i=0;i<U;i++){
+				tpl t=m.coo[i];
+				r=t.r;
+				c=t.c;
+				if(COL) std::swap(r,c);
+				v=t.val;
+				rec[r].push_back({c,v});
+			}
+			offset=(int*)malloc((R+1)*sizeof(int));
+			e=(int*)malloc(U*sizeof(int));
+			w=(double*)malloc(U*sizeof(double));
+			int cur=0;
+			for(int i=0;i<R;i++){
+				offset[i]=cur;
+				for(auto&[c,v]:rec[i]){
+					e[cur]=c;
+					w[cur]=v;
+					cur++;
+				}
+			}
+			offset[R]=cur;
+			assert(cur==U);
+		}
+		nnz=U;
+	}
+};
+
+template<typename T>
+struct KV{
+	int k;
+	T v;
+};
+
+
+using ull=unsigned long long;
+int a[1000];
+int find(int x){
+	int l=0,r=999,mid;
+	while(l<=r){
+		mid=(l+r)>>1;
+		if(a[mid]==x) {
+			return mid;
+		}
+		else if(a[mid]>x){
+			r=mid-1;
+		}
+		else{ 
+			l=mid+1;
+		}
+	}
+	return -1;
+}
+int main(){
+	for(int i=0;i<1000;i++){
+		a[i]=2*i;
+	}
+	int x;cin>>x;
+	cout<<find(x);
+	return 0;
+}
\ No newline at end of file
diff --git a/timedef.h b/timedef.h
new file mode 100644
index 0000000..21994a6
--- /dev/null
+++ b/timedef.h
@@ -0,0 +1,5 @@
+#pragma once
+#ifndef __linux__
+#include <windows.h>
+#define time GetTickCount
+#endif
\ No newline at end of file