forked from MatthewRHermes/mrh
Commit
Showing 8 changed files with 874 additions and 0 deletions.
@@ -0,0 +1 @@
pm*.cpp
@@ -0,0 +1,86 @@
# Definition of MACROS

BASE_LIBGPU=../../src

BINROOT=./
EXE=a.out
SHELL=/bin/sh
CXX = g++
CXXFLAGS=
FC=gfortran
FCFLAGS=
LD = $(CXX)
LDFLAGS = -fPIC -shared   # note: -shared makes $(EXE) a shared object; drop it to link a standalone executable
AR = ar rcs
CPP = cpp -P -traditional
INSTALL=../

CUDA_CXX = nvcc
CUDA_CXXFLAGS =

ARCH ?= polaris-gnu-nvcc
include $(BASE_LIBGPU)/arch/$(ARCH)

CXXFLAGS += -I$(BASE_LIBGPU)
CUDA_CXXFLAGS += -I$(BASE_LIBGPU)

$(info ARCH is [${ARCH}])

# -- subset of src files with cuda kernels
CUDA_SRC = pm_cuda.cpp offload.cpp
CUDA_OBJ = $(CUDA_SRC:.cpp=.o)

CSRC = $(filter-out $(CUDA_SRC), $(wildcard *.cpp))
INC = $(wildcard *.h)
COBJ = $(CSRC:.cpp=.o)

# -- match the .F90 pattern rules below
FSRC = $(wildcard *.F90)
MOD = $(FSRC:.F90=.mod)
FOBJ = $(FSRC:.F90=.o)

# -- only copy source files; headers referenced with compiler flag
$(shell cp ../../src/pm*.cpp ./)

#
# -- target : Dependencies
# -- Rule to create target

$(EXE): $(COBJ) $(CUDA_OBJ) $(FOBJ) $(MOD)
	$(LD) $(LDFLAGS) -o $@ $(COBJ) $(CUDA_OBJ) $(LIB)

install: $(EXE)
	cp $(EXE) $(INSTALL)
#	cp $(MOD) $(FOBJ) $(INSTALL)/include

####################################################################

$(COBJ): %.o: %.cpp
	$(CXX) $(CXXFLAGS) -c $<

$(FOBJ): %.o: %.F90
	$(FC) $(FCFLAGS) -c $<

$(MOD): %.mod: %.F90
	$(FC) $(FCFLAGS) -c $<

$(CUDA_OBJ): %.o: %.cpp
	$(CUDA_CXX) -x cu $(CUDA_CXXFLAGS) -c $< -o $@

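# Note: the rules above are GNU make static pattern rules
# ("targets: target-pattern: prereq-pattern"); e.g. with COBJ = foo.o bar.o,
# foo.o is built from foo.cpp and $< expands to the matching source file.
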
#
# -- Remove *.o and *~ from the directory
clean:
	rm -f *.o *.mod *~ ./$(EXE)
	rm -f $(INSTALL)/$(EXE)
	rm -rf $(EXE).dSYM
#
# -- Remove *.o, *~, and executable from the directory
realclean:
	rm -f *.o *.mod *~ ./$(EXE)
	rm -f $(INSTALL)/$(EXE)
	rm -rf $(EXE).dSYM
	rm -f *.optrpt
	rm -f pm*.h pm*.cpp

#
# -- Simple dependencies
@@ -0,0 +1,307 @@
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <iostream>
#include <cassert>

#include <mpi.h>
#include <omp.h>

#include "pm.h"

#define _NUM_BATCHES 100

#define _NUM_ROWS_A 4
#define _NUM_COLS_A 3

#define _NUM_ROWS_B _NUM_COLS_A
#define _NUM_COLS_B 5

#define _TOL 1e-8
#define _NUM_ITERATIONS_CPU 10
#define _NUM_ITERATIONS_GPU 1000

#ifdef _SINGLE_PRECISION
typedef float real_t;
#else
typedef double real_t;
#endif

using namespace PM_NS;

extern void init_pm(class PM *);

// A is an (m, k) matrix
// B is a (k, n) matrix
// C is an (m, n) matrix

// Column-major ordering transposes everything:
// to compute C = A.B, call the API as if computing C^T = B^T.A^T.

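// A minimal sketch of that mapping (illustrative values only, not part of
// this test): row-major A (4x3) and B (3x5) look like A^T and B^T to a
// column-major gemm, so request C^T = B^T.A^T:
//
//   int m = 5, n = 4, k = 3;        // dims of B^T (5x3) and A^T (3x4)
//   int ldb = 5, lda = 3, ldc = 5;  // row lengths of B, A, C
//   dgemm_("N", "N", &m, &n, &k, &alpha, B, &ldb, A, &lda, &beta, C, &ldc);
//
// The row-major 4x5 buffer C then holds A.B directly, with no explicit
// transposes.
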
extern "C" { | ||
void dsymm_(const char*, const char*, const int*, const int*, | ||
const double*, const double*, const int*, | ||
const double*, const int*, | ||
const double*, double*, const int*); | ||
|
||
void dgemm_(const char * transa, const char * transb, const int * m, const int * n, | ||
const int * k, const double * alpha, const double * a, const int * lda, | ||
const double * b, const int * ldb, const double * beta, double * c, | ||
const int * ldc); | ||
} | ||
|
||
// extern void transpose(real_t *, real_t *, const int, const int);
// extern void copy_naive_gpu(real_t *, real_t *, const int, const int);
// extern void transpose_naive_gpu(real_t *, real_t *, const int, const int);
// extern void transpose_gpu_v1(real_t *, real_t *, const int, const int);
// extern void transpose_gpu_v2(real_t *, real_t *, const int, const int);
// extern void transpose_gpu_v3(real_t *, real_t *, const int, const int);

// ----------------------------------------------------------------

// Naive row-major gemm: C = alpha*A.B + beta*C, no transposes
void gemm_NN0_naive_cpu(const int * m_, const int * n_, const int * k_, const real_t * alpha_,
                        real_t * a, const int * lda_, real_t * b, const int * ldb_,
                        const real_t * beta_, real_t * c, const int * ldc_)
{
  real_t alpha = *alpha_;
  real_t beta = *beta_;

  int m = *m_;
  int n = *n_;
  int k = *k_;

  int lda = *lda_;
  int ldb = *ldb_;
  int ldc = *ldc_;

  for(int i=0; i<m; ++i)
    for(int j=0; j<n; ++j) {
      double val = 0.0; // accumulate in double even for single-precision builds
      for(int l=0; l<k; ++l) val += a[i*lda+l] * b[l*ldb+j];
      c[i*ldc+j] = alpha * val + beta * c[i*ldc+j];
    }
}

// ----------------------------------------------------------------

int check_result(real_t * ref, real_t * test, int n, const char * name)
{
  int err = 0;
  for(int i=0; i<n; ++i) {
    real_t diff = (ref[i] - test[i]) * (ref[i] - test[i]);
    // printf(" -- i= %i ref= %f test= %f diff= %f\n",i,ref[i],test[i],diff);
    if(diff > _TOL) err++;
  }

  if(err == 0) printf("Results from %s are correct!! :) \n", name);
  else printf("Results from %s are incorrect!! :( \n", name);

  return err;
}

// ----------------------------------------------------------------

void print_matrix(real_t * data, int num_rows, int num_cols, const char * name)
{
  printf("\nMatrix[%s] : %i x %i \n", name, num_rows, num_cols);
  for(int i=0; i<num_rows; ++i) {
    for(int j=0; j<num_cols; ++j) printf(" %f", data[i*num_cols + j]);
    printf("\n");
  }
}

// ----------------------------------------------------------------

void print_summary(double t, int num_rows_a, int num_cols_a, int num_cols_b, int num_iter, const char * name)
{
  double flop = 2.0 * num_rows_a * num_cols_a * num_cols_b / 1024.0 / 1024.0 / 1024.0; // GFlop per call

  double flops = flop * num_iter / t; // GFlop/s

  printf("\nMatrix[%s] : mnk= %i %i %i num_iter= %i time= %f [ms] flops= %f [GFlop/s]\n",
         name, num_rows_a, num_cols_b, num_cols_a, num_iter, t*1000.0, flops);
}

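// Worked example of the rate above (illustrative only): the 4x3 * 3x5
// problem in this test costs flop = 2*4*3*5 = 120 FLOPs per call, so 10
// timed iterations completing in 1 microsecond would report 1.2e9 flop/s,
// i.e. ~1.12 [GFlop/s] under the 1024^3 scaling used here.
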
// ----------------------------------------------------------------

int main( int argc, char* argv[] )
{
  MPI_Init(&argc, &argv);

  int me, nranks;
  MPI_Comm_size(MPI_COMM_WORLD, &nranks);
  MPI_Comm_rank(MPI_COMM_WORLD, &me);

#ifdef _SINGLE_PRECISION
  if(me == 0) printf("Using single-precision\n\n");
#else
  if(me == 0) printf("Using double-precision\n\n");
#endif

  // ----------------------------------------------------------------

  class PM * pm = new PM();

  init_pm(pm);

  int num_devices = pm->dev_num_devices();

  if(me == 0) {
    printf("\n# of devices= %i\n", num_devices);
    pm->dev_properties(num_devices);
  }

  // Device ID

  int device_id = me % num_devices;

  pm->dev_set_device(device_id);

  for(int i=0; i<nranks; ++i) {
    if(i == me) {
      printf("Rank %i running on GPU %i!\n", me, device_id);
    }
    MPI_Barrier(MPI_COMM_WORLD);
  }

  real_t * a = (real_t*) malloc(_NUM_ROWS_A * _NUM_COLS_A * sizeof(real_t));
  real_t * b = (real_t*) malloc(_NUM_ROWS_B * _NUM_COLS_B * sizeof(real_t));

  real_t * c = (real_t*) malloc(_NUM_ROWS_A * _NUM_COLS_B * sizeof(real_t));
  real_t * r = (real_t*) malloc(_NUM_ROWS_A * _NUM_COLS_B * sizeof(real_t));

  // Initialize host

  for(int i=0; i<_NUM_ROWS_A; ++i) {
    for(int j=0; j<_NUM_COLS_A; ++j) {
      a[i*_NUM_COLS_A + j] = (i * _NUM_COLS_A + j) * 0.1;
    }
  }

  for(int i=0; i<_NUM_ROWS_B; ++i) {
    for(int j=0; j<_NUM_COLS_B; ++j) {
      b[i*_NUM_COLS_B + j] = (i * _NUM_COLS_B + j) * 0.1;
    }
  }

  // ----------------------------------------------------------------
  // Naive CPU reference: Matrix Multiply
  // ----------------------------------------------------------------

  printf("\nMatrix Multiplication :: C(%i x %i) = A(%i x %i).B(%i x %i)\n",
         _NUM_ROWS_A, _NUM_COLS_B, _NUM_ROWS_A, _NUM_COLS_A, _NUM_ROWS_B, _NUM_COLS_B);

  for(int i=0; i<_NUM_ROWS_A; ++i)
    for(int j=0; j<_NUM_COLS_B; ++j)
    {
      double val = 0.0;
      for(int k=0; k<_NUM_COLS_A; ++k) val += a[i*_NUM_COLS_A+k] * b[k*_NUM_COLS_B+j];
      r[i*_NUM_COLS_B+j] = val;
    }

  double t;

  {
    const real_t alpha = 1.0; // real_t so single-precision builds compile
    const real_t beta = 0.0;

    const int m = _NUM_ROWS_A;   // # rows of first matrix A
    const int n = _NUM_COLS_B;   // # cols of second matrix B
    const int k = _NUM_COLS_A;   // # cols of first matrix A

    const int lda = _NUM_COLS_A; // lead dimension of first matrix A
    const int ldb = _NUM_COLS_B; // lead dimension of second matrix B
    const int ldc = _NUM_COLS_B; // lead dimension of result matrix C

    // warm-up call, excluded from the timing
    gemm_NN0_naive_cpu(&m, &n, &k, &alpha, a, &lda, b, &ldb, &beta, c, &ldc);

    double t0 = MPI_Wtime();
    for(int i=0; i<_NUM_ITERATIONS_CPU; ++i)
      gemm_NN0_naive_cpu(&m, &n, &k, &alpha, a, &lda, b, &ldb, &beta, c, &ldc);
    t = MPI_Wtime() - t0;
  }

  print_summary(t, _NUM_ROWS_A, _NUM_COLS_A, _NUM_COLS_B, _NUM_ITERATIONS_CPU, "gemm_NN0_naive_cpu");

  check_result(r, c, _NUM_ROWS_A*_NUM_COLS_B, "naive_cpu");

  // print_matrix(a, _NUM_ROWS_A, _NUM_COLS_A, "Original a");

  // print_matrix(b, _NUM_ROWS_B, _NUM_COLS_B, "Original b");

  // print_matrix(r, _NUM_ROWS_A, _NUM_COLS_B, "Reference r");

  // print_matrix(c, _NUM_ROWS_A, _NUM_COLS_B, "Output c");

  // ----------------------------------------------------------------
  // Optimized math library: Matrix Multiply
  // ----------------------------------------------------------------

  // Create device buffers and transfer data to device

  real_t * d_a = (real_t *) pm->dev_malloc(_NUM_ROWS_A * _NUM_COLS_A * sizeof(real_t));
  real_t * d_b = (real_t *) pm->dev_malloc(_NUM_ROWS_B * _NUM_COLS_B * sizeof(real_t));
  real_t * d_c = (real_t *) pm->dev_malloc(_NUM_ROWS_A * _NUM_COLS_B * sizeof(real_t));

  pm->dev_push(d_a, a, _NUM_ROWS_A * _NUM_COLS_A * sizeof(real_t));
  pm->dev_push(d_b, b, _NUM_ROWS_B * _NUM_COLS_B * sizeof(real_t));

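  // The call below reuses the transposed mapping described above
  // (C^T = B^T.A^T). Note, as an assumption about the PM backend rather
  // than something this test verifies: handing dev_malloc'd pointers to a
  // host BLAS gemm is only valid if dev_malloc returns host-accessible
  // (e.g. managed/unified) memory; a GPU BLAS would otherwise be required.
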
  {
    const real_t alpha = 1.0; // real_t so the sgemm_ path receives floats
    const real_t beta = 0.0;

    const int m = _NUM_COLS_B;   // # rows of first matrix B^T
    const int n = _NUM_ROWS_A;   // # cols of second matrix A^T
    const int k = _NUM_ROWS_B;   // # cols of first matrix B^T

    const int ldb = _NUM_COLS_B; // lead dimension of first matrix B^T
    const int lda = _NUM_COLS_A; // lead dimension of second matrix A^T
    const int ldc = _NUM_COLS_B; // lead dimension of result matrix C^T

    // warm-up call, excluded from the timing
#ifdef _SINGLE_PRECISION
    sgemm_((char *) "N", (char *) "N", &m, &n, &k, &alpha, d_b, &ldb, d_a, &lda, &beta, d_c, &ldc);
#else
    dgemm_((char *) "N", (char *) "N", &m, &n, &k, &alpha, d_b, &ldb, d_a, &lda, &beta, d_c, &ldc);
#endif

    pm->dev_barrier();

    double t0 = MPI_Wtime();
    for(int i=0; i<_NUM_ITERATIONS_CPU; ++i) {
#ifdef _SINGLE_PRECISION
      sgemm_((char *) "N", (char *) "N", &m, &n, &k, &alpha, d_b, &ldb, d_a, &lda, &beta, d_c, &ldc);
#else
      dgemm_((char *) "N", (char *) "N", &m, &n, &k, &alpha, d_b, &ldb, d_a, &lda, &beta, d_c, &ldc);
#endif
    }
    pm->dev_barrier();
    t = MPI_Wtime() - t0;
  }

  pm->dev_pull(d_c, c, _NUM_ROWS_A * _NUM_COLS_B * sizeof(real_t));

  print_summary(t, _NUM_ROWS_A, _NUM_COLS_A, _NUM_COLS_B, _NUM_ITERATIONS_CPU, "BLAS gemm");

  check_result(r, c, _NUM_ROWS_A*_NUM_COLS_B, "lapack_dgemm_cpu");

  // print_matrix(r, _NUM_ROWS_A, _NUM_COLS_B, "Reference r");

  // print_matrix(c, _NUM_ROWS_A, _NUM_COLS_B, "Output c");

  // ----------------------------------------------------------------

  // Clean up

  pm->dev_free(d_a);
  pm->dev_free(d_b);
  pm->dev_free(d_c);

  delete pm;

  free(a);
  free(b);
  free(c);
  free(r);

  MPI_Finalize();

  return 0;
}