Adding data copy between host and device #323

Open: wants to merge 8 commits into base: master

Changes from 1 commit
93 changes: 80 additions & 13 deletions src/codegen/codegen.cpp
@@ -230,16 +230,47 @@ string CodeGen::printTensorProperty(string varname, const GetProperty* op, bool
}

string CodeGen::unpackTensorProperty(string varname, const GetProperty* op,
bool is_output_prop) {
bool is_output_prop, int flag, string output_tensor) {
stringstream ret;
ret << " ";

auto tensor = op->tensor.as<Var>();
if (op->property == TensorProperty::Values) {
// for the values, it's in the last slot
ret << printType(tensor->type, true);
ret << " " << restrictKeyword() << " " << varname << " = (" << printType(tensor->type, true) << ")(";
ret << tensor->name << "->vals);\n";
switch(flag) {
case PRINT_FUNC:
ret << printType(tensor->type, true);
ret << " " << restrictKeyword() << " " << varname << " = (" << printType(tensor->type, true) << ")(";
ret << tensor->name << "->vals);\n";
break;
case PRINT_MEM_HOST_TO_DEV:
ret << "gpuErrchk(cudaMalloc((void **)&";
ret << tensor->name << "_dev" << "->vals, ";
ret << "malloc_usable_size(";
ret << tensor->name << "->vals)));\n";

ret << " ";
ret << "cudaMemcpy(";
ret << tensor->name << "_dev" << "->vals, ";
ret << tensor->name << "->vals, ";
ret << "malloc_usable_size(";
ret << tensor->name << "->vals), ";
ret << "cudaMemcpyHostToDevice);\n";
break;
case PRINT_MEM_DEV_TO_HOST:
if(output_tensor == tensor->name) {
ret << "cudaMemcpy(";
ret << tensor->name << "->vals, ";
ret << tensor->name << "_dev->vals, ";
ret << "malloc_usable_size(";
ret << tensor->name << "->vals), ";
ret << "cudaMemcpyDevicetToHost);\n";
ret << " ";
}
ret << "cudaFree(";
ret << tensor->name << "_dev" << "->vals);\n";
break;
}
return ret.str();
} else if (op->property == TensorProperty::ValuesSize) {
ret << "int " << varname << " = " << tensor->name << "->vals_size;\n";
@@ -252,18 +283,54 @@ string CodeGen::unpackTensorProperty(string varname, const GetProperty* op,
// for a Fixed level, ptr is an int
// all others are int*
if (op->property == TensorProperty::Dimension) {
tp = "int";
ret << tp << " " << varname << " = (int)(" << tensor->name
<< "->dimensions[" << op->mode << "]);\n";
switch(flag) {
case PRINT_FUNC:
tp = "int";
ret << tp << " " << varname << " = (int)(" << tensor->name
<< "->dimensions[" << op->mode << "]);\n";
break;
case PRINT_MEM_HOST_TO_DEV:
ret << tensor->name << "_dev->dimension[" << op->mode << "] = " << tensor->name << "->dimension[" << op->mode << "];\n";
break;
}
} else {
taco_iassert(op->property == TensorProperty::Indices);
tp = "int*";
auto nm = op->index;
ret << tp << " " << restrictKeyword() << " " << varname << " = ";
ret << "(int*)(" << tensor->name << "->indices[" << op->mode;
ret << "][" << nm << "]);\n";
switch(flag) {
case PRINT_FUNC:
ret << tp << " " << restrictKeyword() << " " << varname << " = ";
ret << "(int*)(" << tensor->name << "->indices[" << op->mode;
ret << "][" << nm << "]);\n";
break;
case PRINT_MEM_HOST_TO_DEV:
ret << "gpuErrchk(cudaMalloc((void **)&";
ret << tensor->name << "_dev" << "->indices[" << op->mode << "][" << nm << "], ";
ret << "malloc_usable_size(";
ret << tensor->name << "->indices[" << op->mode << "][" << nm << "])));\n";

ret << " ";
ret << "cudaMemcpy(";
ret << tensor->name << "_dev" << "->indices[" << op->mode << "][" << nm << "], ";
ret << tensor->name << "->indices[" << op->mode << "][" << nm << "], ";
ret << "malloc_usable_size(";
ret << tensor->name << "->indices[" << op->mode << "][" << nm << "]), ";
ret << "cudaMemcpyHostToDevice);\n";
break;
case PRINT_MEM_DEV_TO_HOST:
if(output_tensor == tensor->name) {
ret << "cudaMemcpy(";
ret << tensor->name << "->indices[" << op->mode << "][" << nm << "], ";
ret << tensor->name << "->indices[" << op->mode << "][" << nm << "], ";
ret << "malloc_usable_size(";
ret << tensor->name << "_dev->indices[" << op->mode << "][" << nm << "]), ";
ret << "cudaMemcpyDevicetToHost);\n";
}
ret << "cudaFree(";
ret << tensor->name << "_dev" << "->indices[" << op->mode << "][" << nm << "]);\n";
break;
}
}

return ret.str();
}

@@ -312,7 +379,7 @@ string CodeGen::pointTensorProperty(std::string varname) {

// helper to print declarations
string CodeGen::printDecls(map<Expr, string, ExprCompare> varMap,
vector<Expr> inputs, vector<Expr> outputs) {
vector<Expr> inputs, vector<Expr> outputs, int flag, string output_tensor) {
stringstream ret;
unordered_set<string> propsAlreadyGenerated;

@@ -367,7 +434,7 @@ string CodeGen::printDecls(map<Expr, string, ExprCompare> varMap,
break;
}
} else {
ret << unpackTensorProperty(varMap[prop], prop, isOutputProp);
ret << unpackTensorProperty(varMap[prop], prop, isOutputProp, flag, output_tensor);
}
propsAlreadyGenerated.insert(varMap[prop]);
}
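For orientation, here is a sketch of the host-side code the new PRINT_MEM_HOST_TO_DEV and PRINT_MEM_DEV_TO_HOST branches are meant to emit for a values array. The tensor name A and the mirror struct A_dev are hypothetical; the actual identifiers come from the emitter above.

// Host to device: size the device buffer like the host allocation, then copy (sketch).
gpuErrchk(cudaMalloc((void **)&A_dev->vals, malloc_usable_size(A->vals)));
cudaMemcpy(A_dev->vals, A->vals, malloc_usable_size(A->vals), cudaMemcpyHostToDevice);

// Device to host (emitted only for the output tensor), then release the device buffer (sketch).
cudaMemcpy(A->vals, A_dev->vals, malloc_usable_size(A->vals), cudaMemcpyDeviceToHost);
cudaFree(A_dev->vals);

Note that malloc_usable_size is glibc-specific and is only meaningful on host pointers, which is why both directions size the transfer from the host-side buffer.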
8 changes: 6 additions & 2 deletions src/codegen/codegen.h
@@ -1,6 +1,10 @@
#ifndef TACO_CODEGEN_H
#define TACO_CODEGEN_H

#define PRINT_FUNC 0
#define PRINT_MEM_HOST_TO_DEV 1
#define PRINT_MEM_DEV_TO_HOST 2
Contributor: Should replace these with an enum.

Author: Oh, I see. I will replace them with an enum.
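A minimal sketch of the suggested change; this is not taken from the patch, and keeping the existing names would leave the call sites unchanged:

// Hypothetical enum replacing the three #defines; unpackTensorProperty and
// printDecls would then take this type instead of an int flag.
enum MemPrintKind {
  PRINT_FUNC,
  PRINT_MEM_HOST_TO_DEV,
  PRINT_MEM_DEV_TO_HOST
};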


#include <memory>
#include "taco/ir/ir.h"
#include "taco/ir/ir_printer.h"
@@ -43,7 +47,7 @@ class CodeGen : public IRPrinter {
std::vector<Expr> localVars, int labels,
std::string funcName);
std::string printDecls(std::map<Expr, std::string, ExprCompare> varMap,
std::vector<Expr> inputs, std::vector<Expr> outputs);
std::vector<Expr> inputs, std::vector<Expr> outputs, int flag, std::string output_tensor);
std::string printPack(std::map<std::tuple<Expr, TensorProperty, int, int>,
std::string> outputProperties, std::vector<Expr> outputs);
std::string printCoroutineFinish(int numYields, std::string funcName);
@@ -63,7 +67,7 @@

std::string printTensorProperty(std::string varname, const GetProperty* op, bool is_ptr);
std::string unpackTensorProperty(std::string varname, const GetProperty* op,
bool is_output_prop);
bool is_output_prop, int flag, std::string output_tensor);
std::string packTensorProperty(std::string varname, Expr tnsr, TensorProperty property,
int mode, int index);
std::string pointTensorProperty(std::string varname);
2 changes: 1 addition & 1 deletion src/codegen/codegen_c.cpp
@@ -290,7 +290,7 @@ void CodeGen_C::visit(const Function* func) {
localVars = varFinder.localVars;

// Print variable declarations
out << printDecls(varFinder.varDecls, func->inputs, func->outputs) << endl;
out << printDecls(varFinder.varDecls, func->inputs, func->outputs, PRINT_FUNC, "") << endl;

if (emittingCoroutine) {
out << printContextDeclAndInit(varMap, localVars, numYields, func->name)
135 changes: 120 additions & 15 deletions src/codegen/codegen_cuda.cpp
@@ -282,6 +282,7 @@ class CodeGen_CUDA::DeviceFunctionCollector : public IRVisitor {
vector<Stmt> threadFors; // contents is device function
vector<Stmt> warpFors;
map<Expr, string, ExprCompare> scopeMap;
string output_tensor;

// the variables to pass to each device function
vector<vector<pair<string, Expr>>> functionParameters;
@@ -312,7 +313,7 @@ class CodeGen_CUDA::DeviceFunctionCollector : public IRVisitor {
taco_iassert(var) << "Outputs must be vars in codegen";
taco_iassert(scopeMap.count(var) == 0) <<
"Duplicate output found in codegen";

output_tensor = var->name; // Isn't there only one output?
Contributor: The code generator was designed with the thought that we might eventually support kernels with multiple outputs, though that's not something we've actually fully implemented. It's probably fine to assume for now that there's only one output, though you should probably add an assertion to check for that (e.g., taco_iassert(outputs.size() == 1)).

Author: I see. Thanks for the explanation. I now understand why the code looks like that. I will add an assertion to check that there is only one output.

Author: Thanks for taking the time to look at the parts I modified!
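A sketch of the assertion discussed above, placed where the collector records the output name; it assumes outputs is the container being iterated, as in the contributor's example:

// Hypothetical guard for the single-output assumption.
taco_iassert(outputs.size() == 1) << "host/device copy generation assumes one output tensor";
output_tensor = var->name;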

scopeMap[var] = var->name;
}
}
@@ -436,10 +437,20 @@ Stmt CodeGen_CUDA::simplifyFunctionBodies(Stmt stmt) {
return FunctionBodySimplifier().rewrite(stmt);
}

string CodeGen_CUDA::printDeviceFuncName(const vector<pair<string, Expr>> currentParameters, int index) {
string CodeGen_CUDA::printDeviceFuncName(const vector<pair<string, Expr>> currentParameters, int index, int flag) {
stringstream ret;
ret << "__global__" << endl;
ret << "void " << funcName << "DeviceKernel" << index << "(";
switch(flag) {
case PRINT_FUNC:
ret << "__global__" << endl;
ret << "void " << funcName << "DeviceKernel" << index << "(";
break;
case PRINT_MEM_HOST_TO_DEV:
ret << "void " << funcName << "MemcpyHostToDev" << index << "(";
break;
case PRINT_MEM_DEV_TO_HOST:
ret << "void " << funcName << "MemcpyDevToHost" << index << "(";
break;
}

string delimiter = "";
for (size_t i=0; i<currentParameters.size(); i++) {
@@ -462,7 +473,32 @@ string CodeGen_CUDA::printDeviceFuncName(const vector<pair<string, Expr>> curren
// No non-tensor parameters
delimiter = ", ";
}
if(flag == PRINT_MEM_HOST_TO_DEV || flag == PRINT_MEM_DEV_TO_HOST) {
ret << ", ";
string delimiter = "";
for (size_t i=0; i<currentParameters.size(); i++) {
auto var = currentParameters[i].second.as<Var>();
taco_iassert(var) << "Unable to convert output " << currentParameters[i].second
<< " to Var";
string varName = currentParameters[i].first;

if (var->is_tensor) {
ret << delimiter << "taco_tensor_t * __restrict__ " << varName << "_dev";
}
else {
auto tp = printCUDAType(var->type, var->is_ptr);
ret << delimiter << tp << " ";
if (!var->is_ptr) {
ret << "&";
}
ret << var->name;
}
// No non-tensor parameters
delimiter = ", ";
}
}
ret << ")";

return ret.str();
}
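For readability, the three headers this function now produces look roughly as follows, assuming a hypothetical kernel named compute, index 0, and a single tensor parameter A:

// PRINT_FUNC
__global__
void computeDeviceKernel0(taco_tensor_t * __restrict__ A)

// PRINT_MEM_HOST_TO_DEV: original parameters followed by their _dev counterparts
void computeMemcpyHostToDev0(taco_tensor_t * __restrict__ A, taco_tensor_t * __restrict__ A_dev)

// PRINT_MEM_DEV_TO_HOST: same parameter list as the host-to-device helper
void computeMemcpyDevToHost0(taco_tensor_t * __restrict__ A, taco_tensor_t * __restrict__ A_dev)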

@@ -574,7 +610,33 @@ void CodeGen_CUDA::printDeviceFuncCall(const vector<pair<string, Expr>> currentP
emittedTimerStartCode = true;
}

// for malloc
string delimiter = "";
for (size_t i=0; i<currentParameters.size(); i++) {
taco_iassert(currentParameters[i].second.as<Var>()) << "Unable to convert output " << currentParameters[i].second
<< " to Var";
string varName = currentParameters[i].first;
stream << "taco_tensor_t *"<< varName << "_dev = (taco_tensor_t *)malloc(sizeof(taco_tensor_t *));\n";
doIndent();
}

// for MemcpyHostToDev
stream << funcName << "MemcpyHostToDev" << index << "(";
for (size_t l=0; l<2; l++) {
for (size_t i=0; i<currentParameters.size(); i++) {
taco_iassert(currentParameters[i].second.as<Var>()) << "Unable to convert output " << currentParameters[i].second
<< " to Var";
string varName = currentParameters[i].first;
stream << delimiter << varName;
if(l == 1) stream << "_dev";

delimiter = ", ";
}
}
stream << ");\n\n";
doIndent();

// for DeviceKernel
stream << funcName << "DeviceKernel" << index << "<<<";
gridSize = ir::simplify(gridSize);
gridSize.accept(this);
@@ -583,7 +645,7 @@ void CodeGen_CUDA::printDeviceFuncCall(const vector<pair<string, Expr>> currentP
stream << ">>>";
stream << "(";

string delimiter = "";
delimiter = "";
for (size_t i=0; i<currentParameters.size(); i++) {
taco_iassert(currentParameters[i].second.as<Var>()) << "Unable to convert output " << currentParameters[i].second
<< " to Var";
@@ -605,8 +667,30 @@ void CodeGen_CUDA::printDeviceFuncCall(const vector<pair<string, Expr>> currentP
stream << "cudaEventElapsedTime(&tot_ms, event1, event2);\n";
}
doIndent();
stream << "cudaDeviceSynchronize();\n";
stream << "cudaDeviceSynchronize();\n\n";

// for MemcpyDevToHost
doIndent();
stream << funcName << "DeviceFree" << index << "(";
delimiter = "";
for (size_t i=0; i<currentParameters.size(); i++) {
taco_iassert(currentParameters[i].second.as<Var>()) << "Unable to convert output " << currentParameters[i].second
<< " to Var";
string varName = currentParameters[i].first;
stream << delimiter << varName << "_dev";

delimiter = ", ";
}
stream << ");\n";

// for free
for (size_t i=0; i<currentParameters.size(); i++) {
taco_iassert(currentParameters[i].second.as<Var>()) << "Unable to convert output " << currentParameters[i].second
<< " to Var";
string varName = currentParameters[i].first;
doIndent();
stream << "free("<< varName << "_dev);\n";
}
}
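Taken together, the host-side sequence printDeviceFuncCall now emits looks roughly like this; the kernel name compute, the single tensor A, and the launch bounds are illustrative, not taken from a generated kernel:

taco_tensor_t *A_dev = (taco_tensor_t *)malloc(sizeof(taco_tensor_t)); // host-side shell that will hold device pointers
computeMemcpyHostToDev0(A, A_dev); // cudaMalloc device buffers and copy the inputs over
computeDeviceKernel0<<<gridSize, blockSize>>>(A_dev); // run the kernel against the device copies
cudaDeviceSynchronize();
computeMemcpyDevToHost0(A, A_dev); // copy the output back and cudaFree the device buffers
free(A_dev);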


@@ -679,12 +763,6 @@ void CodeGen_CUDA::printDeviceFunctions(const Function* func) {
}
}

// Generate device function header
doIndent();
out << printDeviceFuncName(parameters, i);
out << "{\n";
indent++;

// Generate device function code
resetUniqueNameCounters();
vector<Expr> inputs;
@@ -710,8 +788,35 @@ void CodeGen_CUDA::printDeviceFunctions(const Function* func) {
blockloop->accept(&varFinder);
varMap = varFinder.varMap;



// Print MemcpyHostToDev function
out << printDeviceFuncName(parameters, i, PRINT_MEM_HOST_TO_DEV);
out << "{\n";
indent++;
out << printDecls(varFinder.varDecls, inputs, {}, PRINT_MEM_HOST_TO_DEV, deviceFunctionCollector.output_tensor) << endl;
indent--;
doIndent();
out << "}\n\n";

// Print MemcpyDevToHost function
out << printDeviceFuncName(parameters, i, PRINT_MEM_DEV_TO_HOST);
out << "{\n";
indent++;
out << printDecls(varFinder.varDecls, inputs, {}, PRINT_MEM_DEV_TO_HOST, deviceFunctionCollector.output_tensor) << endl;
indent--;
doIndent();
out << "}\n\n";


// Generate device function header
doIndent();
out << printDeviceFuncName(parameters, i, PRINT_FUNC);
out << "{\n";
indent++;

// Print variable declarations
out << printDecls(varFinder.varDecls, inputs, {}) << endl;
out << printDecls(varFinder.varDecls, inputs, {}, PRINT_FUNC, deviceFunctionCollector.output_tensor) << endl;
doIndent();
printBlockIDVariable(deviceFunctionCollector.blockIDVars[i], blockloop->start, blockloop->increment);
doIndent();
@@ -779,7 +884,7 @@ void CodeGen_CUDA::visit(const Function* func) {
localVars = varFinder.localVars;

// Print variable declarations
out << printDecls(varFinder.varDecls, func->inputs, func->outputs) << endl;
out << printDecls(varFinder.varDecls, func->inputs, func->outputs, PRINT_FUNC, "") << endl;

if (emittingCoroutine) {
out << printContextDeclAndInit(varMap, localVars, numYields, func->name)
@@ -1082,7 +1187,7 @@ void CodeGen_CUDA::visit(const Allocate* op) {
}

doIndent();
stream << "gpuErrchk(cudaMallocManaged((void**)&";
stream << "gpuErrchk(cudaMalloc((void**)&";
if (op->is_realloc) {
stream << variable_name;
}
2 changes: 1 addition & 1 deletion src/codegen/codegen_cuda.h
@@ -48,7 +48,7 @@ class CodeGen_CUDA : public CodeGen {
void visit(const Assign*);
void visit(const Break*);
void visit(const Free* op);
std::string printDeviceFuncName(const std::vector<std::pair<std::string, Expr>> currentParameters, int index);
std::string printDeviceFuncName(const std::vector<std::pair<std::string, Expr>> currentParameters, int index, int flag);
void printDeviceFuncCall(const std::vector<std::pair<std::string, Expr>> currentParameters, Expr blockSize, int index, Expr gridSize);
void printThreadIDVariable(std::pair<std::string, Expr> threadIDVar, Expr start, Expr increment, Expr numThreads);
void printBlockIDVariable(std::pair<std::string, Expr> blockIDVar, Expr start, Expr increment);