Adding data copy between host and device #323

Open: wants to merge 8 commits into base: master

Changes from 1 commit
93 changes: 80 additions & 13 deletions src/codegen/codegen.cpp
@@ -230,16 +230,47 @@ string CodeGen::printTensorProperty(string varname, const GetProperty* op, bool
}

string CodeGen::unpackTensorProperty(string varname, const GetProperty* op,
bool is_output_prop) {
bool is_output_prop, int flag, string output_tensor) {
stringstream ret;
ret << " ";

auto tensor = op->tensor.as<Var>();
if (op->property == TensorProperty::Values) {
// for the values, it's in the last slot
ret << printType(tensor->type, true);
ret << " " << restrictKeyword() << " " << varname << " = (" << printType(tensor->type, true) << ")(";
ret << tensor->name << "->vals);\n";
switch(flag) {
case PRINT_FUNC:
ret << printType(tensor->type, true);
ret << " " << restrictKeyword() << " " << varname << " = (" << printType(tensor->type, true) << ")(";
ret << tensor->name << "->vals);\n";
break;
case PRINT_MEM_HOST_TO_DEV:
ret << "gpuErrchk(cudaMalloc((void **)&";
ret << tensor->name << "_dev" << "->vals, ";
ret << "malloc_usable_size(";
ret << tensor->name << "->vals)));\n";

ret << " ";
ret << "cudaMemcpy(";
ret << tensor->name << "_dev" << "->vals, ";
ret << tensor->name << "->vals, ";
ret << "malloc_usable_size(";
ret << tensor->name << "->vals), ";
ret << "cudaMemcpyHostToDevice);\n";
break;
case PRINT_MEM_DEV_TO_HOST:
if(output_tensor == tensor->name) {
ret << "cudaMemcpy(";
ret << tensor->name << "->vals, ";
ret << tensor->name << "_dev->vals, ";
ret << "malloc_usable_size(";
ret << tensor->name << "->vals), ";
ret << "cudaMemcpyDevicetToHost);\n";
ret << " ";
}
ret << "cudaFree(";
ret << tensor->name << "_dev" << "->vals);\n";
break;
}
return ret.str();
} else if (op->property == TensorProperty::ValuesSize) {
ret << "int " << varname << " = " << tensor->name << "->vals_size;\n";
@@ -252,18 +283,54 @@ string CodeGen::unpackTensorProperty(string varname, const GetProperty* op,
// for a Fixed level, ptr is an int
// all others are int*
if (op->property == TensorProperty::Dimension) {
tp = "int";
ret << tp << " " << varname << " = (int)(" << tensor->name
<< "->dimensions[" << op->mode << "]);\n";
switch(flag) {
case PRINT_FUNC:
tp = "int";
ret << tp << " " << varname << " = (int)(" << tensor->name
<< "->dimensions[" << op->mode << "]);\n";
break;
case PRINT_MEM_HOST_TO_DEV:
ret << tensor->name << "_dev->dimension[" << op->mode << "] = " << tensor->name << "->dimension[" << op->mode << "];\n";
break;
}
} else {
taco_iassert(op->property == TensorProperty::Indices);
tp = "int*";
auto nm = op->index;
ret << tp << " " << restrictKeyword() << " " << varname << " = ";
ret << "(int*)(" << tensor->name << "->indices[" << op->mode;
ret << "][" << nm << "]);\n";
switch(flag) {
case PRINT_FUNC:
ret << tp << " " << restrictKeyword() << " " << varname << " = ";
ret << "(int*)(" << tensor->name << "->indices[" << op->mode;
ret << "][" << nm << "]);\n";
break;
case PRINT_MEM_HOST_TO_DEV:
ret << "gpuErrchk(cudaMalloc((void **)&";
ret << tensor->name << "_dev" << "->indices[" << op->mode << "][" << nm << "], ";
ret << "malloc_usable_size(";
ret << tensor->name << "->indices[" << op->mode << "][" << nm << "])));\n";

ret << " ";
ret << "cudaMemcpy(";
ret << tensor->name << "_dev" << "->indices[" << op->mode << "][" << nm << "], ";
ret << tensor->name << "->indices[" << op->mode << "][" << nm << "], ";
ret << "malloc_usable_size(";
ret << tensor->name << "->indices[" << op->mode << "][" << nm << "]), ";
ret << "cudaMemcpyHostToDevice);\n";
break;
case PRINT_MEM_DEV_TO_HOST:
if(output_tensor == tensor->name) {
ret << "cudaMemcpy(";
ret << tensor->name << "->indices[" << op->mode << "][" << nm << "], ";
ret << tensor->name << "->indices[" << op->mode << "][" << nm << "], ";
ret << "malloc_usable_size(";
ret << tensor->name << "_dev->indices[" << op->mode << "][" << nm << "]), ";
ret << "cudaMemcpyDevicetToHost);\n";
}
ret << "cudaFree(";
ret << tensor->name << "_dev" << "->indices[" << op->mode << "][" << nm << "]);\n";
break;
}
}

return ret.str();
}

@@ -312,7 +379,7 @@ string CodeGen::pointTensorProperty(std::string varname) {

// helper to print declarations
string CodeGen::printDecls(map<Expr, string, ExprCompare> varMap,
vector<Expr> inputs, vector<Expr> outputs) {
vector<Expr> inputs, vector<Expr> outputs, int flag, string output_tensor) {
stringstream ret;
unordered_set<string> propsAlreadyGenerated;

@@ -367,7 +434,7 @@ string CodeGen::printDecls(map<Expr, string, ExprCompare> varMap,
break;
}
} else {
ret << unpackTensorProperty(varMap[prop], prop, isOutputProp);
ret << unpackTensorProperty(varMap[prop], prop, isOutputProp, flag, output_tensor);
}
propsAlreadyGenerated.insert(varMap[prop]);
}
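For orientation, here is a sketch of the host-side code the new PRINT_MEM_HOST_TO_DEV and PRINT_MEM_DEV_TO_HOST branches are meant to emit for a values array. The tensor name A and the mirror struct A_dev are hypothetical; the actual identifiers come from the emitter above.

// Host to device: size the device buffer like the host allocation, then copy (sketch).
gpuErrchk(cudaMalloc((void **)&A_dev->vals, malloc_usable_size(A->vals)));
cudaMemcpy(A_dev->vals, A->vals, malloc_usable_size(A->vals), cudaMemcpyHostToDevice);

// Device to host (emitted only for the output tensor), then release the device buffer (sketch).
cudaMemcpy(A->vals, A_dev->vals, malloc_usable_size(A->vals), cudaMemcpyDeviceToHost);
cudaFree(A_dev->vals);

Note that malloc_usable_size is glibc-specific and is only meaningful on host pointers, which is why both directions size the transfer from the host-side buffer.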
8 changes: 6 additions & 2 deletions src/codegen/codegen.h
@@ -1,6 +1,10 @@
#ifndef TACO_CODEGEN_H
#define TACO_CODEGEN_H

#define PRINT_FUNC 0
#define PRINT_MEM_HOST_TO_DEV 1
#define PRINT_MEM_DEV_TO_HOST 2
Contributor: Should replace these with an enum.

Author: Oh, I see. I will replace them with an enum.
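A minimal sketch of the suggested change; this is not taken from the patch, and keeping the existing names would leave the call sites unchanged:

// Hypothetical enum replacing the three #defines; unpackTensorProperty and
// printDecls would then take this type instead of an int flag.
enum MemPrintKind {
  PRINT_FUNC,
  PRINT_MEM_HOST_TO_DEV,
  PRINT_MEM_DEV_TO_HOST
};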


#include <memory>
#include "taco/ir/ir.h"
#include "taco/ir/ir_printer.h"
@@ -43,7 +47,7 @@ class CodeGen : public IRPrinter {
std::vector<Expr> localVars, int labels,
std::string funcName);
std::string printDecls(std::map<Expr, std::string, ExprCompare> varMap,
std::vector<Expr> inputs, std::vector<Expr> outputs);
std::vector<Expr> inputs, std::vector<Expr> outputs, int flag, std::string output_tensor);
std::string printPack(std::map<std::tuple<Expr, TensorProperty, int, int>,
std::string> outputProperties, std::vector<Expr> outputs);
std::string printCoroutineFinish(int numYields, std::string funcName);
@@ -63,7 +67,7 @@

std::string printTensorProperty(std::string varname, const GetProperty* op, bool is_ptr);
std::string unpackTensorProperty(std::string varname, const GetProperty* op,
bool is_output_prop);
bool is_output_prop, int flag, std::string output_tensor);
std::string packTensorProperty(std::string varname, Expr tnsr, TensorProperty property,
int mode, int index);
std::string pointTensorProperty(std::string varname);
2 changes: 1 addition & 1 deletion src/codegen/codegen_c.cpp
@@ -290,7 +290,7 @@ void CodeGen_C::visit(const Function* func) {
localVars = varFinder.localVars;

// Print variable declarations
out << printDecls(varFinder.varDecls, func->inputs, func->outputs) << endl;
out << printDecls(varFinder.varDecls, func->inputs, func->outputs, PRINT_FUNC, "") << endl;

if (emittingCoroutine) {
out << printContextDeclAndInit(varMap, localVars, numYields, func->name)
135 changes: 120 additions & 15 deletions src/codegen/codegen_cuda.cpp
@@ -282,6 +282,7 @@ class CodeGen_CUDA::DeviceFunctionCollector : public IRVisitor {
vector<Stmt> threadFors; // contents is device function
vector<Stmt> warpFors;
map<Expr, string, ExprCompare> scopeMap;
string output_tensor;

// the variables to pass to each device function
vector<vector<pair<string, Expr>>> functionParameters;
@@ -312,7 +313,7 @@ class CodeGen_CUDA::DeviceFunctionCollector : public IRVisitor {
taco_iassert(var) << "Outputs must be vars in codegen";
taco_iassert(scopeMap.count(var) == 0) <<
"Duplicate output found in codegen";

output_tensor = var->name; // Isn't there only one output?
Contributor: The code generator was designed with the thought that we might eventually support kernels with multiple outputs, though that's not something we've actually fully implemented. It's probably fine to assume for now that there's only one output, though you should probably add an assertion to check for that (e.g., taco_iassert(outputs.size() == 1)).

Author: I see. Thanks for the explanation. I now understand why the code looks like that. I will add an assertion to check that there is only one output.

Author: Thanks for taking the time to look at the parts I modified!
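A sketch of the assertion discussed above, placed where the collector records the output name; it assumes outputs is the container being iterated, as in the contributor's example:

// Hypothetical guard for the single-output assumption.
taco_iassert(outputs.size() == 1) << "host/device copy generation assumes one output tensor";
output_tensor = var->name;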

scopeMap[var] = var->name;
}
}
@@ -436,10 +437,20 @@ Stmt CodeGen_CUDA::simplifyFunctionBodies(Stmt stmt) {
return FunctionBodySimplifier().rewrite(stmt);
}

string CodeGen_CUDA::printDeviceFuncName(const vector<pair<string, Expr>> currentParameters, int index) {
string CodeGen_CUDA::printDeviceFuncName(const vector<pair<string, Expr>> currentParameters, int index, int flag) {
stringstream ret;
ret << "__global__" << endl;
ret << "void " << funcName << "DeviceKernel" << index << "(";
switch(flag) {
case PRINT_FUNC:
ret << "__global__" << endl;
ret << "void " << funcName << "DeviceKernel" << index << "(";
break;
case PRINT_MEM_HOST_TO_DEV:
ret << "void " << funcName << "MemcpyHostToDev" << index << "(";
break;
case PRINT_MEM_DEV_TO_HOST:
ret << "void " << funcName << "MemcpyDevToHost" << index << "(";
break;
}

string delimiter = "";
for (size_t i=0; i<currentParameters.size(); i++) {
@@ -462,7 +473,32 @@ string CodeGen_CUDA::printDeviceFuncName(const vector<pair<string, Expr>> curren
// No non-tensor parameters
delimiter = ", ";
}
if(flag == PRINT_MEM_HOST_TO_DEV || flag == PRINT_MEM_DEV_TO_HOST) {
ret << ", ";
string delimiter = "";
for (size_t i=0; i<currentParameters.size(); i++) {
auto var = currentParameters[i].second.as<Var>();
taco_iassert(var) << "Unable to convert output " << currentParameters[i].second
<< " to Var";
string varName = currentParameters[i].first;

if (var->is_tensor) {
ret << delimiter << "taco_tensor_t * __restrict__ " << varName << "_dev";
}
else {
auto tp = printCUDAType(var->type, var->is_ptr);
ret << delimiter << tp << " ";
if (!var->is_ptr) {
ret << "&";
}
ret << var->name;
}
// No non-tensor parameters
delimiter = ", ";
}
}
ret << ")";

return ret.str();
}
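For readability, the three headers this function now produces look roughly as follows, assuming a hypothetical kernel named compute, index 0, and a single tensor parameter A:

// PRINT_FUNC
__global__
void computeDeviceKernel0(taco_tensor_t * __restrict__ A)

// PRINT_MEM_HOST_TO_DEV: original parameters followed by their _dev counterparts
void computeMemcpyHostToDev0(taco_tensor_t * __restrict__ A, taco_tensor_t * __restrict__ A_dev)

// PRINT_MEM_DEV_TO_HOST: same parameter list as the host-to-device helper
void computeMemcpyDevToHost0(taco_tensor_t * __restrict__ A, taco_tensor_t * __restrict__ A_dev)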

@@ -574,7 +610,33 @@ void CodeGen_CUDA::printDeviceFuncCall(const vector<pair<string, Expr>> currentP
emittedTimerStartCode = true;
}

// for malloc
string delimiter = "";
for (size_t i=0; i<currentParameters.size(); i++) {
taco_iassert(currentParameters[i].second.as<Var>()) << "Unable to convert output " << currentParameters[i].second
<< " to Var";
string varName = currentParameters[i].first;
stream << "taco_tensor_t *"<< varName << "_dev = (taco_tensor_t *)malloc(sizeof(taco_tensor_t *));\n";
doIndent();
}

// for MemcpyHostToDev
stream << funcName << "MemcpyHostToDev" << index << "(";
for (size_t l=0; l<2; l++) {
for (size_t i=0; i<currentParameters.size(); i++) {
taco_iassert(currentParameters[i].second.as<Var>()) << "Unable to convert output " << currentParameters[i].second
<< " to Var";
string varName = currentParameters[i].first;
stream << delimiter << varName;
if(l == 1) stream << "_dev";

delimiter = ", ";
}
}
stream << ");\n\n";
doIndent();

// for DeviceKernel
stream << funcName << "DeviceKernel" << index << "<<<";
gridSize = ir::simplify(gridSize);
gridSize.accept(this);
@@ -583,7 +645,7 @@ void CodeGen_CUDA::printDeviceFuncCall(const vector<pair<string, Expr>> currentP
stream << ">>>";
stream << "(";

string delimiter = "";
delimiter = "";
for (size_t i=0; i<currentParameters.size(); i++) {
taco_iassert(currentParameters[i].second.as<Var>()) << "Unable to convert output " << currentParameters[i].second
<< " to Var";
@@ -605,8 +667,30 @@ void CodeGen_CUDA::printDeviceFuncCall(const vector<pair<string, Expr>> currentP
stream << "cudaEventElapsedTime(&tot_ms, event1, event2);\n";
}
doIndent();
stream << "cudaDeviceSynchronize();\n";
stream << "cudaDeviceSynchronize();\n\n";

// for MemcpyDevToHost
doIndent();
stream << funcName << "DeviceFree" << index << "(";
delimiter = "";
for (size_t i=0; i<currentParameters.size(); i++) {
taco_iassert(currentParameters[i].second.as<Var>()) << "Unable to convert output " << currentParameters[i].second
<< " to Var";
string varName = currentParameters[i].first;
stream << delimiter << varName << "_dev";

delimiter = ", ";
}
stream << ");\n";

// for free
for (size_t i=0; i<currentParameters.size(); i++) {
taco_iassert(currentParameters[i].second.as<Var>()) << "Unable to convert output " << currentParameters[i].second
<< " to Var";
string varName = currentParameters[i].first;
doIndent();
stream << "free("<< varName << "_dev);\n";
}
}
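Taken together, the host-side sequence printDeviceFuncCall now emits looks roughly like this; the kernel name compute, the single tensor A, and the launch bounds are illustrative, not taken from a generated kernel:

taco_tensor_t *A_dev = (taco_tensor_t *)malloc(sizeof(taco_tensor_t)); // host-side shell that will hold device pointers
computeMemcpyHostToDev0(A, A_dev); // cudaMalloc device buffers and copy the inputs over
computeDeviceKernel0<<<gridSize, blockSize>>>(A_dev); // run the kernel against the device copies
cudaDeviceSynchronize();
computeMemcpyDevToHost0(A, A_dev); // copy the output back and cudaFree the device buffers
free(A_dev);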


@@ -679,12 +763,6 @@ void CodeGen_CUDA::printDeviceFunctions(const Function* func) {
}
}

// Generate device function header
doIndent();
out << printDeviceFuncName(parameters, i);
out << "{\n";
indent++;

// Generate device function code
resetUniqueNameCounters();
vector<Expr> inputs;
@@ -710,8 +788,35 @@ void CodeGen_CUDA::printDeviceFunctions(const Function* func) {
blockloop->accept(&varFinder);
varMap = varFinder.varMap;



// Print MemcpyHostToDev function
out << printDeviceFuncName(parameters, i, PRINT_MEM_HOST_TO_DEV);
out << "{\n";
indent++;
out << printDecls(varFinder.varDecls, inputs, {}, PRINT_MEM_HOST_TO_DEV, deviceFunctionCollector.output_tensor) << endl;
indent--;
doIndent();
out << "}\n\n";

// Print MemcpyDevToHost function
out << printDeviceFuncName(parameters, i, PRINT_MEM_DEV_TO_HOST);
out << "{\n";
indent++;
out << printDecls(varFinder.varDecls, inputs, {}, PRINT_MEM_DEV_TO_HOST, deviceFunctionCollector.output_tensor) << endl;
indent--;
doIndent();
out << "}\n\n";


// Generate device function header
doIndent();
out << printDeviceFuncName(parameters, i, PRINT_FUNC);
out << "{\n";
indent++;

// Print variable declarations
out << printDecls(varFinder.varDecls, inputs, {}) << endl;
out << printDecls(varFinder.varDecls, inputs, {}, PRINT_FUNC, deviceFunctionCollector.output_tensor) << endl;
doIndent();
printBlockIDVariable(deviceFunctionCollector.blockIDVars[i], blockloop->start, blockloop->increment);
doIndent();
@@ -779,7 +884,7 @@ void CodeGen_CUDA::visit(const Function* func) {
localVars = varFinder.localVars;

// Print variable declarations
out << printDecls(varFinder.varDecls, func->inputs, func->outputs) << endl;
out << printDecls(varFinder.varDecls, func->inputs, func->outputs, PRINT_FUNC, "") << endl;

if (emittingCoroutine) {
out << printContextDeclAndInit(varMap, localVars, numYields, func->name)
@@ -1082,7 +1187,7 @@ void CodeGen_CUDA::visit(const Allocate* op) {
}

doIndent();
stream << "gpuErrchk(cudaMallocManaged((void**)&";
stream << "gpuErrchk(cudaMalloc((void**)&";
if (op->is_realloc) {
stream << variable_name;
}
2 changes: 1 addition & 1 deletion src/codegen/codegen_cuda.h
@@ -48,7 +48,7 @@ class CodeGen_CUDA : public CodeGen {
void visit(const Assign*);
void visit(const Break*);
void visit(const Free* op);
std::string printDeviceFuncName(const std::vector<std::pair<std::string, Expr>> currentParameters, int index);
std::string printDeviceFuncName(const std::vector<std::pair<std::string, Expr>> currentParameters, int index, int flag);
void printDeviceFuncCall(const std::vector<std::pair<std::string, Expr>> currentParameters, Expr blockSize, int index, Expr gridSize);
void printThreadIDVariable(std::pair<std::string, Expr> threadIDVar, Expr start, Expr increment, Expr numThreads);
void printBlockIDVariable(std::pair<std::string, Expr> blockIDVar, Expr start, Expr increment);