From 7d4b8b66415709d996061a6311ea2d6fdba78cf5 Mon Sep 17 00:00:00 2001
From: Adhhitha Dias <kadhitha@purdue.edu>
Date: Mon, 28 Jun 2021 17:36:53 -0400
Subject: [PATCH] minimal changes to support ispc exec

---
 .gitignore                     |  3 ++
 CMakeLists.txt                 |  7 ++++
 include/taco/cuda.h            | 10 +++++
 include/taco/version.h.in      |  1 +
 src/codegen/codegen.cpp        |  4 ++
 src/codegen/codegen_ispc.h     |  4 +-
 src/codegen/module.cpp         |  7 ++++
 src/cuda.cpp                   | 11 ++++++
 test/tests-scheduling-eval.cpp | 70 +++++++++++++++++++++++++++++++++-
 tools/taco.cpp                 | 19 +++++++++
 10 files changed, 132 insertions(+), 4 deletions(-)
diff --git a/.gitignore b/.gitignore
index 16389f34e..9abc3adc7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,3 +12,6 @@ CMakeCache.txt
 doc
 
 apps/tensor_times_vector/tensor_times_vector
+
+.cache
+compile_commands.json
diff --git a/CMakeLists.txt b/CMakeLists.txt
index a6a80d9d1..7e9359e01 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -10,10 +10,12 @@ project(taco
   LANGUAGES C CXX
 )
 option(CUDA "Build for NVIDIA GPU (CUDA must be preinstalled)" OFF)
+option(ISPC "Build for Intel ISPC Compiler (ISPC Compiler must be preinstalled)" OFF)
 option(PYTHON "Build TACO for python environment" OFF)
 option(OPENMP "Build with OpenMP execution support" OFF)
 option(COVERAGE "Build with code coverage analysis" OFF)
 set(TACO_FEATURE_CUDA 0)
+set(TACO_FEATURE_ISPC 0)
 set(TACO_FEATURE_OPENMP 0)
 set(TACO_FEATURE_PYTHON 0)
 if(CUDA)
@@ -22,6 +24,11 @@ if(CUDA)
   add_definitions(-DCUDA_BUILT)
   set(TACO_FEATURE_CUDA 1)
 endif(CUDA)
+if(ISPC)
+  message("-- Will build with ISPC code generation support")
+  add_definitions(-DISPC_BUILT)  # tested by include/taco/cuda.h
+  set(TACO_FEATURE_ISPC 1)       # substituted into include/taco/version.h.in
+endif(ISPC)
 if(OPENMP)
   message("-- Will use OpenMP for parallel execution")
   add_definitions(-DUSE_OPENMP)
diff --git a/include/taco/cuda.h b/include/taco/cuda.h
index aad6b5229..7ed545c6d 100644
--- a/include/taco/cuda.h
+++ b/include/taco/cuda.h
@@ -9,7 +9,17 @@
   #define CUDA_BUILT false
 #endif
 
+#ifndef ISPC_BUILT
+  #define ISPC_BUILT false
+#endif
+
 namespace taco {
+/// Functions used by taco to interface with ISPC
+/// Check if should use ISPC codegen
+bool should_use_ISPC_codegen();
+/// Enable or disable ISPC codegen (the flag starts at the ISPC_BUILT value)
+void set_ISPC_codegen_enabled(bool enabled);
+
 /// Functions used by taco to interface with CUDA (especially unified memory)
 /// Check if should use CUDA codegen
 bool should_use_CUDA_codegen();
diff --git a/include/taco/version.h.in b/include/taco/version.h.in
index bc5559d7d..8ef507598 100644
--- a/include/taco/version.h.in
+++ b/include/taco/version.h.in
@@ -20,5 +20,6 @@
 #define TACO_FEATURE_OPENMP @TACO_FEATURE_OPENMP@
 #define TACO_FEATURE_PYTHON @TACO_FEATURE_PYTHON@
 #define TACO_FEATURE_CUDA   @TACO_FEATURE_CUDA@
+#define TACO_FEATURE_ISPC   @TACO_FEATURE_ISPC@
 
 #endif /* TACO_VERSION_H */
diff --git a/src/codegen/codegen.cpp b/src/codegen/codegen.cpp
index f0c09d98a..f57f9950f 100644
--- a/src/codegen/codegen.cpp
+++ b/src/codegen/codegen.cpp
@@ -2,6 +2,7 @@
 #include "taco/cuda.h"
 #include "codegen_cuda.h"
 #include "codegen_c.h"
+#include "codegen_ispc.h"
 #include <algorithm>
 #include <unordered_set>
 
@@ -26,6 +27,9 @@ shared_ptr<CodeGen> CodeGen::init_default(std::ostream &dest, OutputKind outputK
   if (should_use_CUDA_codegen()) {
     return make_shared<CodeGen_CUDA>(dest, outputKind);
   }
+  else if (should_use_ISPC_codegen()) {
+    return make_shared<CodeGen_ISPC>(dest, outputKind);
+  }
   else {
     return make_shared<CodeGen_C>(dest, outputKind);
   }
diff --git a/src/codegen/codegen_ispc.h b/src/codegen/codegen_ispc.h
index e3c87ece5..35da5a01b 100644
--- a/src/codegen/codegen_ispc.h
+++ b/src/codegen/codegen_ispc.h
@@ -1,5 +1,5 @@
-#ifndef TACO_BACKEND_C_H
-#define TACO_BACKEND_C_H
+#ifndef TACO_BACKEND_ISPC_H
+#define TACO_BACKEND_ISPC_H
 #include <map>
 #include <vector>
 
diff --git a/src/codegen/module.cpp b/src/codegen/module.cpp
index bd0f487b1..409ed4a83 100644
--- a/src/codegen/module.cpp
+++ b/src/codegen/module.cpp
@@ -13,6 +13,7 @@
 #include "taco/util/strings.h"
 #include "taco/util/env.h"
 #include "codegen/codegen_c.h"
+#include "codegen/codegen_ispc.h"
 #include "codegen/codegen_cuda.h"
 #include "taco/cuda.h"
 
@@ -89,6 +90,9 @@ void writeShims(vector<Stmt> funcs, string path, string prefix) {
     if (should_use_CUDA_codegen()) {
       CodeGen_CUDA::generateShim(func, shims);
     }
+    else if (should_use_ISPC_codegen()) {
+      CodeGen_ISPC::generateShim(func, shims);
+    }
     else {
       CodeGen_C::generateShim(func, shims);
     }
@@ -98,6 +102,9 @@ void writeShims(vector<Stmt> funcs, string path, string prefix) {
   if (should_use_CUDA_codegen()) {
     shims_file.open(path+prefix+"_shims.cpp");
   }
+  else if (should_use_ISPC_codegen()) {
+    shims_file.open(path+prefix+".ispc", ios::app);
+  }
   else {
     shims_file.open(path+prefix+".c", ios::app);
   }
diff --git a/src/cuda.cpp b/src/cuda.cpp
index 059c60105..85139f874 100644
--- a/src/cuda.cpp
+++ b/src/cuda.cpp
@@ -7,6 +7,17 @@
 
 using namespace std;
 namespace taco {
+static bool ISPC_codegen_enabled = ISPC_BUILT;  // starts at the build-time setting
+/// Check if should use ISPC codegen
+bool should_use_ISPC_codegen() {
+  return ISPC_codegen_enabled;
+}
+
+/// Enable or disable ISPC codegen for subsequent should_use_ISPC_codegen() calls
+void set_ISPC_codegen_enabled(bool enabled) {
+  ISPC_codegen_enabled = enabled;
+}
+
 /// Functions used by taco to interface with CUDA (especially unified memory)
 static bool CUDA_codegen_enabled = CUDA_BUILT;
 static bool CUDA_unified_memory_enabled = CUDA_BUILT;
diff --git a/test/tests-scheduling-eval.cpp b/test/tests-scheduling-eval.cpp
index 52bd74ab4..f59359081 100644
--- a/test/tests-scheduling-eval.cpp
+++ b/test/tests-scheduling-eval.cpp
@@ -1,5 +1,7 @@
+#include <iostream>
 #include <taco/index_notation/transformations.h>
 #include <codegen/codegen_c.h>
+#include <codegen/codegen_ispc.h>
 #include <codegen/codegen_cuda.h>
 #include <fstream>
 #include "test.h"
@@ -44,6 +46,14 @@ IndexStmt scheduleSpMVCPU(IndexStmt stmt, int CHUNK_SIZE=16) {
           .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces);
 }
 
+/// Placeholder ISPC schedule for SpMV: returns `stmt` unchanged (CHUNK_SIZE is unused).
+IndexStmt scheduleSpMVISPC(IndexStmt stmt, int CHUNK_SIZE=16) {
+  // Disabled CPU-style schedule, kept for reference (needs IndexVar i0("i0"), i1("i1")):
+  //   stmt.split(i, i0, i1, CHUNK_SIZE).reorder({i0, i1, j})
+  //       .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces);
+  return stmt;
+}
+
 IndexStmt scheduleSpMMCPU(IndexStmt stmt, Tensor<double> A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) {
   IndexVar i0("i0"), i1("i1"), kbounded("kbounded"), k0("k0"), k1("k1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1");
   return stmt.split(i, i0, i1, CHUNK_SIZE)
@@ -1463,7 +1473,63 @@ TEST(scheduling_eval, mttkrpGPU) {
   ASSERT_TENSOR_EQ(expected, A);
 }
 
-TEST(generate_evaluation_files, DISABLED_cpu) {
+
+
+// Generates the ISPC spmv evaluation kernel source into eval_prepared_ispc/.
+TEST(generate_ispc_evaluation_files, ispc) {
+  // Force the ISPC backend for this test only. The guard restores the previous
+  // backend selection on every exit path so later tests are unaffected.
+  struct CodegenFlagsGuard {
+    bool cudaWasEnabled = should_use_CUDA_codegen();
+    bool ispcWasEnabled = should_use_ISPC_codegen();
+    ~CodegenFlagsGuard() {
+      set_CUDA_codegen_enabled(cudaWasEnabled);
+      set_ISPC_codegen_enabled(ispcWasEnabled);
+    }
+  } restoreCodegenFlags;
+
+  set_CUDA_codegen_enabled(false);
+  set_ISPC_codegen_enabled(true);
+
+  // One {CHUNK_SIZE} entry per spmv variant to generate.
+  vector<vector<int>> spmv_parameters = {{32}};
+
+  int NUM_I = 100;
+  int NUM_J = 100;
+
+  string file_path = "eval_prepared_ispc/";
+  // The mkdir result is not checked (the directory may already exist); a
+  // failure to open the output file below is what fails the test.
+  mkdir(file_path.c_str(), 0777);
+
+  // spmv
+  {
+    stringstream source;
+    std::shared_ptr<ir::CodeGen> codegen = ir::CodeGen::init_default(source, ir::CodeGen::ImplementationGen);
+    Tensor<double> A("A", {NUM_I, NUM_J}, CSR);
+    Tensor<double> x("x", {NUM_J}, {Dense});
+    Tensor<double> y("y", {NUM_I}, {Dense});
+    y(i) = A(i, j) * x(j);
+    std::cout << "concretizing the assignment statement\n";
+    IndexStmt stmt = y.getAssignment().concretize();
+    std::cout << "Printing the original IndexStmt: " << stmt << std::endl;
+    for (auto paramSet : spmv_parameters) {
+      std::cout << "param set: " << paramSet[0] << std::endl;
+      IndexStmt scheduled = scheduleSpMVISPC(stmt, paramSet[0]);
+      std::cout << "scheduled IndexStmt: " << scheduled << std::endl;
+      ir::Stmt compute = lower(scheduled, "spmv_csr_ispc_taco",  false, true);
+      std::cout << "computed statement: \n" << compute << std::endl;
+      codegen->compile(compute, false);
+    }
+    ofstream source_file;
+    source_file.open(file_path + "spmv_csr_ispc_taco.h");
+    ASSERT_TRUE(source_file.is_open()) << "cannot write to " << file_path;
+    source_file << source.str();
+    source_file.close();
+  }
+}
+
+TEST(generate_evaluation_files, cpu) {
   if (should_use_CUDA_codegen()) {
     return;
   }
@@ -1779,7 +1845,7 @@ TEST(generate_evaluation_files, DISABLED_cpu) {
   }
 }
 
-TEST(generate_evaluation_files, DISABLED_gpu) {
+TEST(generate_evaluation_files, gpu) {
   if (!should_use_CUDA_codegen()) {
     return;
   }
diff --git a/tools/taco.cpp b/tools/taco.cpp
index cd351a203..ce03b61e1 100644
--- a/tools/taco.cpp
+++ b/tools/taco.cpp
@@ -20,6 +20,7 @@
 #include "taco/lower/lower.h"
 #include "taco/codegen/module.h"
 #include "codegen/codegen_c.h"
+#include "codegen/codegen_ispc.h"
 #include "codegen/codegen_cuda.h"
 #include "codegen/codegen.h"
 #include "taco/util/strings.h"
@@ -188,6 +189,8 @@ static void printUsageInfo() {
   cout << endl;
   printFlag("print-nocolor", "Print without colors.");
   cout << endl;
+  printFlag("ispc", "Generate ISPC code for Intel CPUs");
+  cout << endl;
   printFlag("cuda", "Generate CUDA code for NVIDIA GPUs");
   cout << endl;
   printFlag("schedule", "Specify parallel execution schedule");
@@ -279,6 +282,8 @@ static void printVersionInfo() {
     cout << "Built with Python support." << endl;
   if(TACO_FEATURE_CUDA)
     cout << "Built with CUDA support." << endl;
+  if(TACO_FEATURE_ISPC)
+    cout << "Built with ISPC support." << endl;
   cout << endl;
   cout << "Built on: " << TACO_BUILD_DATE << endl;
   cout << "CMake build type: " << TACO_BUILD_TYPE << endl;
@@ -641,6 +646,7 @@ int main(int argc, char* argv[]) {
   bool color               = true;
   bool readKernels         = false;
   bool cuda                = false;
+  bool ispc                = false;
 
   bool setSchedule         = false;
 
@@ -949,6 +955,10 @@ int main(int argc, char* argv[]) {
     else if ("-cuda" == argName) {
       cuda = true;
     }
+    else if ("-ispc" == argName) {
+      // Only record the request here; the codegen backend is set after parsing.
+      ispc = true;
+    }
     else if ("-schedule" == argName) {
       vector<string> descriptor = util::split(argValue, ",");
       if (descriptor.size() > 2 || descriptor.empty()) {
@@ -1129,9 +1139,18 @@ int main(int argc, char* argv[]) {
       return reportError("TACO must be built for CUDA (cmake -DCUDA=ON ..) to benchmark", 2);
     }
     set_CUDA_codegen_enabled(true);
+    set_ISPC_codegen_enabled(false);
+  }
+  else if (ispc) {
+    if (!ISPC_BUILT && benchmark) {  // same build requirement as the CUDA branch
+      return reportError("TACO must be built for ISPC (cmake -DISPC=ON ..) to benchmark", 2);
+    }
+    set_CUDA_codegen_enabled(false);
+    set_ISPC_codegen_enabled(true);
   }
   else {
     set_CUDA_codegen_enabled(false);
+    set_ISPC_codegen_enabled(false);
   }
 
   stmt = scalarPromote(stmt);