Merge pull request #4 from rai-project/feature/persistent_granularity
Feature/persistent granularity
abdul dakkak authored Apr 11, 2018
2 parents ffd91b2 + 2b744ca commit 0f2d163
Showing 19 changed files with 1,320 additions and 261 deletions.
1 change: 1 addition & 0 deletions README.md
@@ -132,6 +132,7 @@ The server is part of the MXNet build process.
| UPR_INPUT_MEAN_B | | 0 |
| UPR_ENABLE_MEMORY_PROFILE | | false |
| UPR_ENABLE_CUDA_FREE | | false |
| UPR_SHARING_GRANULARITY | | model |
| -------------------------- | ----------- | ------------- |
| UPRD_EVICTION_POLICY | | LRU |
| UPRD_ESTIMATION_RATE | | 1.0 |
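The new `UPR_SHARING_GRANULARITY` entry defaults to `model`, matching the `dmlc::GetEnv` default this commit adds to `src/c_api/ipc.h` below. A minimal sketch of reading and validating the knob the way the client does; the `Granularity` enum and `parse_sharing_granularity` helper are illustrative, not part of this commit:

```cpp
// Illustrative only: read UPR_SHARING_GRANULARITY with a "model" default,
// mirroring the dmlc::GetEnv call added to src/c_api/ipc.h, and reject
// anything other than the two values client::Open understands.
#include <stdexcept>
#include <string>
#include <dmlc/parameter.h>  // dmlc::GetEnv

enum class Granularity { Model, Layer };  // hypothetical enum, not the protobuf one

inline Granularity parse_sharing_granularity() {
  const auto value = dmlc::GetEnv("UPR_SHARING_GRANULARITY", std::string("model"));
  if (value == "model") return Granularity::Model;
  if (value == "layer") return Granularity::Layer;
  throw std::runtime_error("invalid UPR_SHARING_GRANULARITY: " + value);
}
```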
6 changes: 3 additions & 3 deletions config.mk
@@ -45,7 +45,7 @@ export NVCC = nvcc
DEV = 0

# whether compile with debug
DEBUG = 0
0EBUG = 0

# whether compile with profiler
USE_PROFILER =
@@ -174,7 +174,7 @@ USE_S3 = 0
# performance settings
#----------------------------
# Use operator tuning
USE_OPERATOR_TUNING = 1
USE_OPERATOR_TUNING = 0

# Use gperftools if found
USE_GPERFTOOLS = 1
@@ -225,7 +225,7 @@ USE_PROFILER=1
# whether compile with options for MXNet developer
DEV = 0

DEBUG = 1
DEBUG = 0
USE_GLOG=1
USE_OPERATOR_TUNING = 0
USE_OPENMP = 0
9 changes: 6 additions & 3 deletions example/image-classification/predict-cpp/Makefile
@@ -30,16 +30,19 @@ CFLAGS+=-DUSE_CUDNN=1
CFLAGS+=-DMSHADOW_USE_CUDNN=1
CFLAGS+=-DNO_OPENMP=1
CFLAGS+=-DUSE_CUDA=1
CFLAGS+=-DUSE_GPERFTOOLS=1
CFLAGS+=-Xcompiler -fno-builtin-malloc,-fno-builtin-calloc,-fno-builtin-realloc,-fno-builtin-free
CFLAGS+=-DMXNET_USE_CUDA=1
CFLAGS+= -Xcompiler -finstrument-functions
#CFLAGS+= -Xcompiler -finstrument-functions
LDFLAGS+=$(MXNET_ROOT)/lib/libmxnet.so -Xcompiler -finstrument-functions
LDFLAGS+=-ltcmalloc

image-classification-predict: image-classification-predict.o
nvcc -O3 -o image-classification-predict image-classification-predict.o $(LDFLAGS)
nvcc -O3 -g -o image-classification-predict image-classification-predict.o $(LDFLAGS)

image-classification-predict.o: image-classification-predict.cc
echo "CFLAGS = " $(CFLAGS)
nvcc -O3 -c image-classification-predict.cc $(CFLAGS)
nvcc -O3 -g -c image-classification-predict.cc $(CFLAGS)

clean:
rm -f image-classification-predict
example/image-classification/predict-cpp/image-classification-predict.cc
@@ -282,10 +282,10 @@ int main(int argc, char *argv[]) {
MXSetProfilerState(0);

// // Synset path for your model, you have to modify it
// std::vector<std::string> synset = LoadSynset(synset_file);

// // Print Output Data
// PrintOutputResult(data, size, synset);

return 0;
}
10 changes: 6 additions & 4 deletions example/image-classification/predict-cpp/test.sh
@@ -11,12 +11,14 @@ export GLOG_logtostderr=1

export UPR_ENABLED=true
export UPR_CLIENT=1
# export UPR_INITIALIZE_EAGER=true
export UPR_INITIALIZE_EAGER=true
# export UPR_ENABLE_MEMORY_PROFILE=true

UPR_MODEL_NAME=inception_3.0 ./image-classification-predict
# UPR_MODEL_NAME=bvlc_googlenet_1.0 ./image-classification-predict
# UPR_MODEL_NAME=bvlc_alexnet_1.0 ./image-classification-predict&
#UPR_MODEL_NAME=inception_3.0 ./image-classification-predict
#UPR_MODEL_NAME=bvlc_googlenet_1.0 ./image-classification-predict
UPR_MODEL_NAME=vgg16_1.0 ./image-classification-predict
#UPR_MODEL_NAME=squeezenet_1.0 ./image-classification-predict
#UPR_MODEL_NAME=bvlc_alexnet_1.0 ./image-classification-predict
# UPR_MODEL_NAME=bvlc_alexnet_1.0 ./image-classification-predict&
# UPR_MODEL_NAME=bvlc_googlenet_1.0 ./image-classification-predict &
# UPR_MODEL_NAME=bvlc_googlenet_1.0 ./image-classification-predict &
138 changes: 88 additions & 50 deletions src/c_api/ipc.cc
Expand Up @@ -27,83 +27,106 @@ std::string server::host_name = "localhost";
int server::port = dmlc::GetEnv("PORT", 50051);
std::string server::address = fmt::format("{}:{}", host_name, port);


static TShape to_shape(Shape shape) {
auto dim = shape.dim();
TShape res(dim.begin(), dim.end());
return res;
}

static void *get_device_ptr(const Layer &layer) {
const auto ipc_handle = layer.ipc_handle();
if (ipc_handle == "") {
const auto msg = fmt::format("unable to get device ptr from {}. make sure handle is not empty", ipc_handle);
static void *get_device_ptr_offset(const Layer &layer, void *devPtr) {
const auto offset = layer.offset();
return (void *) (((char *) (devPtr)) + offset);
}

static void *get_device_ptr(const std::string &handle_bytes) {
if (handle_bytes == "") {
const auto msg = fmt::format("unable to get device ptr from {}. make sure handle is not empty", handle_bytes);
LOG(FATAL) << msg;
throw dmlc::Error(msg);
}

cudaIpcMemHandle_t handle;
memcpy((uint8_t *) &handle, ipc_handle.c_str(), sizeof(handle));
memcpy((uint8_t *) &handle, handle_bytes.c_str(), sizeof(handle));

// LOG(INFO) << "get handle = " << handle << "get base64 handle = " << utils::base64_encode(ipc_handle);
void *device_ptr = nullptr;
CUDA_CHECK_CALL(cudaIpcOpenMemHandle((void **) &device_ptr, handle, cudaIpcMemLazyEnablePeerAccess),
fmt::format("failed to open cuda ipc mem handle from {}", utils::base64_encode(handle_bytes)));

auto name = layer.name();
return device_ptr;
}

static const std::string arg_prefix("arg:");
if (string_starts_with(name, arg_prefix)) {
name.erase(0, arg_prefix.size());
}
static const std::string aux_prefix("aux:");
if (string_starts_with(name, aux_prefix)) {
name.erase(0, aux_prefix.size());
}
static void *get_device_ptr(const Layer &layer) {
auto name = layer.name();
const auto ipc_handle = layer.ipc_handle();

void *device_ptr;
auto span = start_span("cudaIpcOpenMemHandle", span_category_ipc, span_props{{"layer", name},
{"byte_count", std::to_string(layer.byte_count())}});
CUDA_CHECK_CALL(cudaIpcOpenMemHandle((void **) &device_ptr, handle, cudaIpcMemLazyEnablePeerAccess),
fmt::format("failed to open cuda ipc mem handle from {}", utils::base64_encode(ipc_handle)));
stop_span(span);
static const std::string arg_prefix("arg:");
if (string_starts_with(name, arg_prefix)) {
name.erase(0, arg_prefix.size());
}
static const std::string aux_prefix("aux:");
if (string_starts_with(name, aux_prefix)) {
name.erase(0, aux_prefix.size());
}

// LOG(INFO) << "get device_ptr = " << device_ptr;
auto span = start_span("cudaIpcOpenMemHandle",
span_category_ipc,
span_props{{"layer", name}, {"byte_count", std::to_string(layer.byte_count())}});
auto device_ptr = get_device_ptr(ipc_handle);
stop_span(span);

return device_ptr;
}

static void to_ndarray(std::vector<NDArray> *arrays, const Layer &layer) {
const auto ctx = get_ctx();

auto span = start_span("to_nd_array", span_category_serialization, span_props{{"layer", layer.name()}});
defer(stop_span(span));

const auto shape = to_shape(layer.shape());
static void to_ndarrays(std::vector<NDArray> *arrays, std::vector<std::string> *keys, const ModelHandle &model_handle) {
const auto ctx = get_ctx();
const auto dev_mask = ctx.dev_mask();
const auto dev_id = ctx.dev_id;

// LOG(INFO) << "in layer=" << layer.name() << " getting device ptr using ctx = " << ctx;

auto device_ptr = get_device_ptr(layer);

auto span_creating =
start_span("creating_nd_array", span_category_serialization, span_props{{"layer", layer.name()}});
defer(stop_span(span_creating));

TBlob blob(device_ptr, shape, dev_mask, dev_id);
arrays->emplace_back(blob, dev_id, /* is_shared = */ true);

return;
}

static void to_ndarrays(std::vector<NDArray> *arrays, std::vector<std::string> *keys, const ModelHandle &reply) {
const auto layers = reply.layer();
const auto layers = model_handle.layer();

// LOG(INFO) << "got " << layers.size() << " layers form reply, before to_ndarray";

for (const auto layer : layers) {
keys->emplace_back(layer.name());
to_ndarray(arrays, layer);
if (model_handle.sharing_granularity() == SharingGranularity_Model) {
auto ipc_open_span = start_span(
"cudaIpcOpenMemHandle",
span_category_ipc,
span_props{{"model", model_handle.name()}, {"byte_count", std::to_string(model_handle.byte_count())}});
auto base_device_ptr = get_device_ptr(model_handle.ipc_handle());
stop_span(ipc_open_span);

for (const auto layer : layers) {
//auto create_layer_span = start_span("to_nd_array",
// span_category_serialization,
// span_props{{"layer", layer.name()}, {"sharing_granularity", "model"}});

keys->emplace_back(layer.name());
const auto shape = to_shape(layer.shape());
auto device_ptr = get_device_ptr_offset(layer, base_device_ptr);
TBlob blob(device_ptr, shape, dev_mask, dev_id);
arrays->emplace_back(blob, dev_id, /* is_shared = */ true);

//stop_span(create_layer_span);
}
return;
}
if (model_handle.sharing_granularity() == SharingGranularity_Layer) {
for (const auto layer : layers) {
//auto create_layer_span = start_span("to_nd_array",
// span_category_serialization,
// span_props{{"layer", layer.name()}, {"sharing_granularity", "layer"}});

keys->emplace_back(layer.name());
const auto shape = to_shape(layer.shape());
auto device_ptr = get_device_ptr(layer);
TBlob blob(device_ptr, shape, dev_mask, dev_id);
arrays->emplace_back(blob, dev_id, /* is_shared = */ true);

//stop_span(create_layer_span);
}
return;
}

// LOG(INFO) << "finished nd_array conversion";
throw dmlc::Error("invalid granularity");

return;
}
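The heart of the model-granularity path above: the daemon shares a single device allocation per model, each `Layer` carries a byte `offset` into it, and the client opens one `cudaIpcMemHandle_t` instead of one per layer. A standalone sketch of that mapping, with a hypothetical `LayerMeta` struct standing in for the protobuf `Layer` message and error handling reduced to simple checks:

```cpp
// Sketch of the client-side model-granularity mapping (assumptions: one
// cudaMalloc'd arena per model on the server, handle bytes delivered out of
// band, LayerMeta standing in for the protobuf Layer message).
#include <cstring>
#include <stdexcept>
#include <string>
#include <vector>
#include <cuda_runtime.h>

struct LayerMeta {
  std::string name;
  size_t offset;  // byte offset of this layer's weights within the arena
};

inline void *open_model_arena(const std::string &handle_bytes) {
  if (handle_bytes.size() != sizeof(cudaIpcMemHandle_t)) {
    throw std::runtime_error("bad ipc handle size");
  }
  cudaIpcMemHandle_t handle;
  std::memcpy(&handle, handle_bytes.data(), sizeof(handle));
  void *base = nullptr;
  if (cudaIpcOpenMemHandle(&base, handle, cudaIpcMemLazyEnablePeerAccess) != cudaSuccess) {
    throw std::runtime_error("cudaIpcOpenMemHandle failed");
  }
  return base;  // one mapping for the whole model
}

inline std::vector<void *> layer_pointers(void *base, const std::vector<LayerMeta> &layers) {
  std::vector<void *> ptrs;
  for (const auto &layer : layers) {
    // Same arithmetic as get_device_ptr_offset: base + per-layer byte offset.
    ptrs.push_back(static_cast<char *>(base) + layer.offset);
  }
  return ptrs;
}
```

Layer granularity falls back to one `cudaIpcOpenMemHandle` call per layer, which is exactly the overhead the model path amortizes into a single open.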
@@ -158,6 +181,14 @@ struct client {
ModelHandle Open(const std::string &model_name) {
ModelRequest request;
request.set_name(model_name);
if (UPR_SHARING_GRANULARITY == "model") {
request.set_sharing_granularity(SharingGranularity_Model);
} else if (UPR_SHARING_GRANULARITY == "layer") {
request.set_sharing_granularity(SharingGranularity_Layer);
} else {
throw dmlc::Error(
    fmt::format("Error: [{}]. failed to determine sharing granularity.", UPR_SHARING_GRANULARITY));
}
return this->Open(request);
}
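`client::Open` only consumes handles; the daemon side that produces them is not part of this diff. A hypothetical sketch of the export step, assuming the daemon keeps one `cudaMalloc` arena per loaded model (`cudaIpcGetMemHandle` must be given the base pointer of the allocation):

```cpp
// Hypothetical server-side counterpart (not in this diff): export the model's
// arena once and ship the raw handle bytes in the ModelHandle; clients
// reconstruct the handle with memcpy, as get_device_ptr does above.
#include <string>
#include <cuda_runtime.h>

inline std::string export_model_arena(void *arena_base) {
  cudaIpcMemHandle_t handle;
  if (cudaIpcGetMemHandle(&handle, arena_base) != cudaSuccess) {
    return "";  // empty handle: the client's get_device_ptr treats this as fatal
  }
  return std::string(reinterpret_cast<const char *>(&handle), sizeof(handle));
}
```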

@@ -218,6 +249,7 @@ struct client {
span_category_serialization,
span_props{{"model_id", open_reply.model_id()},
{"byte_count", std::to_string(open_reply.byte_count())},
{"needed_eviction", std::to_string(open_reply.needed_eviction())},
{"nlayers", std::to_string(open_reply.layer().size())}});
defer(stop_span(span_converting));

@@ -247,5 +279,11 @@ void Unload(MXAPIPredictor *pred) {
return;
}

void initialize() {
if (is_client && UPR_ENABLED) {
client::get_connection();
}
}

} // namespace upr
#endif // MXNET_USE_CUDA
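The new `upr::initialize()` gives clients a way to pre-establish the daemon connection; the `UPR_INITIALIZE_EAGER` toggle in `test.sh` above hints at the intended use. A hypothetical eager hookup (not in this diff), run once at static-initialization time:

```cpp
// Hypothetical (not in this diff): trigger upr::initialize() at library load
// when UPR_INITIALIZE_EAGER is set, so the connection cost is paid before the
// first inference request instead of during it.
#include <dmlc/parameter.h>
#include "ipc.h"  // assumed include path for src/c_api/ipc.h

namespace {
struct EagerUprInit {
  EagerUprInit() {
    if (dmlc::GetEnv("UPR_INITIALIZE_EAGER", false)) {
      upr::initialize();
    }
  }
} eager_upr_init;  // runs during static initialization
}  // namespace
```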
19 changes: 12 additions & 7 deletions src/c_api/ipc.h
@@ -78,6 +78,7 @@ static const auto UPR_BASE_DIR = dmlc::GetEnv("UPR_BASE_DIR", HOME + std::stri

static const auto UPR_ENABLE_MEMORY_PROFILE = dmlc::GetEnv("UPR_ENABLE_MEMORY_PROFILE", false);
static const auto UPR_ENABLE_CUDA_FREE = dmlc::GetEnv("UPR_ENABLE_CUDA_FREE", false);
static const auto UPR_SHARING_GRANULARITY = dmlc::GetEnv("UPR_SHARING_GRANULARITY", std::string("model"));

static const auto UPRD_EVICTION_POLICY = dmlc::GetEnv("UPRD_EVICTION_POLICY", std::string("lru"));
static const auto UPRD_ESTIMATION_RATE = dmlc::GetEnv("UPRD_ESTIMATION_RATE", 1.0);
@@ -239,8 +240,8 @@ static inline engine::OprExecStat *start_span(const std::string &name, std::stri
#if MXNET_USE_PROFILER
const auto ctx = get_ctx();
auto opr_stat = engine::Profiler::Get()->AddOprStat(ctx.dev_type, ctx.dev_id, name);
uint64_t tid = std::hash<std::thread::id>()(std::this_thread::get_id());
engine::SetOprCategory(opr_stat, category);
// uint64_t tid = std::hash<std::thread::id>()(std::this_thread::get_id());
opr_stat->category = category;
engine::SetOprStart(opr_stat);
return opr_stat;
#else
@@ -250,11 +251,13 @@ static inline engine::OprExecStat *start_span(const std::string &name, std::stri

static inline engine::OprExecStat *start_span(const std::string &name, std::string category, span_props props) {
#if MXNET_USE_PROFILER
auto span = start_span(name, category);
for (const auto kv : props) {
engine::AddOprMetadata(span, kv.first, kv.second);
}
return span;
const auto ctx = get_ctx();
auto opr_stat = engine::Profiler::Get()->AddOprStat(ctx.dev_type, ctx.dev_id, name);
// uint64_t tid = std::hash<std::thread::id>()(std::this_thread::get_id());
opr_stat->category = category;
opr_stat->metadata = props;
engine::SetOprStart(opr_stat);
return opr_stat;
#else
return nullptr;
#endif
@@ -443,5 +446,7 @@ void Unload(mxnet::MXAPIPredictor *pred);

std::pair<std::string, std::string> Load(std::string model_name, std::vector<mxnet::NDArray> *data,
std::vector<std::string> *keys);

void initialize();
} // namespace upr
#endif // MXNET_USE_CUDA
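The `start_span` refactor above makes the three-argument overload build the `OprExecStat` directly (assigning the `category` and `metadata` fields) instead of delegating to the two-argument version plus `engine::AddOprMetadata`. A usage sketch matching the call sites in `ipc.cc`, assuming the helpers live in namespace `upr` as the unqualified calls there suggest:

```cpp
// Usage sketch for the refactored profiler spans. Assumes ipc.h is on the
// include path; span_category_ipc and span_props are declared there. The
// span is nullptr when MXNET_USE_PROFILER is compiled out.
#include <string>
#include "ipc.h"

void traced_open(const std::string &model_name, size_t byte_count) {
  auto span = upr::start_span("cudaIpcOpenMemHandle",
                              upr::span_category_ipc,
                              upr::span_props{{"model", model_name},
                                              {"byte_count", std::to_string(byte_count)}});
  // ... the work being traced, e.g. the actual cudaIpcOpenMemHandle call ...
  upr::stop_span(span);
}
```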