Merge pull request #4 from rai-project/feature/persistent_granularity
Feature/persistent granularity
abdul dakkak authored Apr 11, 2018
2 parents ffd91b2 + 2b744ca commit 0f2d163
Showing 19 changed files with 1,320 additions and 261 deletions.
1 change: 1 addition & 0 deletions README.md
@@ -132,6 +132,7 @@ The server is part of the MXNet build process.
| UPR_INPUT_MEAN_B | | 0 |
| UPR_ENABLE_MEMORY_PROFILE | | false |
| UPR_ENABLE_CUDA_FREE | | false |
| UPR_SHARING_GRANULARITY | | model |
| -------------------------- | ----------- | ------------- |
| UPRD_EVICTION_POLICY | | LRU |
| UPRD_ESTIMATION_RATE | | 1.0 |
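The new `UPR_SHARING_GRANULARITY` entry defaults to `model`, matching the `dmlc::GetEnv` default this commit adds to `src/c_api/ipc.h` below. A minimal sketch of reading and validating the knob the way the client does; the `Granularity` enum and `parse_sharing_granularity` helper are illustrative, not part of this commit:

```cpp
// Illustrative only: read UPR_SHARING_GRANULARITY with a "model" default,
// mirroring the dmlc::GetEnv call added to src/c_api/ipc.h, and reject
// anything other than the two values client::Open understands.
#include <stdexcept>
#include <string>
#include <dmlc/parameter.h>  // dmlc::GetEnv

enum class Granularity { Model, Layer };  // hypothetical enum, not the protobuf one

inline Granularity parse_sharing_granularity() {
  const auto value = dmlc::GetEnv("UPR_SHARING_GRANULARITY", std::string("model"));
  if (value == "model") return Granularity::Model;
  if (value == "layer") return Granularity::Layer;
  throw std::runtime_error("invalid UPR_SHARING_GRANULARITY: " + value);
}
```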
6 changes: 3 additions & 3 deletions config.mk
@@ -45,7 +45,7 @@ export NVCC = nvcc
DEV = 0

# whether compile with debug
DEBUG = 0
0EBUG = 0

# whether compile with profiler
USE_PROFILER =
@@ -174,7 +174,7 @@ USE_S3 = 0
# performance settings
#----------------------------
# Use operator tuning
USE_OPERATOR_TUNING = 1
USE_OPERATOR_TUNING = 0

# Use gperftools if found
USE_GPERFTOOLS = 1
@@ -225,7 +225,7 @@ USE_PROFILER=1
# whether compile with options for MXNet developer
DEV = 0

DEBUG = 1
DEBUG = 0
USE_GLOG=1
USE_OPERATOR_TUNING = 0
USE_OPENMP = 0
9 changes: 6 additions & 3 deletions example/image-classification/predict-cpp/Makefile
@@ -30,16 +30,19 @@ CFLAGS+=-DUSE_CUDNN=1
CFLAGS+=-DMSHADOW_USE_CUDNN=1
CFLAGS+=-DNO_OPENMP=1
CFLAGS+=-DUSE_CUDA=1
CFLAGS+=-DUSE_GPERFTOOLS=1
CFLAGS+=-Xcompiler -fno-builtin-malloc,-fno-builtin-calloc,-fno-builtin-realloc,-fno-builtin-free
CFLAGS+=-DMXNET_USE_CUDA=1
CFLAGS+= -Xcompiler -finstrument-functions
#CFLAGS+= -Xcompiler -finstrument-functions
LDFLAGS+=$(MXNET_ROOT)/lib/libmxnet.so -Xcompiler -finstrument-functions
LDFLAGS+=-ltcmalloc

image-classification-predict: image-classification-predict.o
nvcc -O3 -o image-classification-predict image-classification-predict.o $(LDFLAGS)
nvcc -O3 -g -o image-classification-predict image-classification-predict.o $(LDFLAGS)

image-classification-predict.o: image-classification-predict.cc
echo "CFLAGS = " $(CFLAGS)
nvcc -O3 -c image-classification-predict.cc $(CFLAGS)
nvcc -O3 -g -c image-classification-predict.cc $(CFLAGS)

clean:
rm -f image-classification-predict
example/image-classification/predict-cpp/image-classification-predict.cc
@@ -282,10 +282,10 @@ int main(int argc, char *argv[]) {
MXSetProfilerState(0);

// // Synset path for your model, you have to modify it
// std::vector<std::string> synset = LoadSynset(synset_file);

// // Print Output Data
// PrintOutputResult(data, size, synset);

return 0;
}
10 changes: 6 additions & 4 deletions example/image-classification/predict-cpp/test.sh
@@ -11,12 +11,14 @@ export GLOG_logtostderr=1

export UPR_ENABLED=true
export UPR_CLIENT=1
# export UPR_INITIALIZE_EAGER=true
export UPR_INITIALIZE_EAGER=true
# export UPR_ENABLE_MEMORY_PROFILE=true

UPR_MODEL_NAME=inception_3.0 ./image-classification-predict
# UPR_MODEL_NAME=bvlc_googlenet_1.0 ./image-classification-predict
# UPR_MODEL_NAME=bvlc_alexnet_1.0 ./image-classification-predict&
#UPR_MODEL_NAME=inception_3.0 ./image-classification-predict
#UPR_MODEL_NAME=bvlc_googlenet_1.0 ./image-classification-predict
UPR_MODEL_NAME=vgg16_1.0 ./image-classification-predict
#UPR_MODEL_NAME=squeezenet_1.0 ./image-classification-predict
#UPR_MODEL_NAME=bvlc_alexnet_1.0 ./image-classification-predict
# UPR_MODEL_NAME=bvlc_alexnet_1.0 ./image-classification-predict&
# UPR_MODEL_NAME=bvlc_googlenet_1.0 ./image-classification-predict &
# UPR_MODEL_NAME=bvlc_googlenet_1.0 ./image-classification-predict &
138 changes: 88 additions & 50 deletions src/c_api/ipc.cc
Expand Up @@ -27,83 +27,106 @@ std::string server::host_name = "localhost";
int server::port = dmlc::GetEnv("PORT", 50051);
std::string server::address = fmt::format("{}:{}", host_name, port);


static TShape to_shape(Shape shape) {
auto dim = shape.dim();
TShape res(dim.begin(), dim.end());
return res;
}

static void *get_device_ptr(const Layer &layer) {
const auto ipc_handle = layer.ipc_handle();
if (ipc_handle == "") {
const auto msg = fmt::format("unable to get device ptr from {}. make sure handle is not empty", ipc_handle);
static void *get_device_ptr_offset(const Layer &layer, void *devPtr) {
const auto offset = layer.offset();
return (void *) (((char *) (devPtr)) + offset);
}

static void *get_device_ptr(const std::string &handle_bytes) {
if (handle_bytes == "") {
const auto msg = fmt::format("unable to get device ptr from {}. make sure handle is not empty", handle_bytes);
LOG(FATAL) << msg;
throw dmlc::Error(msg);
}

cudaIpcMemHandle_t handle;
memcpy((uint8_t *) &handle, ipc_handle.c_str(), sizeof(handle));
memcpy((uint8_t *) &handle, handle_bytes.c_str(), sizeof(handle));

// LOG(INFO) << "get handle = " << handle << "get base64 handle = " << utils::base64_encode(ipc_handle);
void *device_ptr = nullptr;
CUDA_CHECK_CALL(cudaIpcOpenMemHandle((void **) &device_ptr, handle, cudaIpcMemLazyEnablePeerAccess),
fmt::format("failed to open cuda ipc mem handle from {}", utils::base64_encode(handle_bytes)));

auto name = layer.name();
return device_ptr;
}

static const std::string arg_prefix("arg:");
if (string_starts_with(name, arg_prefix)) {
name.erase(0, arg_prefix.size());
}
static const std::string aux_prefix("aux:");
if (string_starts_with(name, aux_prefix)) {
name.erase(0, aux_prefix.size());
}
static void *get_device_ptr(const Layer &layer) {
auto name = layer.name();
const auto ipc_handle = layer.ipc_handle();

void *device_ptr;
auto span = start_span("cudaIpcOpenMemHandle", span_category_ipc, span_props{{"layer", name},
{"byte_count", std::to_string(layer.byte_count())}});
CUDA_CHECK_CALL(cudaIpcOpenMemHandle((void **) &device_ptr, handle, cudaIpcMemLazyEnablePeerAccess),
fmt::format("failed to open cuda ipc mem handle from {}", utils::base64_encode(ipc_handle)));
stop_span(span);
static const std::string arg_prefix("arg:");
if (string_starts_with(name, arg_prefix)) {
name.erase(0, arg_prefix.size());
}
static const std::string aux_prefix("aux:");
if (string_starts_with(name, aux_prefix)) {
name.erase(0, aux_prefix.size());
}

// LOG(INFO) << "get device_ptr = " << device_ptr;
auto span = start_span("cudaIpcOpenMemHandle",
span_category_ipc,
span_props{{"layer", name}, {"byte_count", std::to_string(layer.byte_count())}});
auto device_ptr = get_device_ptr(ipc_handle);
stop_span(span);

return device_ptr;
}

static void to_ndarray(std::vector<NDArray> *arrays, const Layer &layer) {
const auto ctx = get_ctx();

auto span = start_span("to_nd_array", span_category_serialization, span_props{{"layer", layer.name()}});
defer(stop_span(span));

const auto shape = to_shape(layer.shape());
static void to_ndarrays(std::vector<NDArray> *arrays, std::vector<std::string> *keys, const ModelHandle &model_handle) {
const auto ctx = get_ctx();
const auto dev_mask = ctx.dev_mask();
const auto dev_id = ctx.dev_id;

// LOG(INFO) << "in layer=" << layer.name() << " getting device ptr using ctx = " << ctx;

auto device_ptr = get_device_ptr(layer);

auto span_creating =
start_span("creating_nd_array", span_category_serialization, span_props{{"layer", layer.name()}});
defer(stop_span(span_creating));

TBlob blob(device_ptr, shape, dev_mask, dev_id);
arrays->emplace_back(blob, dev_id, /* is_shared = */ true);

return;
}

static void to_ndarrays(std::vector<NDArray> *arrays, std::vector<std::string> *keys, const ModelHandle &reply) {
const auto layers = reply.layer();
const auto layers = model_handle.layer();

// LOG(INFO) << "got " << layers.size() << " layers form reply, before to_ndarray";

for (const auto layer : layers) {
keys->emplace_back(layer.name());
to_ndarray(arrays, layer);
if (model_handle.sharing_granularity() == SharingGranularity_Model) {
auto ipc_open_span = start_span(
"cudaIpcOpenMemHandle",
span_category_ipc,
span_props{{"model", model_handle.name()}, {"byte_count", std::to_string(model_handle.byte_count())}});
auto base_device_ptr = get_device_ptr(model_handle.ipc_handle());
stop_span(ipc_open_span);

for (const auto layer : layers) {
//auto create_layer_span = start_span("to_nd_array",
// span_category_serialization,
// span_props{{"layer", layer.name()}, {"sharing_granularity", "model"}});

keys->emplace_back(layer.name());
const auto shape = to_shape(layer.shape());
auto device_ptr = get_device_ptr_offset(layer, base_device_ptr);
TBlob blob(device_ptr, shape, dev_mask, dev_id);
arrays->emplace_back(blob, dev_id, /* is_shared = */ true);

//stop_span(create_layer_span);
}
return;
}
if (model_handle.sharing_granularity() == SharingGranularity_Layer) {
for (const auto layer : layers) {
//auto create_layer_span = start_span("to_nd_array",
// span_category_serialization,
// span_props{{"layer", layer.name()}, {"sharing_granularity", "layer"}});

keys->emplace_back(layer.name());
const auto shape = to_shape(layer.shape());
auto device_ptr = get_device_ptr(layer);
TBlob blob(device_ptr, shape, dev_mask, dev_id);
arrays->emplace_back(blob, dev_id, /* is_shared = */ true);

//stop_span(create_layer_span);
}
return;
}

// LOG(INFO) << "finished nd_array conversion";
throw dmlc::Error("invalid granularity");

return;
}
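The heart of the model-granularity path above: the daemon shares a single device allocation per model, each `Layer` carries a byte `offset` into it, and the client opens one `cudaIpcMemHandle_t` instead of one per layer. A standalone sketch of that mapping, with a hypothetical `LayerMeta` struct standing in for the protobuf `Layer` message and error handling reduced to simple checks:

```cpp
// Sketch of the client-side model-granularity mapping (assumptions: one
// cudaMalloc'd arena per model on the server, handle bytes delivered out of
// band, LayerMeta standing in for the protobuf Layer message).
#include <cstring>
#include <stdexcept>
#include <string>
#include <vector>
#include <cuda_runtime.h>

struct LayerMeta {
  std::string name;
  size_t offset;  // byte offset of this layer's weights within the arena
};

inline void *open_model_arena(const std::string &handle_bytes) {
  if (handle_bytes.size() != sizeof(cudaIpcMemHandle_t)) {
    throw std::runtime_error("bad ipc handle size");
  }
  cudaIpcMemHandle_t handle;
  std::memcpy(&handle, handle_bytes.data(), sizeof(handle));
  void *base = nullptr;
  if (cudaIpcOpenMemHandle(&base, handle, cudaIpcMemLazyEnablePeerAccess) != cudaSuccess) {
    throw std::runtime_error("cudaIpcOpenMemHandle failed");
  }
  return base;  // one mapping for the whole model
}

inline std::vector<void *> layer_pointers(void *base, const std::vector<LayerMeta> &layers) {
  std::vector<void *> ptrs;
  for (const auto &layer : layers) {
    // Same arithmetic as get_device_ptr_offset: base + per-layer byte offset.
    ptrs.push_back(static_cast<char *>(base) + layer.offset);
  }
  return ptrs;
}
```

Layer granularity falls back to one `cudaIpcOpenMemHandle` call per layer, which is exactly the overhead the model path amortizes into a single open.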
@@ -158,6 +181,14 @@ struct client {
ModelHandle Open(const std::string &model_name) {
ModelRequest request;
request.set_name(model_name);
if (UPR_SHARING_GRANULARITY == "model") {
request.set_sharing_granularity(SharingGranularity_Model);
} else if (UPR_SHARING_GRANULARITY == "layer") {
request.set_sharing_granularity(SharingGranularity_Layer);
} else {
throw dmlc::Error(
    fmt::format("Error: [{}]. failed to determine sharing granularity.", UPR_SHARING_GRANULARITY));
}
return this->Open(request);
}
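`client::Open` only consumes handles; the daemon side that produces them is not part of this diff. A hypothetical sketch of the export step, assuming the daemon keeps one `cudaMalloc` arena per loaded model (`cudaIpcGetMemHandle` must be given the base pointer of the allocation):

```cpp
// Hypothetical server-side counterpart (not in this diff): export the model's
// arena once and ship the raw handle bytes in the ModelHandle; clients
// reconstruct the handle with memcpy, as get_device_ptr does above.
#include <string>
#include <cuda_runtime.h>

inline std::string export_model_arena(void *arena_base) {
  cudaIpcMemHandle_t handle;
  if (cudaIpcGetMemHandle(&handle, arena_base) != cudaSuccess) {
    return "";  // empty handle: the client's get_device_ptr treats this as fatal
  }
  return std::string(reinterpret_cast<const char *>(&handle), sizeof(handle));
}
```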

@@ -218,6 +249,7 @@ struct client {
span_category_serialization,
span_props{{"model_id", open_reply.model_id()},
{"byte_count", std::to_string(open_reply.byte_count())},
{"needed_eviction", std::to_string(open_reply.needed_eviction())},
{"nlayers", std::to_string(open_reply.layer().size())}});
defer(stop_span(span_converting));

@@ -247,5 +279,11 @@ void Unload(MXAPIPredictor *pred) {
return;
}

void initialize() {
if (is_client && UPR_ENABLED) {
client::get_connection();
}
}

} // namespace upr
#endif // MXNET_USE_CUDA
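The new `upr::initialize()` gives clients a way to pre-establish the daemon connection; the `UPR_INITIALIZE_EAGER` toggle in `test.sh` above hints at the intended use. A hypothetical eager hookup (not in this diff), run once at static-initialization time:

```cpp
// Hypothetical (not in this diff): trigger upr::initialize() at library load
// when UPR_INITIALIZE_EAGER is set, so the connection cost is paid before the
// first inference request instead of during it.
#include <dmlc/parameter.h>
#include "ipc.h"  // assumed include path for src/c_api/ipc.h

namespace {
struct EagerUprInit {
  EagerUprInit() {
    if (dmlc::GetEnv("UPR_INITIALIZE_EAGER", false)) {
      upr::initialize();
    }
  }
} eager_upr_init;  // runs during static initialization
}  // namespace
```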
19 changes: 12 additions & 7 deletions src/c_api/ipc.h
@@ -78,6 +78,7 @@ static const auto UPR_BASE_DIR = dmlc::GetEnv("UPR_BASE_DIR", HOME + std::stri

static const auto UPR_ENABLE_MEMORY_PROFILE = dmlc::GetEnv("UPR_ENABLE_MEMORY_PROFILE", false);
static const auto UPR_ENABLE_CUDA_FREE = dmlc::GetEnv("UPR_ENABLE_CUDA_FREE", false);
static const auto UPR_SHARING_GRANULARITY = dmlc::GetEnv("UPR_SHARING_GRANULARITY", std::string("model"));

static const auto UPRD_EVICTION_POLICY = dmlc::GetEnv("UPRD_EVICTION_POLICY", std::string("lru"));
static const auto UPRD_ESTIMATION_RATE = dmlc::GetEnv("UPRD_ESTIMATION_RATE", 1.0);
@@ -239,8 +240,8 @@ static inline engine::OprExecStat *start_span(const std::string &name, std::stri
#if MXNET_USE_PROFILER
const auto ctx = get_ctx();
auto opr_stat = engine::Profiler::Get()->AddOprStat(ctx.dev_type, ctx.dev_id, name);
uint64_t tid = std::hash<std::thread::id>()(std::this_thread::get_id());
engine::SetOprCategory(opr_stat, category);
// uint64_t tid = std::hash<std::thread::id>()(std::this_thread::get_id());
opr_stat->category = category;
engine::SetOprStart(opr_stat);
return opr_stat;
#else
@@ -250,11 +251,13 @@ static inline engine::OprExecStat *start_span(const std::string &name, std::stri

static inline engine::OprExecStat *start_span(const std::string &name, std::string category, span_props props) {
#if MXNET_USE_PROFILER
auto span = start_span(name, category);
for (const auto kv : props) {
engine::AddOprMetadata(span, kv.first, kv.second);
}
return span;
const auto ctx = get_ctx();
auto opr_stat = engine::Profiler::Get()->AddOprStat(ctx.dev_type, ctx.dev_id, name);
// uint64_t tid = std::hash<std::thread::id>()(std::this_thread::get_id());
opr_stat->category = category;
opr_stat->metadata = props;
engine::SetOprStart(opr_stat);
return opr_stat;
#else
return nullptr;
#endif
@@ -443,5 +446,7 @@ void Unload(mxnet::MXAPIPredictor *pred);

std::pair<std::string, std::string> Load(std::string model_name, std::vector<mxnet::NDArray> *data,
std::vector<std::string> *keys);

void initialize();
} // namespace upr
#endif // MXNET_USE_CUDA
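The `start_span` refactor above makes the three-argument overload build the `OprExecStat` directly (assigning the `category` and `metadata` fields) instead of delegating to the two-argument version plus `engine::AddOprMetadata`. A usage sketch matching the call sites in `ipc.cc`, assuming the helpers live in namespace `upr` as the unqualified calls there suggest:

```cpp
// Usage sketch for the refactored profiler spans. Assumes ipc.h is on the
// include path; span_category_ipc and span_props are declared there. The
// span is nullptr when MXNET_USE_PROFILER is compiled out.
#include <string>
#include "ipc.h"

void traced_open(const std::string &model_name, size_t byte_count) {
  auto span = upr::start_span("cudaIpcOpenMemHandle",
                              upr::span_category_ipc,
                              upr::span_props{{"model", model_name},
                                              {"byte_count", std::to_string(byte_count)}});
  // ... the work being traced, e.g. the actual cudaIpcOpenMemHandle call ...
  upr::stop_span(span);
}
```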